summaryrefslogtreecommitdiff
path: root/vpx_dsp
diff options
context:
space:
mode:
authorScott LaVarnway <slavarnway@google.com>2022-08-05 07:40:26 -0700
committerScott LaVarnway <slavarnway@google.com>2022-08-05 16:02:38 -0700
commitc9f049fd9164e0b5b950bdb8ac80186787b5b64c (patch)
tree71b33ea08ef96911c08aade6a8c1d03ccf1b20dd /vpx_dsp
parentaa2dc0cc7294158e1c6daab10d2714beef11ecad (diff)
downloadlibvpx-c9f049fd9164e0b5b950bdb8ac80186787b5b64c.tar
libvpx-c9f049fd9164e0b5b950bdb8ac80186787b5b64c.tar.gz
libvpx-c9f049fd9164e0b5b950bdb8ac80186787b5b64c.tar.bz2
libvpx-c9f049fd9164e0b5b950bdb8ac80186787b5b64c.zip
VPX: Add vpx_subtract_block_avx2().
~1.3x faster than vpx_subtract_block_sse2(). Based on aom_subtract_block_avx2(). Bug: b/241580104 Change-Id: I17da036363f213d53c6546c3e858e4c3cba44a5b
Diffstat (limited to 'vpx_dsp')
-rw-r--r--vpx_dsp/vpx_dsp.mk1
-rw-r--r--vpx_dsp/vpx_dsp_rtcd_defs.pl2
-rw-r--r--vpx_dsp/x86/subtract_avx2.c96
3 files changed, 98 insertions, 1 deletions
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index dd667195f..ffe954832 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -376,6 +376,7 @@ DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c
DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/subtract_avx2.c
DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c
DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index e7ad640af..db211ed8c 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -730,7 +730,7 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") {
# Block subtraction
#
add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vpx_subtract_block neon msa mmi sse2 vsx lsx/;
+specialize qw/vpx_subtract_block neon msa mmi sse2 avx2 vsx lsx/;
#
# Single block SAD
diff --git a/vpx_dsp/x86/subtract_avx2.c b/vpx_dsp/x86/subtract_avx2.c
new file mode 100644
index 000000000..4d259ef5c
--- /dev/null
+++ b/vpx_dsp/x86/subtract_avx2.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void subtract32_avx2(int16_t *diff_ptr,
+ const uint8_t *src_ptr,
+ const uint8_t *pred_ptr) {
+ const __m256i s = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i p = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
+ const __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
+ const __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
+ const __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d_1);
+}
+
+static VPX_FORCE_INLINE void subtract_block_16xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ const __m128i s = _mm_lddqu_si128((const __m128i *)src_ptr);
+ const __m128i p = _mm_lddqu_si128((const __m128i *)pred_ptr);
+ const __m256i s_0 = _mm256_cvtepu8_epi16(s);
+ const __m256i p_0 = _mm256_cvtepu8_epi16(p);
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void subtract_block_32xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void subtract_block_64xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride) {
+ switch (cols) {
+ case 16:
+ subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 32:
+ subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 64:
+ subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ default:
+ vpx_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ }
+}