diff options
author | Scott LaVarnway <slavarnway@google.com> | 2022-10-05 14:03:55 -0700 |
---|---|---|
committer | Scott LaVarnway <slavarnway@google.com> | 2022-10-06 05:33:40 -0700 |
commit | 4955b945d851cd86c287401d3bca846dae354d16 (patch) | |
tree | 88b17c68d3e64832775c9602ccf8f24d2d94809c /vpx_dsp | |
parent | c03c882785dc96ed91799280e68f8998bec50b90 (diff) | |
download | libvpx-4955b945d851cd86c287401d3bca846dae354d16.tar libvpx-4955b945d851cd86c287401d3bca846dae354d16.tar.gz libvpx-4955b945d851cd86c287401d3bca846dae354d16.tar.bz2 libvpx-4955b945d851cd86c287401d3bca846dae354d16.zip |
Add vpx_highbd_sad32x{64,32,16}_avx2.
2.7x to 3.1x faster than the sse2 version.
Bug: b/245917257
Change-Id: Idff3284932f7ee89d036f38893205bf622a159a3
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 | ||||
-rw-r--r-- | vpx_dsp/x86/highbd_sad_avx2.c | 62 |
2 files changed, 64 insertions, 4 deletions
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 004afb38f..d669b9999 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -947,13 +947,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_sad64x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x64 sse2 neon/; + specialize qw/vpx_highbd_sad32x64 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x32 sse2 neon/; + specialize qw/vpx_highbd_sad32x32 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x16 sse2 neon/; + specialize qw/vpx_highbd_sad32x16 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_highbd_sad16x32 sse2 neon avx2/; diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index 36e9fa6c0..eb0e3eec5 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -7,7 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include <immintrin.h> // AVX2 +#include <immintrin.h> #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" @@ -19,6 +19,66 @@ static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) { return (unsigned int)_mm_cvtsi128_si32(sum); } +static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride; + ref += ref_stride; + } +} + +#define HIGHBD_SAD32XN(n) \ + unsigned int vpx_highbd_sad32x##n##_avx2( \ + const uint8_t *src8_ptr, int src_stride, const uint8_t *ref8_ptr, \ + int ref_stride) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 8); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \ + \ + /* sums_16 will outrange after 8 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 3; \ + ref += ref_stride << 3; \ + } \ + return calc_final(sums_32); \ + } + +// 32x64 +HIGHBD_SAD32XN(64) + +// 32x32 +HIGHBD_SAD32XN(32) + +// 32x16 +HIGHBD_SAD32XN(16) + static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, const uint16_t *src, int src_stride, uint16_t *ref, int ref_stride, |