diff options
author | Scott LaVarnway <slavarnway@google.com> | 2022-10-05 07:04:27 -0700 |
---|---|---|
committer | Scott LaVarnway <slavarnway@google.com> | 2022-10-05 10:04:30 -0700 |
commit | c03c882785dc96ed91799280e68f8998bec50b90 (patch) | |
tree | 17e24e1f2bfd4aa53b97ab70c5f39a8297c199a2 /vpx_dsp | |
parent | dca6dcef0ad03a13799e2fa4cd85e28b25047b4c (diff) | |
download | libvpx-c03c882785dc96ed91799280e68f8998bec50b90.tar libvpx-c03c882785dc96ed91799280e68f8998bec50b90.tar.gz libvpx-c03c882785dc96ed91799280e68f8998bec50b90.tar.bz2 libvpx-c03c882785dc96ed91799280e68f8998bec50b90.zip |
Add vpx_highbd_sad16x{32,16,8}_avx2.
1.9x to 2.4x faster than the sse2 version.
Bug: b/245917257
Change-Id: I686452772f9b72233930de2207af36a0cd72e0bb
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/vpx_dsp.mk | 1
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 6
-rw-r--r-- | vpx_dsp/x86/highbd_sad_avx2.c | 100
3 files changed, 104 insertions, 3 deletions
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index f9a5c97dd..32d21e03f 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -394,6 +394,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 527d0e6e7..004afb38f 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -956,13 +956,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_sad32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x32 sse2 neon/; + specialize qw/vpx_highbd_sad16x32 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x16 sse2 neon/; + specialize qw/vpx_highbd_sad16x16 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x8 sse2 neon/; + specialize qw/vpx_highbd_sad16x8 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_highbd_sad8x16 sse2 neon/; diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c new file mode 100644 index 000000000..36e9fa6c0 --- /dev/null +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <immintrin.h> // AVX2 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) { + const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8)); + const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4)); + const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1), + _mm256_extractf128_si256(t1, 1)); + return (unsigned int)_mm_cvtsi128_si32(sum); +} + +static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; i += 2) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride << 1; + ref += ref_stride << 1; + } +} + +unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < 
2; ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 4; + ref += ref_stride << 4; + } + return calc_final(sums_32); +} + +unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} |