diff options
author | Scott LaVarnway <slavarnway@google.com> | 2022-10-05 07:04:27 -0700 |
---|---|---|
committer | Scott LaVarnway <slavarnway@google.com> | 2022-10-05 10:04:30 -0700 |
commit | c03c882785dc96ed91799280e68f8998bec50b90 (patch) | |
tree | 17e24e1f2bfd4aa53b97ab70c5f39a8297c199a2 /vpx_dsp | |
parent | dca6dcef0ad03a13799e2fa4cd85e28b25047b4c (diff) | |
download | libvpx-c03c882785dc96ed91799280e68f8998bec50b90.tar libvpx-c03c882785dc96ed91799280e68f8998bec50b90.tar.gz libvpx-c03c882785dc96ed91799280e68f8998bec50b90.tar.bz2 libvpx-c03c882785dc96ed91799280e68f8998bec50b90.zip |
Add vpx_highbd_sad16x{32,16,8}_avx2.
1.9x to 2.4x faster than the sse2 version.
Bug: b/245917257
Change-Id: I686452772f9b72233930de2207af36a0cd72e0bb
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/vpx_dsp.mk | 1
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 6
-rw-r--r-- | vpx_dsp/x86/highbd_sad_avx2.c | 100
3 files changed, 104 insertions, 3 deletions
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index f9a5c97dd..32d21e03f 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -394,6 +394,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 527d0e6e7..004afb38f 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -956,13 +956,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_sad32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x32 sse2 neon/; + specialize qw/vpx_highbd_sad16x32 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x16 sse2 neon/; + specialize qw/vpx_highbd_sad16x16 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x8 sse2 neon/; + specialize qw/vpx_highbd_sad16x8 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_highbd_sad8x16 sse2 neon/; diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c new file mode 100644 index 000000000..36e9fa6c0 --- /dev/null +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <immintrin.h> // AVX2 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) { + const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8)); + const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4)); + const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1), + _mm256_extractf128_si256(t1, 1)); + return (unsigned int)_mm_cvtsi128_si32(sum); +} + +static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; i += 2) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride << 1; + ref += ref_stride << 1; + } +} + +unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < 
2; ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 4; + ref += ref_stride << 4; + } + return calc_final(sums_32); +} + +unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} |