summary | refs | log | tree | commit | diff
path: root/vpx_dsp
diff options
context:
space:
mode:
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- vpx_dsp/sad.c 1
-rw-r--r-- vpx_dsp/vpx_dsp_rtcd_defs.pl 3
-rw-r--r-- vpx_dsp/x86/sad4d_avx2.c 65
3 files changed, 65 insertions, 4 deletions
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index 873ddca09..769322019 100644
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -83,6 +83,7 @@ sadMxNx4D(32, 64)
// 32x32
sadMxN(32, 32)
+sadMxNxK(32, 32, 8)
sadMxNx4D(32, 32)
// 32x16
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 797ef7fe0..fd7eefdad 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -893,6 +893,9 @@ add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const
specialize qw/vpx_sad4x4x3 sse3 msa mmi/;
# Blocks of 8
+add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x32x8 avx2/;
+
add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad16x16x8 sse4_1 msa mmi/;
diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c
index b18fecf70..a5c4f8c53 100644
--- a/vpx_dsp/x86/sad4d_avx2.c
+++ b/vpx_dsp/x86/sad4d_avx2.c
@@ -11,8 +11,8 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-static INLINE void calc_final(const __m256i *const sums /*[4]*/,
- uint32_t sad_array[4]) {
+static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
+ uint32_t *sad_array) {
const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
const __m256i t2 = _mm256_hadd_epi32(t0, t1);
@@ -66,7 +66,64 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
refs[3] += ref_stride;
}
- calc_final(sums, sad_array);
+ calc_final_4(sums, sad_array);
+}
+
+void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sad_array) {
+ int i;
+ __m256i sums[8];
+
+ sums[0] = _mm256_setzero_si256();
+ sums[1] = _mm256_setzero_si256();
+ sums[2] = _mm256_setzero_si256();
+ sums[3] = _mm256_setzero_si256();
+ sums[4] = _mm256_setzero_si256();
+ sums[5] = _mm256_setzero_si256();
+ sums[6] = _mm256_setzero_si256();
+ sums[7] = _mm256_setzero_si256();
+
+ for (i = 0; i < 32; i++) {
+ __m256i r[8];
+
+ // load one 32-byte row of src and eight 32-byte ref rows starting at ref_ptr + 0..7
+ const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
+ r[0] = _mm256_loadu_si256((const __m256i *)&ref_ptr[0]);
+ r[1] = _mm256_loadu_si256((const __m256i *)&ref_ptr[1]);
+ r[2] = _mm256_loadu_si256((const __m256i *)&ref_ptr[2]);
+ r[3] = _mm256_loadu_si256((const __m256i *)&ref_ptr[3]);
+ r[4] = _mm256_loadu_si256((const __m256i *)&ref_ptr[4]);
+ r[5] = _mm256_loadu_si256((const __m256i *)&ref_ptr[5]);
+ r[6] = _mm256_loadu_si256((const __m256i *)&ref_ptr[6]);
+ r[7] = _mm256_loadu_si256((const __m256i *)&ref_ptr[7]);
+
+ // sum of absolute differences between each loaded ref row and the src row
+ r[0] = _mm256_sad_epu8(r[0], s);
+ r[1] = _mm256_sad_epu8(r[1], s);
+ r[2] = _mm256_sad_epu8(r[2], s);
+ r[3] = _mm256_sad_epu8(r[3], s);
+ r[4] = _mm256_sad_epu8(r[4], s);
+ r[5] = _mm256_sad_epu8(r[5], s);
+ r[6] = _mm256_sad_epu8(r[6], s);
+ r[7] = _mm256_sad_epu8(r[7], s);
+
+ // accumulate this row's SAD for each of the 8 ref offsets into sums[]
+ sums[0] = _mm256_add_epi32(sums[0], r[0]);
+ sums[1] = _mm256_add_epi32(sums[1], r[1]);
+ sums[2] = _mm256_add_epi32(sums[2], r[2]);
+ sums[3] = _mm256_add_epi32(sums[3], r[3]);
+ sums[4] = _mm256_add_epi32(sums[4], r[4]);
+ sums[5] = _mm256_add_epi32(sums[5], r[5]);
+ sums[6] = _mm256_add_epi32(sums[6], r[6]);
+ sums[7] = _mm256_add_epi32(sums[7], r[7]);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+
+ calc_final_4(sums, sad_array);
+ calc_final_4(sums + 4, sad_array + 4);
}
void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
@@ -126,5 +183,5 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
refs[3] += ref_stride;
}
- calc_final(sums, sad_array);
+ calc_final_4(sums, sad_array);
}