diff options
author | Angie Chiang <angiebird@google.com> | 2019-07-14 09:20:58 -0700 |
---|---|---|
committer | Angie Chiang <angiebird@google.com> | 2019-07-16 16:46:59 -0700 |
commit | 291055812b6962e808619892abe8c87277d843c4 (patch) | |
tree | 46b1d4d795079f1fe913cc344f1103ccac289e34 /vpx_dsp/x86 | |
parent | a6622470704b5252e415db18263cb5f8ee194800 (diff) | |
download | libvpx-291055812b6962e808619892abe8c87277d843c4.tar libvpx-291055812b6962e808619892abe8c87277d843c4.tar.gz libvpx-291055812b6962e808619892abe8c87277d843c4.tar.bz2 libvpx-291055812b6962e808619892abe8c87277d843c4.zip |
Add vpx_sad32x32x8_c/avx2
Change-Id: I4dbb7b6c8979c39eb6ffb97750e3cca0f4b7921f
Diffstat (limited to 'vpx_dsp/x86')
-rw-r--r-- | vpx_dsp/x86/sad4d_avx2.c | 65 |
1 files changed, 61 insertions, 4 deletions
diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c
index b18fecf70..a5c4f8c53 100644
--- a/vpx_dsp/x86/sad4d_avx2.c
+++ b/vpx_dsp/x86/sad4d_avx2.c
@@ -11,8 +11,8 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 
-static INLINE void calc_final(const __m256i *const sums /*[4]*/,
-                              uint32_t sad_array[4]) {
+static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
+                                uint32_t *sad_array) {
   const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
   const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
   const __m256i t2 = _mm256_hadd_epi32(t0, t1);
@@ -66,7 +66,64 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
     refs[3] += ref_stride;
   }
 
-  calc_final(sums, sad_array);
+  calc_final_4(sums, sad_array);
+}
+
+void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride,
+                         const uint8_t *ref_ptr, int ref_stride,
+                         uint32_t *sad_array) {
+  int i;
+  __m256i sums[8];
+
+  sums[0] = _mm256_setzero_si256();
+  sums[1] = _mm256_setzero_si256();
+  sums[2] = _mm256_setzero_si256();
+  sums[3] = _mm256_setzero_si256();
+  sums[4] = _mm256_setzero_si256();
+  sums[5] = _mm256_setzero_si256();
+  sums[6] = _mm256_setzero_si256();
+  sums[7] = _mm256_setzero_si256();
+
+  for (i = 0; i < 32; i++) {
+    __m256i r[8];
+
+    // load src and all ref[]
+    const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
+    r[0] = _mm256_loadu_si256((const __m256i *)&ref_ptr[0]);
+    r[1] = _mm256_loadu_si256((const __m256i *)&ref_ptr[1]);
+    r[2] = _mm256_loadu_si256((const __m256i *)&ref_ptr[2]);
+    r[3] = _mm256_loadu_si256((const __m256i *)&ref_ptr[3]);
+    r[4] = _mm256_loadu_si256((const __m256i *)&ref_ptr[4]);
+    r[5] = _mm256_loadu_si256((const __m256i *)&ref_ptr[5]);
+    r[6] = _mm256_loadu_si256((const __m256i *)&ref_ptr[6]);
+    r[7] = _mm256_loadu_si256((const __m256i *)&ref_ptr[7]);
+
+    // sum of the absolute differences between every ref[] to src
+    r[0] = _mm256_sad_epu8(r[0], s);
+    r[1] = _mm256_sad_epu8(r[1], s);
+    r[2] = _mm256_sad_epu8(r[2], s);
+    r[3] = _mm256_sad_epu8(r[3], s);
+    r[4] = _mm256_sad_epu8(r[4], s);
+    r[5] = _mm256_sad_epu8(r[5], s);
+    r[6] = _mm256_sad_epu8(r[6], s);
+    r[7] = _mm256_sad_epu8(r[7], s);
+
+    // sum every ref[]
+    sums[0] = _mm256_add_epi32(sums[0], r[0]);
+    sums[1] = _mm256_add_epi32(sums[1], r[1]);
+    sums[2] = _mm256_add_epi32(sums[2], r[2]);
+    sums[3] = _mm256_add_epi32(sums[3], r[3]);
+    sums[4] = _mm256_add_epi32(sums[4], r[4]);
+    sums[5] = _mm256_add_epi32(sums[5], r[5]);
+    sums[6] = _mm256_add_epi32(sums[6], r[6]);
+    sums[7] = _mm256_add_epi32(sums[7], r[7]);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+
+  calc_final_4(sums, sad_array);
+  calc_final_4(sums + 4, sad_array + 4);
 }
 
 void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
@@ -126,5 +183,5 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
     refs[3] += ref_stride;
   }
 
-  calc_final(sums, sad_array);
+  calc_final_4(sums, sad_array);
 }