diff options
author | Sai Deng <sdeng@google.com> | 2018-12-07 17:00:03 +0000 |
---|---|---|
committer | Gerrit Code Review <noreply-gerritcodereview@google.com> | 2018-12-07 17:00:03 +0000 |
commit | b02ac73d8c543fdb04fed525d677059595e76188 (patch) | |
tree | 7bf6b67d789170b62eeda23df7a047a9ce3397f1 | |
parent | 418acaa0bd06e0666d5a55743e3a85b83c759619 (diff) | |
parent | b28b0709b9536eb376e59d0b8046158d8f1687ab (diff) | |
download | libvpx-b02ac73d8c543fdb04fed525d677059595e76188.tar libvpx-b02ac73d8c543fdb04fed525d677059595e76188.tar.gz libvpx-b02ac73d8c543fdb04fed525d677059595e76188.tar.bz2 libvpx-b02ac73d8c543fdb04fed525d677059595e76188.zip |
Merge "Add high bit Hadamard 16x16 avx2 implementation"
-rw-r--r-- | test/hadamard_test.cc | 4 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 | ||||
-rw-r--r-- | vpx_dsp/x86/avg_intrin_avx2.c | 41 |
3 files changed, 45 insertions, 2 deletions
```diff
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index b2b2d5fcd..a0f463ce9 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -310,7 +310,9 @@ INSTANTIATE_TEST_CASE_P(
 #if HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(
     AVX2, HadamardHighbdTest,
-    ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_avx2, 8)));
+    ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_avx2, 8),
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_avx2,
+                                           16)));
 #endif  // HAVE_AVX2
 
 #endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 5dc682382..9992c09d7 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -789,7 +789,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
     specialize qw/vpx_highbd_hadamard_8x8 avx2/;
 
     add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-    specialize qw/vpx_highbd_hadamard_16x16/;
+    specialize qw/vpx_highbd_hadamard_16x16 avx2/;
 
     add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
     specialize qw/vpx_highbd_hadamard_32x32/;
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c
index 7d74705ea..a3ebacd4e 100644
--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -134,6 +134,47 @@ void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
   coeff += 8;
   _mm256_storeu_si256((__m256i *)coeff, src32[7]);
 }
+
+void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  int idx;
+  tran_low_t *t_coeff = coeff;
+  for (idx = 0; idx < 4; ++idx) {
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+    vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64);
+  }
+
+  for (idx = 0; idx < 64; idx += 8) {
+    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+    b0 = _mm256_srai_epi32(b0, 1);
+    b1 = _mm256_srai_epi32(b1, 1);
+    b2 = _mm256_srai_epi32(b2, 1);
+    b3 = _mm256_srai_epi32(b3, 1);
+
+    coeff0 = _mm256_add_epi32(b0, b2);
+    coeff1 = _mm256_add_epi32(b1, b3);
+    coeff2 = _mm256_sub_epi32(b0, b2);
+    coeff3 = _mm256_sub_epi32(b1, b3);
+
+    _mm256_storeu_si256((__m256i *)coeff, coeff0);
+    _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1);
+    _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2);
+    _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3);
+
+    coeff += 8;
+    t_coeff += 8;
+  }
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static void hadamard_col8x2_avx2(__m256i *in, int iter) {
```