summary refs log tree commit diff
diff options
context:
space:
mode:
author Sai Deng <sdeng@google.com> 2018-12-07 17:00:03 +0000
committer Gerrit Code Review <noreply-gerritcodereview@google.com> 2018-12-07 17:00:03 +0000
commit b02ac73d8c543fdb04fed525d677059595e76188 (patch)
tree 7bf6b67d789170b62eeda23df7a047a9ce3397f1
parent 418acaa0bd06e0666d5a55743e3a85b83c759619 (diff)
parent b28b0709b9536eb376e59d0b8046158d8f1687ab (diff)
download libvpx-b02ac73d8c543fdb04fed525d677059595e76188.tar
libvpx-b02ac73d8c543fdb04fed525d677059595e76188.tar.gz
libvpx-b02ac73d8c543fdb04fed525d677059595e76188.tar.bz2
libvpx-b02ac73d8c543fdb04fed525d677059595e76188.zip
Merge "Add high bit Hadamard 16x16 avx2 implementation"
-rw-r--r-- test/hadamard_test.cc 4
-rw-r--r-- vpx_dsp/vpx_dsp_rtcd_defs.pl 2
-rw-r--r-- vpx_dsp/x86/avg_intrin_avx2.c 41
3 files changed, 45 insertions, 2 deletions
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index b2b2d5fcd..a0f463ce9 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -310,7 +310,9 @@ INSTANTIATE_TEST_CASE_P(
#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(
AVX2, HadamardHighbdTest,
- ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_avx2, 8)));
+ ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_avx2, 8),
+ HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_avx2,
+ 16)));
#endif // HAVE_AVX2
#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 5dc682382..9992c09d7 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -789,7 +789,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_highbd_hadamard_8x8 avx2/;
add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
- specialize qw/vpx_highbd_hadamard_16x16/;
+ specialize qw/vpx_highbd_hadamard_16x16 avx2/;
add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
specialize qw/vpx_highbd_hadamard_32x32/;
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c
index 7d74705ea..a3ebacd4e 100644
--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -134,6 +134,47 @@ void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
coeff += 8;
_mm256_storeu_si256((__m256i *)coeff, src32[7]);
}
+
+void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {  // 16x16 transform: four 8x8 transforms into coeff, then an in-place combining pass.
+ int idx;
+ tran_low_t *t_coeff = coeff;  // Read cursor over the same buffer that coeff writes to.
+ for (idx = 0; idx < 4; ++idx) {  // One 8x8 transform per quadrant of the 16x16 input.
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;  // idx >> 1 selects the upper/lower 8 rows, idx & 1 the left/right 8 columns.
+ vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64);  // Quadrant idx is given its own 64-entry slot in coeff.
+ }
+ // Combine the four quadrant results with a two-stage add/subtract butterfly.
+ for (idx = 0; idx < 64; idx += 8) {  // 8 iterations; each handles one 256-bit vector per quadrant (assumes 32-bit tran_low_t, as the epi32 ops imply).
+ __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);  // Unaligned loads of the same offset from each quadrant slot.
+ __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+ __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+ __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+ // Stage 1: sum and difference within the pairs (0, 1) and (2, 3).
+ __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+ // Arithmetic shift right by one bit on every 32-bit lane between the stages.
+ b0 = _mm256_srai_epi32(b0, 1);
+ b1 = _mm256_srai_epi32(b1, 1);
+ b2 = _mm256_srai_epi32(b2, 1);
+ b3 = _mm256_srai_epi32(b3, 1);
+ // Stage 2: sum and difference across the pairs.
+ coeff0 = _mm256_add_epi32(b0, b2);
+ coeff1 = _mm256_add_epi32(b1, b3);
+ coeff2 = _mm256_sub_epi32(b0, b2);
+ coeff3 = _mm256_sub_epi32(b1, b3);
+ // Stores overwrite exactly the four vectors loaded above, since coeff and t_coeff advance in lockstep.
+ _mm256_storeu_si256((__m256i *)coeff, coeff0);
+ _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1);
+ _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2);
+ _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3);
+ // Step both cursors to the next group of 8 coefficients.
+ coeff += 8;
+ t_coeff += 8;
+ }
+}
#endif // CONFIG_VP9_HIGHBITDEPTH
static void hadamard_col8x2_avx2(__m256i *in, int iter) {