diff options
author | Scott LaVarnway <slavarnway@google.com> | 2018-07-23 08:12:19 -0700 |
---|---|---|
committer | Scott LaVarnway <slavarnway@google.com> | 2018-07-23 12:49:50 -0700 |
commit | a83d11f9c44343df9585afa6f13545701d79adfb (patch) | |
tree | c3ee1ab2e63b6c393bde458f74edeee25c20f8e8 /vpx_dsp | |
parent | e858863dda2e242ede57916dae4086a991f618dd (diff) | |
download | libvpx-a83d11f9c44343df9585afa6f13545701d79adfb.tar libvpx-a83d11f9c44343df9585afa6f13545701d79adfb.tar.gz libvpx-a83d11f9c44343df9585afa6f13545701d79adfb.tar.bz2 libvpx-a83d11f9c44343df9585afa6f13545701d79adfb.zip |
VPX: Add vpx_hadamard_32x32_avx2
BUG=webm:1546
Change-Id: I64629ed83cb7acd0f2ac49b9c31f369d17a1aed2
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 | ||||
-rw-r--r-- | vpx_dsp/x86/avg_intrin_avx2.c | 35 |
2 files changed, 37 insertions, 2 deletions
```diff
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 13b83e1f5..2350bc6e8 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -783,7 +783,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
     specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/;
 
     add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-    specialize qw/vpx_hadamard_32x32 sse2/;
+    specialize qw/vpx_hadamard_32x32 sse2 avx2/;
 
     add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
     specialize qw/vpx_satd avx2 sse2 neon/;
@@ -795,7 +795,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
     specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/;
 
     add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
-    specialize qw/vpx_hadamard_32x32 sse2/;
+    specialize qw/vpx_hadamard_32x32 sse2 avx2/;
 
     add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
     specialize qw/vpx_satd avx2 sse2 neon msa/;
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c
index ff19ea647..7cbb54b51 100644
--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -172,6 +172,41 @@ void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride,
   }
 }
 
+void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+                             tran_low_t *coeff) {
+  int idx;
+  for (idx = 0; idx < 4; ++idx) {
+    // src_diff: 9 bit, dynamic range [-255, 255]
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+    vpx_hadamard_16x16_avx2(src_ptr, src_stride, coeff + idx * 256);
+  }
+
+  for (idx = 0; idx < 256; idx += 16) {
+    const __m256i coeff0 = load_tran_low(coeff);
+    const __m256i coeff1 = load_tran_low(coeff + 256);
+    const __m256i coeff2 = load_tran_low(coeff + 512);
+    const __m256i coeff3 = load_tran_low(coeff + 768);
+
+    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
+    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
+    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
+    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+
+    b0 = _mm256_srai_epi16(b0, 2);
+    b1 = _mm256_srai_epi16(b1, 2);
+    b2 = _mm256_srai_epi16(b2, 2);
+    b3 = _mm256_srai_epi16(b3, 2);
+
+    store_tran_low(_mm256_add_epi16(b0, b2), coeff);
+    store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
+    store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512);
+    store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);
+
+    coeff += 16;
+  }
+}
+
 int vpx_satd_avx2(const tran_low_t *coeff, int length) {
   const __m256i one = _mm256_set1_epi16(1);
   __m256i accum = _mm256_setzero_si256();
```