diff options
author | Scott LaVarnway <slavarnway@google.com> | 2017-10-20 05:21:15 -0700 |
---|---|---|
committer | Scott LaVarnway <slavarnway@google.com> | 2017-10-20 08:55:06 -0700 |
commit | 4906cea0276c17a089f3b7616a43511cbc5ff069 (patch) | |
tree | b85b9237374c82067615ca73c02ba030dc2d69ac | |
parent | b58259ab55674cb028898a0ac9e8fdd3cf1d4b39 (diff) | |
download | libvpx-4906cea0276c17a089f3b7616a43511cbc5ff069.tar libvpx-4906cea0276c17a089f3b7616a43511cbc5ff069.tar.gz libvpx-4906cea0276c17a089f3b7616a43511cbc5ff069.tar.bz2 libvpx-4906cea0276c17a089f3b7616a43511cbc5ff069.zip |
vpx: [x86] vpx_hadamard_16x16_avx2() improvements
~10% performance gain. Also fixes the cosmetic issues noted in the
review of the previous commit.
Change-Id: Iddf475f34d0d0a3e356b2143682aeabac459ed13
-rw-r--r-- | vpx_dsp/x86/avg_intrin_avx2.c | 68 | ||||
-rw-r--r-- | vpx_dsp/x86/bitdepth_conversion_avx2.h | 22 |
2 files changed, 26 insertions, 64 deletions
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c index 2ded29c59..4192a4582 100644 --- a/vpx_dsp/x86/avg_intrin_avx2.c +++ b/vpx_dsp/x86/avg_intrin_avx2.c @@ -106,47 +106,25 @@ static void hadamard_8x8x2_avx2(int16_t const *src_diff, int src_stride, hadamard_col8x2_avx2(src, 0); hadamard_col8x2_avx2(src, 1); - store_tran_low(_mm256_castsi256_si128(src[0]), coeff); - coeff += 8; - store_tran_low(_mm256_castsi256_si128(src[1]), coeff); - coeff += 8; - store_tran_low(_mm256_castsi256_si128(src[2]), coeff); - coeff += 8; - store_tran_low(_mm256_castsi256_si128(src[3]), coeff); - coeff += 8; - store_tran_low(_mm256_castsi256_si128(src[4]), coeff); - coeff += 8; - store_tran_low(_mm256_castsi256_si128(src[5]), coeff); - coeff += 8; - store_tran_low(_mm256_castsi256_si128(src[6]), coeff); - coeff += 8; - store_tran_low(_mm256_castsi256_si128(src[7]), coeff); - coeff += 8; - - src[0] = _mm256_castsi128_si256(_mm256_extractf128_si256(src[0], 1)); - src[1] = _mm256_castsi128_si256(_mm256_extractf128_si256(src[1], 1)); - src[2] = _mm256_castsi128_si256(_mm256_extractf128_si256(src[2], 1)); - src[3] = _mm256_castsi128_si256(_mm256_extractf128_si256(src[3], 1)); - src[4] = _mm256_castsi128_si256(_mm256_extractf128_si256(src[4], 1)); - src[5] = _mm256_castsi128_si256(_mm256_extractf128_si256(src[5], 1)); - src[6] = _mm256_castsi128_si256(_mm256_extractf128_si256(src[6], 1)); - src[7] = _mm256_castsi128_si256(_mm256_extractf128_si256(src[7], 1)); - - store_tran_low(_mm256_castsi256_si128(src[0]), coeff); - coeff += 8; - store_tran_low(_mm256_castsi256_si128(src[1]), coeff); - coeff += 8; - store_tran_low(_mm256_castsi256_si128(src[2]), coeff); - coeff += 8; - store_tran_low(_mm256_castsi256_si128(src[3]), coeff); - coeff += 8; - store_tran_low(_mm256_castsi256_si128(src[4]), coeff); - coeff += 8; - store_tran_low(_mm256_castsi256_si128(src[5]), coeff); - coeff += 8; - store_tran_low(_mm256_castsi256_si128(src[6]), coeff); - coeff += 8; - 
store_tran_low(_mm256_castsi256_si128(src[7]), coeff); + // TODO(slavarnway): FIXME: For high bitdepths, it is unnecessary to + // mult/unpack/store here and load/pack the same memory in the next stage. + // Try using an intermediate buffer and store_tran_low() in the last stage. + store_tran_low(_mm256_permute2x128_si256(src[0], src[1], 0x20), coeff); + coeff += 16; + store_tran_low(_mm256_permute2x128_si256(src[2], src[3], 0x20), coeff); + coeff += 16; + store_tran_low(_mm256_permute2x128_si256(src[4], src[5], 0x20), coeff); + coeff += 16; + store_tran_low(_mm256_permute2x128_si256(src[6], src[7], 0x20), coeff); + coeff += 16; + + store_tran_low(_mm256_permute2x128_si256(src[0], src[1], 0x31), coeff); + coeff += 16; + store_tran_low(_mm256_permute2x128_si256(src[2], src[3], 0x31), coeff); + coeff += 16; + store_tran_low(_mm256_permute2x128_si256(src[4], src[5], 0x31), coeff); + coeff += 16; + store_tran_low(_mm256_permute2x128_si256(src[6], src[7], 0x31), coeff); } void vpx_hadamard_16x16_avx2(int16_t const *src_diff, int src_stride, @@ -172,10 +150,10 @@ void vpx_hadamard_16x16_avx2(int16_t const *src_diff, int src_stride, b2 = _mm256_srai_epi16(b2, 1); b3 = _mm256_srai_epi16(b3, 1); - store_tran_low_256(_mm256_add_epi16(b0, b2), coeff); - store_tran_low_256(_mm256_add_epi16(b1, b3), coeff + 64); - store_tran_low_256(_mm256_sub_epi16(b0, b2), coeff + 128); - store_tran_low_256(_mm256_sub_epi16(b1, b3), coeff + 192); + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); coeff += 16; } diff --git a/vpx_dsp/x86/bitdepth_conversion_avx2.h b/vpx_dsp/x86/bitdepth_conversion_avx2.h index 7bbdac7ef..b8fd1cb58 100644 --- a/vpx_dsp/x86/bitdepth_conversion_avx2.h +++ b/vpx_dsp/x86/bitdepth_conversion_avx2.h @@ -27,33 +27,17 @@ static INLINE __m256i load_tran_low(const tran_low_t *a) { #endif } -// Store 8 16 
bit values. If the destination is 32 bits then sign extend the -// values by multiplying by 1. -static INLINE void store_tran_low(__m128i a, tran_low_t *b) { -#if CONFIG_VP9_HIGHBITDEPTH - const __m128i one = _mm_set1_epi16(1); - const __m128i a_hi = _mm_mulhi_epi16(a, one); - const __m128i a_lo = _mm_mullo_epi16(a, one); - const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi); - const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi); - _mm_store_si128((__m128i *)(b), a_1); - _mm_store_si128((__m128i *)(b + 4), a_2); -#else - _mm_store_si128((__m128i *)(b), a); -#endif -} - -static INLINE void store_tran_low_256(__m256i a, tran_low_t *b) { +static INLINE void store_tran_low(__m256i a, tran_low_t *b) { #if CONFIG_VP9_HIGHBITDEPTH const __m256i one = _mm256_set1_epi16(1); const __m256i a_hi = _mm256_mulhi_epi16(a, one); const __m256i a_lo = _mm256_mullo_epi16(a, one); const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi); const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi); - _mm256_storeu_si256((__m256i *)(b), a_1); + _mm256_storeu_si256((__m256i *)b, a_1); _mm256_storeu_si256((__m256i *)(b + 8), a_2); #else - _mm256_storeu_si256((__m256i *)(b), a); + _mm256_storeu_si256((__m256i *)b, a); #endif } #endif // VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ |