diff options
-rw-r--r-- | test/hadamard_test.cc | 18 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 | ||||
-rw-r--r-- | vpx_dsp/x86/avg_intrin_sse2.c | 39 |
3 files changed, 59 insertions, 2 deletions
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index a2e36a757..ad9b1a38a 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -302,12 +302,30 @@ INSTANTIATE_TEST_CASE_P(MSA, Hadamard16x16Test, class Hadamard32x32Test : public HadamardTestBase {}; +void HadamardSpeedTest32x32(HadamardFunc const func, int times) { + DECLARE_ALIGNED(16, int16_t, input[1024]); + DECLARE_ALIGNED(16, tran_low_t, output[1024]); + memset(input, 1, sizeof(input)); + HadamardSpeedTest("Hadamard32x32", func, input, 32, output, times); +} + TEST_P(Hadamard32x32Test, CompareReferenceRandom) { CompareReferenceRandom<32>(); } TEST_P(Hadamard32x32Test, VaryStride) { VaryStride<32>(); } +TEST_P(Hadamard32x32Test, DISABLED_Speed) { + HadamardSpeedTest32x32(h_func_, 10); + HadamardSpeedTest32x32(h_func_, 10000); + HadamardSpeedTest32x32(h_func_, 10000000); +} + INSTANTIATE_TEST_CASE_P(C, Hadamard32x32Test, ::testing::Values(&vpx_hadamard_32x32_c)); + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P(SSE2, Hadamard32x32Test, + ::testing::Values(&vpx_hadamard_32x32_sse2)); +#endif // HAVE_SSE2 } // namespace diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index b662c70c0..13b83e1f5 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -783,7 +783,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/; add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_hadamard_32x32/; + specialize qw/vpx_hadamard_32x32 sse2/; add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length"; specialize qw/vpx_satd avx2 sse2 neon/; @@ -795,7 +795,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/; add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; - specialize qw/vpx_hadamard_32x32/; + specialize qw/vpx_hadamard_32x32 sse2/; add_proto qw/int vpx_satd/, "const int16_t *coeff, int length"; specialize qw/vpx_satd avx2 sse2 neon msa/; diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c index bb744a981..6b13171fd 100644 --- a/vpx_dsp/x86/avg_intrin_sse2.c +++ b/vpx_dsp/x86/avg_intrin_sse2.c @@ -372,6 +372,45 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, ptrdiff_t src_stride, } } +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + vpx_hadamard_16x16_sse2(src_ptr, src_stride, coeff + idx * 256); + } + + for (idx = 0; idx < 256; idx += 8) { + __m128i coeff0 = load_tran_low(coeff); + __m128i coeff1 = load_tran_low(coeff + 256); + __m128i coeff2 = load_tran_low(coeff + 512); + __m128i coeff3 = load_tran_low(coeff + 768); + + __m128i b0 = _mm_add_epi16(coeff0, coeff1); + __m128i b1 = _mm_sub_epi16(coeff0, coeff1); + __m128i b2 = _mm_add_epi16(coeff2, coeff3); + __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + + b0 = _mm_srai_epi16(b0, 2); + b1 = _mm_srai_epi16(b1, 2); + b2 = _mm_srai_epi16(b2, 2); + b3 = _mm_srai_epi16(b3, 2); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 256); + + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + store_tran_low(coeff2, coeff + 512); + store_tran_low(coeff3, coeff + 768); + + coeff += 8; + } +} + int vpx_satd_sse2(const tran_low_t *coeff, int length) { int i; const __m128i zero = _mm_setzero_si128(); |