diff options
author | Anupam Pandey <anupam.pandey@ittiam.com> | 2023-04-18 14:46:56 +0530 |
---|---|---|
committer | Anupam Pandey <anupam.pandey@ittiam.com> | 2023-05-05 15:55:16 +0530 |
commit | 255ee1888589aa15ae909b992fe123c0358b1730 (patch) | |
tree | d46b2799a29b05c325497d01d2b44b33d456ff1d /test | |
parent | 24802201acd7dfa15928bcc47c1e270e7db5afac (diff) | |
download | libvpx-255ee1888589aa15ae909b992fe123c0358b1730.tar libvpx-255ee1888589aa15ae909b992fe123c0358b1730.tar.gz libvpx-255ee1888589aa15ae909b992fe123c0358b1730.tar.bz2 libvpx-255ee1888589aa15ae909b992fe123c0358b1730.zip |
Add AVX2 intrinsic for idct16x16 and idct32x32 functions
Added AVX2 intrinsic optimizations for the following functions:
1. vpx_idct16x16_256_add
2. vpx_idct32x32_1024_add
3. vpx_idct32x32_135_add
The module-level scaling (speed-up) w.r.t. the C functions (timer based) for the
existing SSE2 and the new AVX2 intrinsics:

| Function Name | Scaling (SSE2) | Scaling (AVX2) |
|---|---|---|
| vpx_idct32x32_1024_add | 3.62x | 7.49x |
| vpx_idct32x32_135_add | 4.85x | 9.41x |
| vpx_idct16x16_256_add | 4.82x | 7.70x |
This is a bit-exact change.
Change-Id: Id9dda933aa1f5093bb6b35ac3b8a41846afca9d2
Diffstat (limited to 'test')
-rw-r--r-- | test/dct16x16_test.cc | 98 | ||||
-rw-r--r-- | test/dct32x32_test.cc | 197 |
2 files changed, 292 insertions, 3 deletions
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index 3c104f3a4..4ad2263cf 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -592,7 +592,7 @@ class Trans16x16TestBase { const int count_test_block = 10000; const int eob = 10; const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan; - DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); #if CONFIG_VP9_HIGHBITDEPTH @@ -643,6 +643,80 @@ class Trans16x16TestBase { } } + void RunInvTrans16x16SpeedTest(IdctFunc ref_txfm, int thresh) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + const int eob = 10; + const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan; + int64_t c_sum_time = 0; + int64_t simd_sum_time = 0; + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int j = 0; j < kNumCoeffs; ++j) { + if (j < eob) { + // Random values less than the threshold, either positive or negative + coeff[scan[j]] = rnd(thresh); + } else { + coeff[scan[j]] = 0; + } + if (bit_depth_ == VPX_BITS_8) { + dst[j] = 0; + ref[j] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + dst16[j] = 0; + ref16[j] = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + + if (bit_depth_ == VPX_BITS_8) { + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + ref_txfm(coeff, ref, pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, dst, pitch_); + 
} + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + printf( + "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n", + c_sum_time, simd_sum_time, + (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time))); + } + int pitch_; int tx_type_; vpx_bit_depth_t bit_depth_; @@ -755,7 +829,6 @@ TEST_P(Trans16x16HT, QuantCheck) { RunQuantCheck(429, 729); } -#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE class InvTrans16x16DCT : public Trans16x16TestBase, public ::testing::TestWithParam<Idct16x16Param> { public: @@ -786,7 +859,10 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans16x16DCT); TEST_P(InvTrans16x16DCT, CompareReference) { CompareInvReference(ref_txfm_, thresh_); } -#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +TEST_P(InvTrans16x16DCT, DISABLED_Speed) { + RunInvTrans16x16SpeedTest(ref_txfm_, thresh_); +} using std::make_tuple; @@ -828,6 +904,12 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P(C, InvTrans16x16DCT, + ::testing::Values(make_tuple(&vpx_idct16x16_256_add_c, + &vpx_idct16x16_256_add_c, + 6225, VPX_BITS_8))); + #endif // 
CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -862,6 +944,11 @@ INSTANTIATE_TEST_SUITE_P( 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P(SSE2, InvTrans16x16DCT, + ::testing::Values(make_tuple( + &vpx_idct16x16_256_add_c, + &vpx_idct16x16_256_add_sse2, 6225, VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -869,6 +956,11 @@ INSTANTIATE_TEST_SUITE_P( AVX2, Trans16x16DCT, ::testing::Values(make_tuple(&vpx_fdct16x16_avx2, &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P(AVX2, InvTrans16x16DCT, + ::testing::Values(make_tuple( + &vpx_idct16x16_256_add_c, + &vpx_idct16x16_256_add_avx2, 6225, VPX_BITS_8))); #endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index 91bb8e01e..1167038b5 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -24,10 +24,12 @@ #include "test/register_state_check.h" #include "test/util.h" #include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_scan.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" #include "vpx_ports/msvc.h" // for round() +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; @@ -71,6 +73,9 @@ typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride); typedef std::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t> Trans32x32Param; +typedef std::tuple<InvTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t, int, int> + InvTrans32x32Param; + #if CONFIG_VP9_HIGHBITDEPTH void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) { vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); @@ -314,6 +319,174 @@ 
TEST_P(Trans32x32Test, InverseAccuracy) { } } +class InvTrans32x32Test : public ::testing::TestWithParam<InvTrans32x32Param> { + public: + virtual ~InvTrans32x32Test() {} + virtual void SetUp() { + ref_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + version_ = GET_PARAM(2); // 0: high precision forward transform + // 1: low precision version for rd loop + bit_depth_ = GET_PARAM(3); + eob_ = GET_PARAM(4); + thresh_ = GET_PARAM(4); + mask_ = (1 << bit_depth_) - 1; + pitch_ = 32; + } + + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + void RunRefTxfm(tran_low_t *out, uint8_t *dst, int stride) { + ref_txfm_(out, dst, stride); + } + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + inv_txfm_(out, dst, stride); + } + int version_; + vpx_bit_depth_t bit_depth_; + int mask_; + int eob_; + int thresh_; + + InvTxfmFunc ref_txfm_; + InvTxfmFunc inv_txfm_; + int pitch_; + + void RunInvTrans32x32SpeedTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + int64_t c_sum_time = 0; + int64_t simd_sum_time = 0; + const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan; + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int j = 0; j < kNumCoeffs; ++j) { + if (j < eob_) { + // Random values less than the threshold, either positive or negative + coeff[scan[j]] = rnd(thresh_); + } else { + coeff[scan[j]] = 0; + } + if (bit_depth_ == VPX_BITS_8) { + dst[j] = 0; + ref[j] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + dst16[j] = 0; + ref16[j] = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + + if (bit_depth_ == VPX_BITS_8) { + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; 
++i) { + RunRefTxfm(coeff, ref, pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, dst, pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + printf( + "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n", + c_sum_time, simd_sum_time, + (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time))); + } + + void CompareInvReference32x32() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + const int eob = 31; + const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan; + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int i = 0; i < count_test_block; ++i) { + for (int j = 0; j < kNumCoeffs; ++j) { + if (j < eob) { + coeff[scan[j]] = rnd.Rand8Extremes(); + } else { + coeff[scan[j]] = 0; + } + if (bit_depth_ == VPX_BITS_8) { + dst[j] = 0; + ref[j] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + dst16[j] = 0; + ref16[j] = 0; +#endif 
// CONFIG_VP9_HIGHBITDEPTH + } + } + if (bit_depth_ == VPX_BITS_8) { + RunRefTxfm(coeff, ref, pitch_); + RunInvTxfm(coeff, dst, pitch_); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + for (int j = 0; j < kNumCoeffs; ++j) { +#if CONFIG_VP9_HIGHBITDEPTH + const uint32_t diff = + bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j]; +#else + const uint32_t diff = dst[j] - ref[j]; +#endif // CONFIG_VP9_HIGHBITDEPTH + const uint32_t error = diff * diff; + EXPECT_EQ(0u, error) << "Error: 32x32 IDCT Comparison has error " + << error << " at index " << j; + } + } + } +}; + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans32x32Test); + +TEST_P(InvTrans32x32Test, DISABLED_Speed) { RunInvTrans32x32SpeedTest(); } +TEST_P(InvTrans32x32Test, CompareReference) { CompareInvReference32x32(); } + using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH @@ -334,6 +507,14 @@ INSTANTIATE_TEST_SUITE_P( VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P( + C, InvTrans32x32Test, + ::testing::Values( + (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_c, 0, + VPX_BITS_8, 32, 6225)), + make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_c, 0, + VPX_BITS_8, 16, 6255))); #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE @@ -352,6 +533,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, InvTrans32x32Test, + ::testing::Values( + (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_sse2, 0, + VPX_BITS_8, 32, 6225)), + make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_sse2, 0, + VPX_BITS_8, 16, 6225))); #endif // HAVE_SSE2 && 
!CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -377,6 +566,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_avx2, &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, InvTrans32x32Test, + ::testing::Values( + (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_avx2, 0, + VPX_BITS_8, 32, 6225)), + make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_avx2, 0, + VPX_BITS_8, 16, 6225))); #endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE |