diff options
-rw-r--r-- | test/vp9_quantize_test.cc | 287 | ||||
-rw-r--r-- | vp9/encoder/vp9_block.h | 1 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodemb.c | 6 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 29 | ||||
-rw-r--r-- | vpx_dsp/arm/quantize_neon.c | 17 | ||||
-rw-r--r-- | vpx_dsp/quantize.c | 17 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 5 | ||||
-rw-r--r-- | vpx_dsp/x86/quantize_avx.c | 30 | ||||
-rw-r--r-- | vpx_dsp/x86/quantize_avx2.c | 15 | ||||
-rw-r--r-- | vpx_dsp/x86/quantize_sse2.h | 28 | ||||
-rw-r--r-- | vpx_dsp/x86/quantize_ssse3.c | 35 | ||||
-rw-r--r-- | vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 96 |
12 files changed, 352 insertions, 214 deletions
diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 587cec692..6a8f1dafb 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -26,6 +26,7 @@ #include "test/util.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" @@ -38,8 +39,7 @@ namespace { const int number_of_iterations = 100; typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, const int16_t *quant_shift, + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, const int16_t *scan, const int16_t *iscan); @@ -47,6 +47,41 @@ typedef std::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t, int /*max_size*/, bool /*is_fp*/> QuantizeParam; +// Wrapper which takes a macroblock_plane. +typedef void (*QuantizeBaseFunc)(const tran_low_t *coeff, intptr_t count, + const int16_t *zbin, const int16_t *round, + const int16_t *quant, + const int16_t *quant_shift, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, + uint16_t *eob, const int16_t *scan, + const int16_t *iscan); + +template <QuantizeBaseFunc fn> +void QuantWrapper(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan) { + fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant, + mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan, iscan); +} + +// Wrapper for 32x32 version which does not use count +typedef void (*Quantize32x32Func)(const tran_low_t *coeff, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan); + +template 
<Quantize32x32Func fn> +void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan) { + (void)count; + fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan, iscan); +} + // Wrapper for FP version which does not use zbin or quant_shift. typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, const int16_t *round, const int16_t *quant, @@ -56,15 +91,11 @@ typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, template <QuantizeFPFunc fn> void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, const int16_t *quant_shift, - tran_low_t *qcoeff, tran_low_t *dqcoeff, - const int16_t *dequant, uint16_t *eob, const int16_t *scan, - const int16_t *iscan) { - (void)zbin; - (void)quant_shift; - - fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan); + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan) { + fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff, + dequant, eob, scan, iscan); } void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, @@ -119,17 +150,21 @@ class VP9QuantizeBase : public AbstractBench { #else max_value_ = (1 << bit_depth_) - 1; #endif - zbin_ptr_ = + + mb_plane_ = reinterpret_cast<macroblock_plane *>( + vpx_memalign(16, sizeof(macroblock_plane))); + + zbin_ptr_ = mb_plane_->zbin = reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_))); - round_fp_ptr_ = reinterpret_cast<int16_t *>( + round_fp_ptr_ = mb_plane_->round_fp = reinterpret_cast<int16_t *>( vpx_memalign(16, 8 * sizeof(*round_fp_ptr_))); - quant_fp_ptr_ = reinterpret_cast<int16_t *>( + quant_fp_ptr_ = mb_plane_->quant_fp = 
reinterpret_cast<int16_t *>( vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_))); - round_ptr_ = + round_ptr_ = mb_plane_->round = reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_))); - quant_ptr_ = + quant_ptr_ = mb_plane_->quant = reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*quant_ptr_))); - quant_shift_ptr_ = reinterpret_cast<int16_t *>( + quant_shift_ptr_ = mb_plane_->quant_shift = reinterpret_cast<int16_t *>( vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_))); dequant_ptr_ = reinterpret_cast<int16_t *>( vpx_memalign(16, 8 * sizeof(*dequant_ptr_))); @@ -139,6 +174,7 @@ class VP9QuantizeBase : public AbstractBench { } ~VP9QuantizeBase() { + vpx_free(mb_plane_); vpx_free(zbin_ptr_); vpx_free(round_fp_ptr_); vpx_free(quant_fp_ptr_); @@ -146,6 +182,7 @@ class VP9QuantizeBase : public AbstractBench { vpx_free(quant_ptr_); vpx_free(quant_shift_ptr_); vpx_free(dequant_ptr_); + mb_plane_ = nullptr; zbin_ptr_ = nullptr; round_fp_ptr_ = nullptr; quant_fp_ptr_ = nullptr; @@ -157,9 +194,10 @@ class VP9QuantizeBase : public AbstractBench { } protected: + macroblock_plane *mb_plane_; int16_t *zbin_ptr_; - int16_t *round_fp_ptr_; int16_t *quant_fp_ptr_; + int16_t *round_fp_ptr_; int16_t *round_ptr_; int16_t *quant_ptr_; int16_t *quant_shift_ptr_; @@ -193,8 +231,7 @@ class VP9QuantizeTest : public VP9QuantizeBase, }; void VP9QuantizeTest::Run() { - quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), + quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, scan_->iscan); } @@ -266,8 +303,8 @@ void VP9QuantizeTest::Speed(bool is_median) { vpx_usec_timer_start(&timer); for (int n = 0; n < kNumTests; ++n) { - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, - q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + 
ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); } @@ -275,10 +312,9 @@ void VP9QuantizeTest::Speed(bool is_median) { vpx_usec_timer_start(&simd_timer); for (int n = 0; n < kNumTests; ++n) { - quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, - scan_->scan, scan_->iscan); + quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), + dequant_ptr_, &eob_, scan_->scan, scan_->iscan); } vpx_usec_timer_mark(&simd_timer); @@ -417,15 +453,14 @@ TEST_P(VP9QuantizeTest, OperationCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, + mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -475,15 +510,14 @@ TEST_P(VP9QuantizeTest, EOBCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, 
ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, + mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -510,28 +544,35 @@ using std::make_tuple; INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, - false), + make_tuple(&QuantWrapper<vpx_quantize_b_sse2>, + &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false), make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>, &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_sse2>, + &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_8, 16, + false), + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_sse2>, + 
&QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_10, 16, + false), + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_sse2>, + &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_12, 16, + false), + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>, + &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>, + &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>, + &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12, + 32, false))); #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), + ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_sse2>, + &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, + 16, false), make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>, &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true))); @@ -541,11 +582,12 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false), + ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_ssse3>, + &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_ssse3>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, + VPX_BITS_8, 32, false), make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>, &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true), @@ -555,13 +597,14 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_SSSE3 #if HAVE_AVX -INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_avx, - &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx, - &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false))); 
+INSTANTIATE_TEST_SUITE_P( + AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_avx>, + &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, + VPX_BITS_8, 32, false))); #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 @@ -577,22 +620,29 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_avx2>, &QuantFPWrapper<vp9_highbd_quantize_fp_32x32_c>, VPX_BITS_12, 32, true), - make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16, + make_tuple(&QuantWrapper<vpx_quantize_b_avx2>, + &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false), + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_avx2>, + &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_8, 16, + false), + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_avx2>, + &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_10, 16, + false), + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_avx2>, + &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_12, 16, + false), + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>, + 
&QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>, + &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>, + &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12, + 32, false))); #else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, @@ -602,11 +652,12 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_avx2>, &QuantFPWrapper<quantize_fp_32x32_nz_c>, VPX_BITS_8, 32, true), - make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx2, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false))); + make_tuple(&QuantWrapper<vpx_quantize_b_avx2>, + &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, + VPX_BITS_8, 32, false))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX2 @@ -615,22 +666,29 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, + make_tuple(&QuantWrapper<vpx_quantize_b_neon>, + &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false), + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_neon>, + &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_8, 16, + false), + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_neon>, + &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - 
make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false), + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_neon>, + &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_12, 16, + false), + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>, + &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>, + &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>, + &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12, + 32, false), make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>, &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>, @@ -639,11 +697,12 @@ INSTANTIATE_TEST_SUITE_P( #else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_neon, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false), + ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_neon>, + &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, + VPX_BITS_8, 32, false), make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>, &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true), @@ -683,9 +742,11 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest, INSTANTIATE_TEST_SUITE_P( DISABLED_C, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_c, 
&vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8, - 32, false), + make_tuple(&QuantWrapper<vpx_quantize_b_c>, + &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_c>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32, + false), make_tuple(&QuantFPWrapper<vp9_quantize_fp_c>, &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper<quantize_fp_nz_c>, diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 3e2c9a3c3..da01c346d 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -13,6 +13,7 @@ #include "vpx_util/vpx_thread.h" +#include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index fa222f9dc..4910dc20f 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -542,8 +542,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: @@ -948,8 +947,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { diff --git a/vp9/encoder/vp9_rdopt.c 
b/vp9/encoder/vp9_rdopt.c index f87ab3e0b..bcadd5777 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -160,12 +160,13 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n, } #if !CONFIG_REALTIME_ONLY -static int model_rd_for_sb_earlyterm(VP9_COMP *cpi, int mi_row, int mi_col, - BLOCK_SIZE bsize, MACROBLOCK *x, - MACROBLOCKD *xd, int *out_rate_sum, - int64_t *out_dist_sum, int *skip_txfm_sb, - int64_t *skip_sse_sb, int do_earlyterm, - int64_t best_rd) { +// Planewise build inter prediction and compute rdcost with early termination +// option +static int build_inter_pred_model_rd_earlyterm( + VP9_COMP *cpi, int mi_row, int mi_col, BLOCK_SIZE bsize, MACROBLOCK *x, + MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, + int *skip_txfm_sb, int64_t *skip_sse_sb, int do_earlyterm, + int64_t best_rd) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. @@ -2999,13 +3000,13 @@ static int64_t handle_inter_mode( xd->plane[j].dst.stride = 64; } } - // Compute RD cost with early termination option + filt_best_rd = cm->interp_filter == SWITCHABLE ? (best_rd - rs_rd) : best_rd; - if (model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd, - &rate_sum, &dist_sum, &tmp_skip_sb, - &tmp_skip_sse, enable_earlyterm, - filt_best_rd)) { + if (build_inter_pred_model_rd_earlyterm( + cpi, mi_row, mi_col, bsize, x, xd, &rate_sum, &dist_sum, + &tmp_skip_sb, &tmp_skip_sse, enable_earlyterm, + filt_best_rd)) { filter_cache[i] = INT64_MAX; continue; } @@ -3076,9 +3077,9 @@ static int64_t handle_inter_mode( // Handles the special case when a filter that is not in the // switchable list (ex. bilinear) is indicated at the frame level, or // skip condition holds. 
- model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, - &tmp_dist, &skip_txfm_sb, &skip_sse_sb, - 0 /*do_earlyterm*/, INT64_MAX); + build_inter_pred_model_rd_earlyterm( + cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb, + &skip_sse_sb, 0 /*do_earlyterm*/, INT64_MAX); rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); memcpy(bsse, x->bsse, sizeof(bsse)); diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index 9c227d560..e81738a7b 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/encoder/vp9_block.h" static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, const int16x8_t dequant, @@ -213,11 +214,8 @@ quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. -void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -226,10 +224,10 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int i; // Only the first element of each vector is DC. 
- int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); - int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - int16x8_t quant = vld1q_s16(quant_ptr); - int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); + int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1); + int16x8_t quant = vld1q_s16(mb_plane->quant); + int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. @@ -289,6 +287,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #endif // __aarch64__ // Need these here, else the compiler complains about mixing declarations and // code in C90 - (void)n_coeffs; (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 5d6ba64a8..212db45c8 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -14,6 +14,7 @@ #include "vpx_dsp/quantize.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int16_t *round_ptr, const int16_t quant, @@ -208,19 +209,21 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } #endif -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const int n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; const int 
nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; int idx = 0; - int idx_arr[1024]; + int idx_arr[32 * 32 /* n_coeffs */]; int i, eob = -1; (void)iscan; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index ad8ff6e18..3baf16cc8 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -17,6 +17,9 @@ print <<EOF #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; +#endif EOF } @@ -721,7 +724,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 7d8352721..d52f6c664 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ 
b/vpx_dsp/x86/quantize_avx.c @@ -140,15 +140,12 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); const __m256i big_zero = _mm256_setzero_si256(); int index; @@ -160,26 +157,9 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob = zero, eob0; (void)scan; - (void)n_coeffs; - - // Setup global values. - // The 32x32 halves zbin and round. - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // Shift with rounding. - zbin = _mm_add_epi16(zbin, one); - zbin = _mm_srli_epi16(zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - - round = _mm_load_si128((const __m128i *)round_ptr); - round = _mm_add_epi16(round, one); - round = _mm_srli_epi16(round, 1); - - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - shift = _mm_slli_epi16(shift, 1); + + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); // Do DC and first 15 AC. 
coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 28f7c9c7d..a8412c5b8 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void load_b_values_avx2( const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, @@ -250,23 +251,19 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( } } -void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - (void)n_coeffs; (void)scan; - load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, - &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, - &v_quant_shift, 1); + load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round, + mb_plane->quant, &v_quant, dequant_ptr, &v_dequant, + mb_plane->quant_shift, &v_quant_shift, 1); // Do DC and first 15 AC. 
v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index 27bfb4e41..fe42fee01 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -15,6 +15,7 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_block.h" static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, const int16_t *round_ptr, __m128i *round, @@ -29,6 +30,33 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, *shift = _mm_load_si128((const __m128i *)shift_ptr); } +static INLINE void load_b_values32x32( + const struct macroblock_plane *const mb_plane, __m128i *zbin, + __m128i *round, __m128i *quant, const int16_t *dequant_ptr, + __m128i *dequant, __m128i *shift) { + const __m128i one = _mm_set1_epi16(1); + // The 32x32 halves zbin and round. + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + // Shift with rounding. + *zbin = _mm_add_epi16(*zbin, one); + *zbin = _mm_srli_epi16(*zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. 
+ *zbin = _mm_sub_epi16(*zbin, one); + + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *round = _mm_add_epi16(*round, one); + *round = _mm_srli_epi16(*round, 1); + + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); + // I suspect this is not technically OK because quant_shift can be up + // to 1 << 16 and shifting up again will outrange that, but the test is not + // comprehensive enough to catch that and "it's been that way forever" + *shift = _mm_slli_epi16(*shift, 1); +} + static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round, const int16_t *quant_ptr, __m128i *quant, const int16_t *dequant_ptr, diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 476230286..6fe54d7d9 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -16,6 +16,7 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -107,16 +108,12 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); int index; __m128i zbin, round, quant, dequant, shift; @@ -127,29 +124,9 
@@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob = zero, eob0; (void)scan; - (void)n_coeffs; - - // Setup global values. - // The 32x32 halves zbin and round. - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // Shift with rounding. - zbin = _mm_add_epi16(zbin, one); - zbin = _mm_srli_epi16(zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - - round = _mm_load_si128((const __m128i *)round_ptr); - round = _mm_add_epi16(round, one); - round = _mm_srli_epi16(round, 1); - - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - // I suspect this is not technically OK because quant_shift can be up - // to 1 << 16 and shifting up again will outrange that, but the test is not - // comprehensive enough to catch that and "it's been that way forever" - shift = _mm_slli_epi16(shift, 1); + + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 26e82f9b7..141614e7a 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -942,19 +942,111 @@ static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr, } } +static void vpx_filter_block1d8_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m256i f[4], ss[4]; + __m256i r[8]; + __m128i s[9]; + + unsigned int y = output_height; + // Each pass of the loop advances the source by two rows (2 * src_pitch) + const ptrdiff_t src_stride = src_pitch << 1; + + // The output_height is always a multiple of two.
+ assert(!(output_height & 1)); + + shuffle_filter_avx2(filter, f); + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); + + // merge the result together + // r[0]: 0 0 0 0 0 0 0 0 r17 r16 r15 r14 r13 r12 r11 r10 | 0 0 0 0 0 0 0 0 + // r07 r06 r05 r04 r03 r02 r01 r00 + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1); + + // r[1]: 0 0 0 0 0 0 0 0 r27 r26 r25 r24 r23 r22 r21 r20 | 0 0 0 0 0 0 0 0 + // r17 r16 r15 r14 r13 r12 r11 r10 + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1); + + // r[2]: 0 0 0 0 0 0 0 0 r37 r36 r35 r34 r33 r32 r31 r30 | 0 0 0 0 0 0 0 0 + // r27 r26 r25 r24 r23 r22 r21 r20 + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1); + + // r[3]: 0 0 0 0 0 0 0 0 r47 r46 r45 r44 r43 r42 r41 r40 | 0 0 0 0 0 0 0 0 + // r37 r36 r35 r34 r33 r32 r31 r30 + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1); + + // r[4]: 0 0 0 0 0 0 0 0 r57 r56 r55 r54 r53 r52 r51 r50 | 0 0 0 0 0 0 0 0 + // r47 r46 r45 r44 r43 r42 r41 r40 + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1); + + // r[5]: 0 0 0 0 0 0 0 0 r67 r66 r65 r64 r63 r62 r61 r60 | 0 0 0 0 0 0 0 0 + // r57 r56 r55 r54 r53 r52 r51 r50 + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[6], 1); + + // Merge together + // ss[0]: |r27 r17|.......|r21 r11|r20 r10 || r17 r07|.....|r12 r02|r11 + // r01|r10 r00| + ss[0] = _mm256_unpacklo_epi8(r[0], r[1]); + + // ss[1]: |r47 r37|.......|r41 r31|r40 r30 || r37 r27|.....|r32 r22|r31 + // r21|r30 r20| + ss[1] =
_mm256_unpacklo_epi8(r[2], r[3]); + + // ss[2]: |r67 r57|.......|r61 r51|r60 r50 || r57 r47|.....|r52 r42|r51 + // r41|r50 r40| + ss[2] = _mm256_unpacklo_epi8(r[4], r[5]); + + // Process 2 rows at a time + do { + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + + // r[6]: 0 0 0 0 0 0 0 0 r77 r76 r75 r74 r73 r72 r71 r70 | 0 0 0 0 0 0 0 + // 0 r67 r66 r65 r64 r63 r62 r61 r60 + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[7], 1); + // r[7]: 0 0 0 0 0 0 0 0 r87 r86 r85 r84 r83 r82 r81 r80 | 0 0 0 0 0 0 0 + // 0 r77 r76 r75 r74 r73 r72 r71 r70 + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[8], 1); + + // ss[3] : | r87 r77 | .......| r81 r71 | r80 r70 || r77 r67 | .....| r72 + // r62 | r71 r61|r70 r60| + ss[3] = _mm256_unpacklo_epi8(r[6], r[7]); + ss[0] = convolve8_16_avx2(ss, f); + ss[0] = _mm256_packus_epi16(ss[0], ss[0]); + src_ptr += src_stride; + + /* shift down two rows: row 8 is row 6 of the next pass */ + s[6] = s[8]; + _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(ss[0])); + output_ptr += out_pitch; + _mm_storel_epi64((__m128i *)&output_ptr[0], + _mm256_extractf128_si256(ss[0], 1)); + output_ptr += out_pitch; + ss[0] = ss[1]; + ss[1] = ss[2]; + ss[2] = ss[3]; + y -= 2; + } while (y > 1); +} + #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3 #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3 #else // VPX_ARCH_X86 filter8_1dfunction vpx_filter_block1d8_v8_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3 #define
vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 #endif // VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; |