summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- test/vp9_quantize_test.cc 287
-rw-r--r-- vp9/encoder/vp9_block.h 1
-rw-r--r-- vp9/encoder/vp9_encodemb.c 6
-rw-r--r-- vp9/encoder/vp9_rdopt.c 29
-rw-r--r-- vpx_dsp/arm/quantize_neon.c 17
-rw-r--r-- vpx_dsp/quantize.c 17
-rw-r--r-- vpx_dsp/vpx_dsp_rtcd_defs.pl 5
-rw-r--r-- vpx_dsp/x86/quantize_avx.c 30
-rw-r--r-- vpx_dsp/x86/quantize_avx2.c 15
-rw-r--r-- vpx_dsp/x86/quantize_sse2.h 28
-rw-r--r-- vpx_dsp/x86/quantize_ssse3.c 35
-rw-r--r-- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c 96
12 files changed, 352 insertions, 214 deletions
diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index 587cec692..6a8f1dafb 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -26,6 +26,7 @@
#include "test/util.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
#include "vpx/vpx_codec.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/msvc.h"
@@ -38,8 +39,7 @@ namespace {
const int number_of_iterations = 100;
typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
- const int16_t *zbin, const int16_t *round,
- const int16_t *quant, const int16_t *quant_shift,
+ const macroblock_plane *const mb_plane,
tran_low_t *qcoeff, tran_low_t *dqcoeff,
const int16_t *dequant, uint16_t *eob,
const int16_t *scan, const int16_t *iscan);
@@ -47,6 +47,41 @@ typedef std::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t,
int /*max_size*/, bool /*is_fp*/>
QuantizeParam;
+// Wrapper which takes a macroblock_plane and unpacks it for the base function.
+typedef void (*QuantizeBaseFunc)(const tran_low_t *coeff, intptr_t count,
+ const int16_t *zbin, const int16_t *round,
+ const int16_t *quant,
+ const int16_t *quant_shift, tran_low_t *qcoeff,
+ tran_low_t *dqcoeff, const int16_t *dequant,
+ uint16_t *eob, const int16_t *scan,
+ const int16_t *iscan);
+
+template <QuantizeBaseFunc fn>
+void QuantWrapper(const tran_low_t *coeff, intptr_t count,
+ const macroblock_plane *const mb_plane, tran_low_t *qcoeff,
+ tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob,
+ const int16_t *scan, const int16_t *iscan) {
+ fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant,
+ mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan, iscan);
+}
+
+// Wrapper for the 32x32 version, which does not take a count argument.
+typedef void (*Quantize32x32Func)(const tran_low_t *coeff,
+ const macroblock_plane *const mb_plane,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ const int16_t *dequant, uint16_t *eob,
+ const int16_t *scan, const int16_t *iscan);
+
+template <Quantize32x32Func fn>
+void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count,
+ const macroblock_plane *const mb_plane,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ const int16_t *dequant, uint16_t *eob,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)count;
+ fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan, iscan);
+}
+
// Wrapper for FP version which does not use zbin or quant_shift.
typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count,
const int16_t *round, const int16_t *quant,
@@ -56,15 +91,11 @@ typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count,
template <QuantizeFPFunc fn>
void QuantFPWrapper(const tran_low_t *coeff, intptr_t count,
- const int16_t *zbin, const int16_t *round,
- const int16_t *quant, const int16_t *quant_shift,
- tran_low_t *qcoeff, tran_low_t *dqcoeff,
- const int16_t *dequant, uint16_t *eob, const int16_t *scan,
- const int16_t *iscan) {
- (void)zbin;
- (void)quant_shift;
-
- fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan);
+ const macroblock_plane *const mb_plane, tran_low_t *qcoeff,
+ tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob,
+ const int16_t *scan, const int16_t *iscan) {
+ fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff,
+ dequant, eob, scan, iscan);
}
void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
@@ -119,17 +150,21 @@ class VP9QuantizeBase : public AbstractBench {
#else
max_value_ = (1 << bit_depth_) - 1;
#endif
- zbin_ptr_ =
+
+ mb_plane_ = reinterpret_cast<macroblock_plane *>(
+ vpx_memalign(16, sizeof(macroblock_plane)));
+
+ zbin_ptr_ = mb_plane_->zbin =
reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_)));
- round_fp_ptr_ = reinterpret_cast<int16_t *>(
+ round_fp_ptr_ = mb_plane_->round_fp = reinterpret_cast<int16_t *>(
vpx_memalign(16, 8 * sizeof(*round_fp_ptr_)));
- quant_fp_ptr_ = reinterpret_cast<int16_t *>(
+ quant_fp_ptr_ = mb_plane_->quant_fp = reinterpret_cast<int16_t *>(
vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_)));
- round_ptr_ =
+ round_ptr_ = mb_plane_->round =
reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_)));
- quant_ptr_ =
+ quant_ptr_ = mb_plane_->quant =
reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*quant_ptr_)));
- quant_shift_ptr_ = reinterpret_cast<int16_t *>(
+ quant_shift_ptr_ = mb_plane_->quant_shift = reinterpret_cast<int16_t *>(
vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_)));
dequant_ptr_ = reinterpret_cast<int16_t *>(
vpx_memalign(16, 8 * sizeof(*dequant_ptr_)));
@@ -139,6 +174,7 @@ class VP9QuantizeBase : public AbstractBench {
}
~VP9QuantizeBase() {
+ vpx_free(mb_plane_);
vpx_free(zbin_ptr_);
vpx_free(round_fp_ptr_);
vpx_free(quant_fp_ptr_);
@@ -146,6 +182,7 @@ class VP9QuantizeBase : public AbstractBench {
vpx_free(quant_ptr_);
vpx_free(quant_shift_ptr_);
vpx_free(dequant_ptr_);
+ mb_plane_ = nullptr;
zbin_ptr_ = nullptr;
round_fp_ptr_ = nullptr;
quant_fp_ptr_ = nullptr;
@@ -157,9 +194,10 @@ class VP9QuantizeBase : public AbstractBench {
}
protected:
+ macroblock_plane *mb_plane_;
int16_t *zbin_ptr_;
- int16_t *round_fp_ptr_;
int16_t *quant_fp_ptr_;
+ int16_t *round_fp_ptr_;
int16_t *round_ptr_;
int16_t *quant_ptr_;
int16_t *quant_shift_ptr_;
@@ -193,8 +231,7 @@ class VP9QuantizeTest : public VP9QuantizeBase,
};
void VP9QuantizeTest::Run() {
- quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
- quant_shift_ptr_, qcoeff_.TopLeftPixel(),
+ quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(),
dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan,
scan_->iscan);
}
@@ -266,8 +303,8 @@ void VP9QuantizeTest::Speed(bool is_median) {
vpx_usec_timer_start(&timer);
for (int n = 0; n < kNumTests; ++n) {
- ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_,
- q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
+ ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_,
+ ref_qcoeff.TopLeftPixel(),
ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
scan_->scan, scan_->iscan);
}
@@ -275,10 +312,9 @@ void VP9QuantizeTest::Speed(bool is_median) {
vpx_usec_timer_start(&simd_timer);
for (int n = 0; n < kNumTests; ++n) {
- quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
- quant_shift_ptr_, qcoeff_.TopLeftPixel(),
- dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_,
- scan_->scan, scan_->iscan);
+ quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_,
+ qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(),
+ dequant_ptr_, &eob_, scan_->scan, scan_->iscan);
}
vpx_usec_timer_mark(&simd_timer);
@@ -417,15 +453,14 @@ TEST_P(VP9QuantizeTest, OperationCheck) {
GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
quant_fp_ptr_);
- ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
- quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
- ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
- scan_->scan, scan_->iscan);
+ ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_,
+ ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
+ dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan);
- ASM_REGISTER_STATE_CHECK(quantize_op_(
- coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
- quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(),
- dequant_ptr_, &eob_, scan_->scan, scan_->iscan));
+ ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_,
+ mb_plane_, qcoeff_.TopLeftPixel(),
+ dqcoeff_.TopLeftPixel(), dequant_ptr_,
+ &eob_, scan_->scan, scan_->iscan));
EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff));
EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff));
@@ -475,15 +510,14 @@ TEST_P(VP9QuantizeTest, EOBCheck) {
GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
quant_fp_ptr_);
- ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
- quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
- ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
- scan_->scan, scan_->iscan);
+ ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_,
+ ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
+ dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan);
- ASM_REGISTER_STATE_CHECK(quantize_op_(
- coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
- quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(),
- dequant_ptr_, &eob_, scan_->scan, scan_->iscan));
+ ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_,
+ mb_plane_, qcoeff_.TopLeftPixel(),
+ dqcoeff_.TopLeftPixel(), dequant_ptr_,
+ &eob_, scan_->scan, scan_->iscan));
EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff));
EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff));
@@ -510,28 +544,35 @@ using std::make_tuple;
INSTANTIATE_TEST_SUITE_P(
SSE2, VP9QuantizeTest,
::testing::Values(
- make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16,
- false),
+ make_tuple(&QuantWrapper<vpx_quantize_b_sse2>,
+ &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
&QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
- make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
- VPX_BITS_8, 16, false),
- make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
- VPX_BITS_10, 16, false),
- make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
- VPX_BITS_12, 16, false),
- make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
- &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
- make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
- &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
- make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
- &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false)));
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_sse2>,
+ &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_8, 16,
+ false),
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_sse2>,
+ &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_10, 16,
+ false),
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_sse2>,
+ &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_12, 16,
+ false),
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>,
+ &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32,
+ false),
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>,
+ &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10,
+ 32, false),
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>,
+ &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12,
+ 32, false)));
#else
INSTANTIATE_TEST_SUITE_P(
SSE2, VP9QuantizeTest,
- ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c,
- VPX_BITS_8, 16, false),
+ ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_sse2>,
+ &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+ 16, false),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
&QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
16, true)));
@@ -541,11 +582,12 @@ INSTANTIATE_TEST_SUITE_P(
#if HAVE_SSSE3
INSTANTIATE_TEST_SUITE_P(
SSSE3, VP9QuantizeTest,
- ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
- VPX_BITS_8, 16, false),
- make_tuple(&vpx_quantize_b_32x32_ssse3,
- &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
- false),
+ ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_ssse3>,
+ &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+ 16, false),
+ make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_ssse3>,
+ &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+ VPX_BITS_8, 32, false),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>,
&QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
16, true),
@@ -555,13 +597,14 @@ INSTANTIATE_TEST_SUITE_P(
#endif // HAVE_SSSE3
#if HAVE_AVX
-INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest,
- ::testing::Values(make_tuple(&vpx_quantize_b_avx,
- &vpx_quantize_b_c,
- VPX_BITS_8, 16, false),
- make_tuple(&vpx_quantize_b_32x32_avx,
- &vpx_quantize_b_32x32_c,
- VPX_BITS_8, 32, false)));
+INSTANTIATE_TEST_SUITE_P(
+ AVX, VP9QuantizeTest,
+ ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_avx>,
+ &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+ 16, false),
+ make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx>,
+ &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+ VPX_BITS_8, 32, false)));
#endif // HAVE_AVX
#if VPX_ARCH_X86_64 && HAVE_AVX2
@@ -577,22 +620,29 @@ INSTANTIATE_TEST_SUITE_P(
make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_avx2>,
&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_c>, VPX_BITS_12,
32, true),
- make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16,
+ make_tuple(&QuantWrapper<vpx_quantize_b_avx2>,
+ &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false),
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_avx2>,
+ &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_8, 16,
+ false),
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_avx2>,
+ &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_10, 16,
+ false),
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_avx2>,
+ &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_12, 16,
+ false),
+ make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>,
+ &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
false),
- make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
- VPX_BITS_8, 16, false),
- make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
- VPX_BITS_10, 16, false),
- make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
- VPX_BITS_12, 16, false),
- make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c,
- VPX_BITS_8, 32, false),
- make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
- &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
- make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
- &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
- make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
- &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false)));
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>,
+ &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32,
+ false),
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>,
+ &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10,
+ 32, false),
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>,
+ &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12,
+ 32, false)));
#else
INSTANTIATE_TEST_SUITE_P(
AVX2, VP9QuantizeTest,
@@ -602,11 +652,12 @@ INSTANTIATE_TEST_SUITE_P(
make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_avx2>,
&QuantFPWrapper<quantize_fp_32x32_nz_c>,
VPX_BITS_8, 32, true),
- make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c,
- VPX_BITS_8, 16, false),
- make_tuple(&vpx_quantize_b_32x32_avx2,
- &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
- false)));
+ make_tuple(&QuantWrapper<vpx_quantize_b_avx2>,
+ &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+ 16, false),
+ make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>,
+ &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+ VPX_BITS_8, 32, false)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_AVX2
@@ -615,22 +666,29 @@ INSTANTIATE_TEST_SUITE_P(
INSTANTIATE_TEST_SUITE_P(
NEON, VP9QuantizeTest,
::testing::Values(
- make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16,
+ make_tuple(&QuantWrapper<vpx_quantize_b_neon>,
+ &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false),
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_neon>,
+ &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_8, 16,
+ false),
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_neon>,
+ &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_10, 16,
false),
- make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
- VPX_BITS_8, 16, false),
- make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
- VPX_BITS_10, 16, false),
- make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
- VPX_BITS_12, 16, false),
- make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c,
- VPX_BITS_8, 32, false),
- make_tuple(&vpx_highbd_quantize_b_32x32_neon,
- &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
- make_tuple(&vpx_highbd_quantize_b_32x32_neon,
- &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
- make_tuple(&vpx_highbd_quantize_b_32x32_neon,
- &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false),
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_neon>,
+ &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_12, 16,
+ false),
+ make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>,
+ &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
+ false),
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>,
+ &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32,
+ false),
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>,
+ &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10,
+ 32, false),
+ make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>,
+ &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12,
+ 32, false),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
&QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
@@ -639,11 +697,12 @@ INSTANTIATE_TEST_SUITE_P(
#else
INSTANTIATE_TEST_SUITE_P(
NEON, VP9QuantizeTest,
- ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c,
- VPX_BITS_8, 16, false),
- make_tuple(&vpx_quantize_b_32x32_neon,
- &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
- false),
+ ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_neon>,
+ &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+ 16, false),
+ make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>,
+ &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+ VPX_BITS_8, 32, false),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
&QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
16, true),
@@ -683,9 +742,11 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest,
INSTANTIATE_TEST_SUITE_P(
DISABLED_C, VP9QuantizeTest,
::testing::Values(
- make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false),
- make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8,
- 32, false),
+ make_tuple(&QuantWrapper<vpx_quantize_b_c>,
+ &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false),
+ make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+ &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
+ false),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_c>,
&QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
make_tuple(&QuantFPWrapper<quantize_fp_nz_c>,
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 3e2c9a3c3..da01c346d 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -13,6 +13,7 @@
#include "vpx_util/vpx_thread.h"
+#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_entropy.h"
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index fa222f9dc..4910dc20f 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -542,8 +542,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
switch (tx_size) {
case TX_32X32:
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
- vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+ vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_16X16:
@@ -948,8 +947,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst,
dst_stride);
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
- vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+ vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
scan_order->scan, scan_order->iscan);
}
if (args->enable_coeff_opt && !x->skip_recode) {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index f87ab3e0b..bcadd5777 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -160,12 +160,13 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n,
}
#if !CONFIG_REALTIME_ONLY
-static int model_rd_for_sb_earlyterm(VP9_COMP *cpi, int mi_row, int mi_col,
- BLOCK_SIZE bsize, MACROBLOCK *x,
- MACROBLOCKD *xd, int *out_rate_sum,
- int64_t *out_dist_sum, int *skip_txfm_sb,
- int64_t *skip_sse_sb, int do_earlyterm,
- int64_t best_rd) {
+// Builds the inter prediction plane by plane and computes the RD cost, with an
+// early termination option.
+static int build_inter_pred_model_rd_earlyterm(
+ VP9_COMP *cpi, int mi_row, int mi_col, BLOCK_SIZE bsize, MACROBLOCK *x,
+ MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum,
+ int *skip_txfm_sb, int64_t *skip_sse_sb, int do_earlyterm,
+ int64_t best_rd) {
// Note our transform coeffs are 8 times an orthogonal transform.
// Hence quantizer step is also 8 times. To get effective quantizer
// we need to divide by 8 before sending to modeling function.
@@ -2999,13 +3000,13 @@ static int64_t handle_inter_mode(
xd->plane[j].dst.stride = 64;
}
}
- // Compute RD cost with early termination option
+
filt_best_rd =
cm->interp_filter == SWITCHABLE ? (best_rd - rs_rd) : best_rd;
- if (model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd,
- &rate_sum, &dist_sum, &tmp_skip_sb,
- &tmp_skip_sse, enable_earlyterm,
- filt_best_rd)) {
+ if (build_inter_pred_model_rd_earlyterm(
+ cpi, mi_row, mi_col, bsize, x, xd, &rate_sum, &dist_sum,
+ &tmp_skip_sb, &tmp_skip_sse, enable_earlyterm,
+ filt_best_rd)) {
filter_cache[i] = INT64_MAX;
continue;
}
@@ -3076,9 +3077,9 @@ static int64_t handle_inter_mode(
// Handles the special case when a filter that is not in the
// switchable list (ex. bilinear) is indicated at the frame level, or
// skip condition holds.
- model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate,
- &tmp_dist, &skip_txfm_sb, &skip_sse_sb,
- 0 /*do_earlyterm*/, INT64_MAX);
+ build_inter_pred_model_rd_earlyterm(
+ cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb,
+ &skip_sse_sb, 0 /*do_earlyterm*/, INT64_MAX);
rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
memcpy(bsse, x->bsse, sizeof(bsse));
diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c
index 9c227d560..e81738a7b 100644
--- a/vpx_dsp/arm/quantize_neon.c
+++ b/vpx_dsp/arm/quantize_neon.c
@@ -14,6 +14,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/mem_neon.h"
+#include "vp9/encoder/vp9_block.h"
static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
const int16x8_t dequant,
@@ -213,11 +214,8 @@ quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
// Main difference is that zbin values are halved before comparison and dqcoeff
// values are divided by 2. zbin is rounded but dqcoeff is not.
-void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
@@ -226,10 +224,10 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int i;
// Only the first element of each vector is DC.
- int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1);
- int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
- int16x8_t quant = vld1q_s16(quant_ptr);
- int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
+ int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1);
+ int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1);
+ int16x8_t quant = vld1q_s16(mb_plane->quant);
+ int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift);
int16x8_t dequant = vld1q_s16(dequant_ptr);
// Process first 8 values which include a dc component.
@@ -289,6 +287,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
#endif // __aarch64__
// Need these here, else the compiler complains about mixing declarations and
// code in C90
- (void)n_coeffs;
(void)scan;
}
diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
index 5d6ba64a8..212db45c8 100644
--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c
@@ -14,6 +14,7 @@
#include "vpx_dsp/quantize.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
+#include "vp9/encoder/vp9_block.h"
void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
const int16_t *round_ptr, const int16_t quant,
@@ -208,19 +209,21 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
}
#endif
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
- ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
+ const int n_coeffs = 32 * 32;
+ const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1),
+ ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) };
const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ const int16_t *round_ptr = mb_plane->round;
+ const int16_t *quant_ptr = mb_plane->quant;
+ const int16_t *quant_shift_ptr = mb_plane->quant_shift;
int idx = 0;
- int idx_arr[1024];
+ int idx_arr[32 * 32 /* n_coeffs */];
int i, eob = -1;
(void)iscan;
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index ad8ff6e18..3baf16cc8 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -17,6 +17,9 @@ print <<EOF
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+#endif
EOF
}
@@ -721,7 +724,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/;
- add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c
index 7d8352721..d52f6c664 100644
--- a/vpx_dsp/x86/quantize_avx.c
+++ b/vpx_dsp/x86/quantize_avx.c
@@ -140,15 +140,12 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
*eob_ptr = accumulate_eob(eob);
}
-void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
const __m256i big_zero = _mm256_setzero_si256();
int index;
@@ -160,26 +157,9 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i eob = zero, eob0;
(void)scan;
- (void)n_coeffs;
-
- // Setup global values.
- // The 32x32 halves zbin and round.
- zbin = _mm_load_si128((const __m128i *)zbin_ptr);
- // Shift with rounding.
- zbin = _mm_add_epi16(zbin, one);
- zbin = _mm_srli_epi16(zbin, 1);
- // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
- // it is a strict "greater" comparison.
- zbin = _mm_sub_epi16(zbin, one);
-
- round = _mm_load_si128((const __m128i *)round_ptr);
- round = _mm_add_epi16(round, one);
- round = _mm_srli_epi16(round, 1);
-
- quant = _mm_load_si128((const __m128i *)quant_ptr);
- dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
- shift = _mm_slli_epi16(shift, 1);
+
+ load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
+ &shift);
// Do DC and first 15 AC.
coeff0 = load_tran_low(coeff_ptr);
diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c
index 28f7c9c7d..a8412c5b8 100644
--- a/vpx_dsp/x86/quantize_avx2.c
+++ b/vpx_dsp/x86/quantize_avx2.c
@@ -13,6 +13,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_block.h"
static VPX_FORCE_INLINE void load_b_values_avx2(
const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr,
@@ -250,23 +251,19 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16(
}
}
-void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
__m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
__m256i v_eobmax = _mm256_setzero_si256();
intptr_t count;
- (void)n_coeffs;
(void)scan;
- load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
- &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
- &v_quant_shift, 1);
+ load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round,
+ mb_plane->quant, &v_quant, dequant_ptr, &v_dequant,
+ mb_plane->quant_shift, &v_quant_shift, 1);
// Do DC and first 15 AC.
v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h
index 27bfb4e41..fe42fee01 100644
--- a/vpx_dsp/x86/quantize_sse2.h
+++ b/vpx_dsp/x86/quantize_sse2.h
@@ -15,6 +15,7 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_block.h"
static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
const int16_t *round_ptr, __m128i *round,
@@ -29,6 +30,33 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
*shift = _mm_load_si128((const __m128i *)shift_ptr);
}
+static INLINE void load_b_values32x32(
+ const struct macroblock_plane *const mb_plane, __m128i *zbin,
+ __m128i *round, __m128i *quant, const int16_t *dequant_ptr,
+ __m128i *dequant, __m128i *shift) {
+ const __m128i one = _mm_set1_epi16(1);
+ // Load the quantizer constants for the 32x32 case, which uses half of zbin and round (the other values come from mb_plane / dequant_ptr).
+ *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin);  // Aligned 128-bit load (eight 16-bit lanes).
+ // Halve with rounding: (zbin + 1) >> 1 in every 16-bit lane.
+ *zbin = _mm_add_epi16(*zbin, one);
+ *zbin = _mm_srli_epi16(*zbin, 1);
+ // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
+ // that callers can use a strict "greater" comparison instead.
+ *zbin = _mm_sub_epi16(*zbin, one);
+
+ *round = _mm_load_si128((const __m128i *)mb_plane->round);
+ *round = _mm_add_epi16(*round, one);  // Same rounded halving as zbin: (round + 1) >> 1.
+ *round = _mm_srli_epi16(*round, 1);
+
+ *quant = _mm_load_si128((const __m128i *)mb_plane->quant);
+ *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift);
+ // quant_shift is doubled below. I suspect this is not technically OK because
+ // quant_shift can be up to 1 << 16 and shifting up again will outrange that,
+ // but the test is not comprehensive enough to catch that and "it's been that way forever".
+ *shift = _mm_slli_epi16(*shift, 1);
+}
+
static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round,
const int16_t *quant_ptr, __m128i *quant,
const int16_t *dequant_ptr,
diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c
index 476230286..6fe54d7d9 100644
--- a/vpx_dsp/x86/quantize_ssse3.c
+++ b/vpx_dsp/x86/quantize_ssse3.c
@@ -16,6 +16,7 @@
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_dsp/x86/quantize_sse2.h"
#include "vpx_dsp/x86/quantize_ssse3.h"
+#include "vp9/encoder/vp9_block.h"
void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -107,16 +108,12 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
*eob_ptr = accumulate_eob(eob);
}
-void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
int index;
__m128i zbin, round, quant, dequant, shift;
@@ -127,29 +124,9 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i eob = zero, eob0;
(void)scan;
- (void)n_coeffs;
-
- // Setup global values.
- // The 32x32 halves zbin and round.
- zbin = _mm_load_si128((const __m128i *)zbin_ptr);
- // Shift with rounding.
- zbin = _mm_add_epi16(zbin, one);
- zbin = _mm_srli_epi16(zbin, 1);
- // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
- // it is a strict "greater" comparison.
- zbin = _mm_sub_epi16(zbin, one);
-
- round = _mm_load_si128((const __m128i *)round_ptr);
- round = _mm_add_epi16(round, one);
- round = _mm_srli_epi16(round, 1);
-
- quant = _mm_load_si128((const __m128i *)quant_ptr);
- dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
- // I suspect this is not technically OK because quant_shift can be up
- // to 1 << 16 and shifting up again will outrange that, but the test is not
- // comprehensive enough to catch that and "it's been that way forever"
- shift = _mm_slli_epi16(shift, 1);
+
+ load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
+ &shift);
// Do DC and first 15 AC.
coeff0 = load_tran_low(coeff_ptr);
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 26e82f9b7..141614e7a 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -942,19 +942,111 @@ static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr,
}
}
+static void vpx_filter_block1d8_v8_avx2(  // 8-pixel-wide vertical filter; writes two output rows per loop iteration.
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m256i f[4], ss[4];  // f: filter taps from shuffle_filter_avx2(); ss: interleaved row pairs passed to convolve8_16_avx2().
+ __m256i r[8];
+ __m128i s[9];
+
+ unsigned int y = output_height;
+ // Two rows are produced per iteration, so the source advances by two rows each time.
+ const ptrdiff_t src_stride = src_pitch << 1;
+
+ // The output_height is always a multiple of two.
+ assert(!(output_height & 1));
+
+ shuffle_filter_avx2(filter, f);
+ s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
+ s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
+ s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
+ s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
+ s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+ s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
+ s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
+
+ // Pair consecutive rows: the low 128-bit lane holds row n, the high lane row n + 1.
+ // r[0]: 0 0 0 0 0 0 0 0 r17 r16 r15 r14 r13 r12 r11 r10 | 0 0 0 0 0 0 0 0
+ // r07 r06 r05 r04 r03 r02 r01 r00
+ r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1);
+
+ // r[1]: 0 0 0 0 0 0 0 0 r27 r26 r25 r24 r23 r22 r21 r20 | 0 0 0 0 0 0 0 0
+ // r17 r16 r15 r14 r13 r12 r11 r10
+ r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1);
+
+ // r[2]: 0 0 0 0 0 0 0 0 r37 r36 r35 r34 r33 r32 r31 r30 | 0 0 0 0 0 0 0 0
+ // r27 r26 r25 r24 r23 r22 r21 r20
+ r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1);
+
+ // r[3]: 0 0 0 0 0 0 0 0 r47 r46 r45 r44 r43 r42 r41 r40 | 0 0 0 0 0 0 0 0
+ // r37 r36 r35 r34 r33 r32 r31 r30
+ r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1);
+
+ // r[4]: 0 0 0 0 0 0 0 0 r57 r56 r55 r54 r53 r52 r51 r50 | 0 0 0 0 0 0 0 0
+ // r47 r46 r45 r44 r43 r42 r41 r40
+ r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1);
+
+ // r[5]: 0 0 0 0 0 0 0 0 r67 r66 r65 r64 r63 r62 r61 r60 | 0 0 0 0 0 0 0 0
+ // r57 r56 r55 r54 r53 r52 r51 r50
+ r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[6], 1);
+
+ // Interleave the bytes of adjacent row pairs so vertically neighbouring pixels sit side by side.
+ // ss[0]: |r27 r17|.......|r21 r11|r20 r10 || r17 r07|.....|r12 r02|r11
+ // r01|r10 r00|
+ ss[0] = _mm256_unpacklo_epi8(r[0], r[1]);
+
+ // ss[1]: |r47 r37|.......|r41 r31|r40 r30 || r37 r27|.....|r32 r22|r31
+ // r21|r30 r20|
+ ss[1] = _mm256_unpacklo_epi8(r[2], r[3]);
+
+ // ss[2]: |r67 r57|.......|r61 r51|r60 r50 || r57 r47|.....|r52 r42|r51
+ // r41|r50 r40|
+ ss[2] = _mm256_unpacklo_epi8(r[4], r[5]);
+
+ // Process 2 rows at a time; only source rows 7 and 8 are newly loaded per iteration.
+ do {
+ s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+ s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+
+ // r[6]: 0 0 0 0 0 0 0 0 r77 r76 r75 r74 r73 r72 r71 r70 | 0 0 0 0 0 0 0
+ // 0 r67 r66 r65 r64 r63 r62 r61 r60
+ r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[7], 1);
+ // r[7]: 0 0 0 0 0 0 0 0 r87 r86 r85 r84 r83 r82 r81 r80 | 0 0 0 0 0 0 0
+ // 0 r77 r76 r75 r74 r73 r72 r71 r70
+ r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[8], 1);
+
+ // ss[3] : | r87 r77 | .......| r81 r71 | r80 r70 || r77 r67 | .....| r72
+ // r62 | r71 r61|r70 r60|
+ ss[3] = _mm256_unpacklo_epi8(r[6], r[7]);
+ ss[0] = convolve8_16_avx2(ss, f);
+ ss[0] = _mm256_packus_epi16(ss[0], ss[0]);  // Narrow 16-bit results to unsigned 8-bit with saturation.
+ src_ptr += src_stride;
+
+ /* Shift down two rows: the old row 8 becomes row 6 of the next iteration. */
+ s[6] = s[8];
+ _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(ss[0]));  // Low lane: first output row.
+ output_ptr += out_pitch;
+ _mm_storel_epi64((__m128i *)&output_ptr[0],
+ _mm256_extractf128_si256(ss[0], 1));  // High lane: second output row.
+ output_ptr += out_pitch;
+ ss[0] = ss[1];
+ ss[1] = ss[2];
+ ss[2] = ss[3];
+ y -= 2;
+ } while (y > 1);
+}
+
#if HAVE_AVX2 && HAVE_SSSE3
filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
#if VPX_ARCH_X86_64
filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
-#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3
#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3
#else // VPX_ARCH_X86
filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
-#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3
#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
#endif // VPX_ARCH_X86_64
filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;