12 files changed, 352 insertions, 214 deletions
diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index 587cec692..6a8f1dafb 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -26,6 +26,7 @@
 #include "test/util.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/msvc.h"
@@ -38,8 +39,7 @@ namespace {
 const int number_of_iterations = 100;
 
 typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
-                             const int16_t *zbin, const int16_t *round,
-                             const int16_t *quant, const int16_t *quant_shift,
+                             const macroblock_plane *const mb_plane,
                              tran_low_t *qcoeff, tran_low_t *dqcoeff,
                              const int16_t *dequant, uint16_t *eob,
                              const int16_t *scan, const int16_t *iscan);
@@ -47,6 +47,41 @@ typedef std::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t,
                    int /*max_size*/, bool /*is_fp*/>
     QuantizeParam;
 
+// Wrapper which takes a macroblock_plane.
+typedef void (*QuantizeBaseFunc)(const tran_low_t *coeff, intptr_t count,
+                                 const int16_t *zbin, const int16_t *round,
+                                 const int16_t *quant,
+                                 const int16_t *quant_shift, tran_low_t *qcoeff,
+                                 tran_low_t *dqcoeff, const int16_t *dequant,
+                                 uint16_t *eob, const int16_t *scan,
+                                 const int16_t *iscan);
+
+template <QuantizeBaseFunc fn>
+void QuantWrapper(const tran_low_t *coeff, intptr_t count,
+                  const macroblock_plane *const mb_plane, tran_low_t *qcoeff,
+                  tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob,
+                  const int16_t *scan, const int16_t *iscan) {
+  fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant,
+     mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan, iscan);
+}
+
+// Wrapper for 32x32 version which does not use count.
+typedef void (*Quantize32x32Func)(const tran_low_t *coeff,
+                                  const macroblock_plane *const mb_plane,
+                                  tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                                  const int16_t *dequant, uint16_t *eob,
+                                  const int16_t *scan, const int16_t *iscan);
+
+template <Quantize32x32Func fn>
+void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count,
+                       const macroblock_plane *const mb_plane,
+                       tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                       const int16_t *dequant, uint16_t *eob,
+                       const int16_t *scan, const int16_t *iscan) {
+  (void)count;
+  fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan, iscan);
+}
+
 // Wrapper for FP version which does not use zbin or quant_shift.
 typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count,
                                const int16_t *round, const int16_t *quant,
@@ -56,15 +91,11 @@ typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count,
 
 template <QuantizeFPFunc fn>
 void QuantFPWrapper(const tran_low_t *coeff, intptr_t count,
-                    const int16_t *zbin, const int16_t *round,
-                    const int16_t *quant, const int16_t *quant_shift,
-                    tran_low_t *qcoeff, tran_low_t *dqcoeff,
-                    const int16_t *dequant, uint16_t *eob, const int16_t *scan,
-                    const int16_t *iscan) {
-  (void)zbin;
-  (void)quant_shift;
-
-  fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan);
+                    const macroblock_plane *const mb_plane, tran_low_t *qcoeff,
+                    tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob,
+                    const int16_t *scan, const int16_t *iscan) {
+  fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff,
+     dequant, eob, scan, iscan);
 }
 
 void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
@@ -119,17 +150,21 @@ class VP9QuantizeBase : public AbstractBench {
 #else
     max_value_ = (1 << bit_depth_) - 1;
 #endif
-    zbin_ptr_ =
+
+    mb_plane_ = reinterpret_cast<macroblock_plane *>(
+        vpx_memalign(16, sizeof(macroblock_plane)));
+
+    zbin_ptr_ = mb_plane_->zbin =
         reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_)));
-    round_fp_ptr_ = reinterpret_cast<int16_t *>(
+    round_fp_ptr_ = mb_plane_->round_fp = reinterpret_cast<int16_t *>(
         vpx_memalign(16, 8 * sizeof(*round_fp_ptr_)));
-    quant_fp_ptr_ = reinterpret_cast<int16_t *>(
+    quant_fp_ptr_ = mb_plane_->quant_fp = reinterpret_cast<int16_t *>(
         vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_)));
-    round_ptr_ =
+    round_ptr_ = mb_plane_->round =
         reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_)));
-    quant_ptr_ =
+    quant_ptr_ = mb_plane_->quant =
         reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*quant_ptr_)));
-    quant_shift_ptr_ = reinterpret_cast<int16_t *>(
+    quant_shift_ptr_ = mb_plane_->quant_shift = reinterpret_cast<int16_t *>(
         vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_)));
     dequant_ptr_ = reinterpret_cast<int16_t *>(
         vpx_memalign(16, 8 * sizeof(*dequant_ptr_)));
@@ -139,6 +174,7 @@ class VP9QuantizeBase : public AbstractBench {
   }
 
   ~VP9QuantizeBase() {
+    vpx_free(mb_plane_);
     vpx_free(zbin_ptr_);
     vpx_free(round_fp_ptr_);
     vpx_free(quant_fp_ptr_);
@@ -146,6 +182,7 @@ class VP9QuantizeBase : public AbstractBench {
     vpx_free(quant_ptr_);
     vpx_free(quant_shift_ptr_);
     vpx_free(dequant_ptr_);
+    mb_plane_ = nullptr;
     zbin_ptr_ = nullptr;
     round_fp_ptr_ = nullptr;
     quant_fp_ptr_ = nullptr;
@@ -157,9 +194,10 @@ class VP9QuantizeBase : public AbstractBench {
   }
 
  protected:
+  macroblock_plane *mb_plane_;
   int16_t *zbin_ptr_;
-  int16_t *round_fp_ptr_;
   int16_t *quant_fp_ptr_;
+  int16_t *round_fp_ptr_;
   int16_t *round_ptr_;
   int16_t *quant_ptr_;
   int16_t *quant_shift_ptr_;
@@ -193,8 +231,7 @@ class VP9QuantizeTest : public VP9QuantizeBase,
 };
 
 void VP9QuantizeTest::Run() {
-  quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
-               quant_shift_ptr_, qcoeff_.TopLeftPixel(),
+  quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(),
                dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan,
                scan_->iscan);
 }
@@ -266,8 +303,8 @@ void VP9QuantizeTest::Speed(bool is_median) {
 
         vpx_usec_timer_start(&timer);
         for (int n = 0; n < kNumTests; ++n) {
-          ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_,
-                           q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
+          ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_,
+                           ref_qcoeff.TopLeftPixel(),
                            ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
                            scan_->scan, scan_->iscan);
         }
@@ -275,10 +312,9 @@ void VP9QuantizeTest::Speed(bool is_median) {
 
         vpx_usec_timer_start(&simd_timer);
         for (int n = 0; n < kNumTests; ++n) {
-          quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
-                       quant_shift_ptr_, qcoeff_.TopLeftPixel(),
-                       dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_,
-                       scan_->scan, scan_->iscan);
+          quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_,
+                       qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(),
+                       dequant_ptr_, &eob_, scan_->scan, scan_->iscan);
         }
         vpx_usec_timer_mark(&simd_timer);
 
@@ -417,15 +453,14 @@ TEST_P(VP9QuantizeTest, OperationCheck) {
     GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
                          quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
                          quant_fp_ptr_);
-    ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
-                     quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
-                     ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
-                     scan_->scan, scan_->iscan);
+    ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_,
+                     ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
+                     dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan);
 
-    ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
-        quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(),
-        dequant_ptr_, &eob_, scan_->scan, scan_->iscan));
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_,
+                                          mb_plane_, qcoeff_.TopLeftPixel(),
+                                          dqcoeff_.TopLeftPixel(), dequant_ptr_,
+                                          &eob_, scan_->scan, scan_->iscan));
 
     EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff));
     EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff));
@@ -475,15 +510,14 @@ TEST_P(VP9QuantizeTest, EOBCheck) {
     GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
                          quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
                          quant_fp_ptr_);
-    ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
-                     quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
-                     ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
-                     scan_->scan, scan_->iscan);
+    ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_,
+                     ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
+                     dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan);
 
-    ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
-        quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(),
-        dequant_ptr_, &eob_, scan_->scan, scan_->iscan));
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_,
+                                          mb_plane_, qcoeff_.TopLeftPixel(),
+                                          dqcoeff_.TopLeftPixel(), dequant_ptr_,
+                                          &eob_, scan_->scan, scan_->iscan));
 
     EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff));
     EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff));
@@ -510,28 +544,35 @@ using std::make_tuple;
 INSTANTIATE_TEST_SUITE_P(
     SSE2, VP9QuantizeTest,
     ::testing::Values(
-        make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16,
-                   false),
+        make_tuple(&QuantWrapper<vpx_quantize_b_sse2>,
+                   &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false),
         make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
                    &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
-        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_8, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_10, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_12, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false)));
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_8, 16,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_10, 16,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_12, 16,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10,
+                   32, false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12,
+                   32, false)));
 
 #else
 INSTANTIATE_TEST_SUITE_P(
     SSE2, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16, false),
+    ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_sse2>,
+                                 &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+                                 16, false),
                       make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
                                  &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
                                  16, true)));
@@ -541,11 +582,12 @@ INSTANTIATE_TEST_SUITE_P(
 #if HAVE_SSSE3
 INSTANTIATE_TEST_SUITE_P(
     SSSE3, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16, false),
-                      make_tuple(&vpx_quantize_b_32x32_ssse3,
-                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
-                                 false),
+    ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_ssse3>,
+                                 &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+                                 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_ssse3>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false),
                       make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>,
                                  &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
                                  16, true),
@@ -555,13 +597,14 @@ INSTANTIATE_TEST_SUITE_P(
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX
-INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest,
-                         ::testing::Values(make_tuple(&vpx_quantize_b_avx,
-                                                      &vpx_quantize_b_c,
-                                                      VPX_BITS_8, 16, false),
-                                           make_tuple(&vpx_quantize_b_32x32_avx,
-                                                      &vpx_quantize_b_32x32_c,
-                                                      VPX_BITS_8, 32, false)));
+INSTANTIATE_TEST_SUITE_P(
+    AVX, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_avx>,
+                                 &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+                                 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false)));
 #endif  // HAVE_AVX
 
 #if VPX_ARCH_X86_64 && HAVE_AVX2
@@ -577,22 +620,29 @@ INSTANTIATE_TEST_SUITE_P(
         make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_avx2>,
                    &QuantFPWrapper<vp9_highbd_quantize_fp_32x32_c>, VPX_BITS_12,
                    32, true),
-        make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16,
+        make_tuple(&QuantWrapper<vpx_quantize_b_avx2>,
+                   &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_avx2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_8, 16,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_avx2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_10, 16,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_avx2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_12, 16,
+                   false),
+        make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>,
+                   &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
                    false),
-        make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_8, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_10, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_12, 16, false),
-        make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c,
-                   VPX_BITS_8, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false)));
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10,
+                   32, false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12,
+                   32, false)));
 #else
 INSTANTIATE_TEST_SUITE_P(
     AVX2, VP9QuantizeTest,
@@ -602,11 +652,12 @@ INSTANTIATE_TEST_SUITE_P(
                       make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_avx2>,
                                  &QuantFPWrapper<quantize_fp_32x32_nz_c>,
                                  VPX_BITS_8, 32, true),
-                      make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16, false),
-                      make_tuple(&vpx_quantize_b_32x32_avx2,
-                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
-                                 false)));
+                      make_tuple(&QuantWrapper<vpx_quantize_b_avx2>,
+                                 &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+                                 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_AVX2
 
@@ -615,22 +666,29 @@ INSTANTIATE_TEST_SUITE_P(
 INSTANTIATE_TEST_SUITE_P(
     NEON, VP9QuantizeTest,
     ::testing::Values(
-        make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16,
+        make_tuple(&QuantWrapper<vpx_quantize_b_neon>,
+                   &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_8, 16,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_10, 16,
                    false),
-        make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_8, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_10, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_12, 16, false),
-        make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c,
-                   VPX_BITS_8, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_neon,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_neon,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_neon,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_12, 16,
+                   false),
+        make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>,
+                   &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10,
+                   32, false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12,
+                   32, false),
         make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
                    &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
         make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
@@ -639,11 +697,12 @@ INSTANTIATE_TEST_SUITE_P(
 #else
 INSTANTIATE_TEST_SUITE_P(
     NEON, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16, false),
-                      make_tuple(&vpx_quantize_b_32x32_neon,
-                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
-                                 false),
+    ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_neon>,
+                                 &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+                                 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false),
                       make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
                                  &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
                                  16, true),
@@ -683,9 +742,11 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest,
 INSTANTIATE_TEST_SUITE_P(
     DISABLED_C, VP9QuantizeTest,
     ::testing::Values(
-        make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false),
-        make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8,
-                   32, false),
+        make_tuple(&QuantWrapper<vpx_quantize_b_c>,
+                   &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false),
+        make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                   &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
         make_tuple(&QuantFPWrapper<vp9_quantize_fp_c>,
                    &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
         make_tuple(&QuantFPWrapper<quantize_fp_nz_c>,
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 3e2c9a3c3..da01c346d 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -13,6 +13,7 @@
 
 #include "vpx_util/vpx_thread.h"
 
+#include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_entropy.h"
 
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index fa222f9dc..4910dc20f 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -542,8 +542,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
   switch (tx_size) {
     case TX_32X32:
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-      vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant,
-                           p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+      vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
                            scan_order->scan, scan_order->iscan);
       break;
     case TX_16X16:
@@ -948,8 +947,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
         vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst,
                            dst_stride);
         fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant,
-                             p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+        vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
                              scan_order->scan, scan_order->iscan);
       }
       if (args->enable_coeff_opt && !x->skip_recode) {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index f87ab3e0b..bcadd5777 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -160,12 +160,13 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n,
 }
 
 #if !CONFIG_REALTIME_ONLY
-static int model_rd_for_sb_earlyterm(VP9_COMP *cpi, int mi_row, int mi_col,
-                                     BLOCK_SIZE bsize, MACROBLOCK *x,
-                                     MACROBLOCKD *xd, int *out_rate_sum,
-                                     int64_t *out_dist_sum, int *skip_txfm_sb,
-                                     int64_t *skip_sse_sb, int do_earlyterm,
-                                     int64_t best_rd) {
+// Builds the inter prediction plane by plane and computes the RD cost, with an
+// early termination option.
+static int build_inter_pred_model_rd_earlyterm(
+    VP9_COMP *cpi, int mi_row, int mi_col, BLOCK_SIZE bsize, MACROBLOCK *x,
+    MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum,
+    int *skip_txfm_sb, int64_t *skip_sse_sb, int do_earlyterm,
+    int64_t best_rd) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -2999,13 +3000,13 @@ static int64_t handle_inter_mode(
               xd->plane[j].dst.stride = 64;
             }
           }
-          // Compute RD cost with early termination option
+
           filt_best_rd =
               cm->interp_filter == SWITCHABLE ? (best_rd - rs_rd) : best_rd;
-          if (model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd,
-                                        &rate_sum, &dist_sum, &tmp_skip_sb,
-                                        &tmp_skip_sse, enable_earlyterm,
-                                        filt_best_rd)) {
+          if (build_inter_pred_model_rd_earlyterm(
+                  cpi, mi_row, mi_col, bsize, x, xd, &rate_sum, &dist_sum,
+                  &tmp_skip_sb, &tmp_skip_sse, enable_earlyterm,
+                  filt_best_rd)) {
             filter_cache[i] = INT64_MAX;
             continue;
           }
@@ -3076,9 +3077,9 @@ static int64_t handle_inter_mode(
     // Handles the special case when a filter that is not in the
     // switchable list (ex. bilinear) is indicated at the frame level, or
     // skip condition holds.
-    model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate,
-                              &tmp_dist, &skip_txfm_sb, &skip_sse_sb,
-                              0 /*do_earlyterm*/, INT64_MAX);
+    build_inter_pred_model_rd_earlyterm(
+        cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb,
+        &skip_sse_sb, 0 /*do_earlyterm*/, INT64_MAX);
     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
     memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
     memcpy(bsse, x->bsse, sizeof(bsse));
diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c
index 9c227d560..e81738a7b 100644
--- a/vpx_dsp/arm/quantize_neon.c
+++ b/vpx_dsp/arm/quantize_neon.c
@@ -14,6 +14,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/mem_neon.h"
+#include "vp9/encoder/vp9_block.h"
 
 static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
                                                const int16x8_t dequant,
@@ -213,11 +214,8 @@ quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
 
 // Main difference is that zbin values are halved before comparison and dqcoeff
 // values are divided by 2. zbin is rounded but dqcoeff is not.
-void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                               const int16_t *zbin_ptr,
-                               const int16_t *round_ptr,
-                               const int16_t *quant_ptr,
-                               const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr,
+                               const struct macroblock_plane *const mb_plane,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
@@ -226,10 +224,10 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   int i;
 
   // Only the first element of each vector is DC.
-  int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1);
-  int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
-  int16x8_t quant = vld1q_s16(quant_ptr);
-  int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
+  int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1);
+  int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1);
+  int16x8_t quant = vld1q_s16(mb_plane->quant);
+  int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift);
   int16x8_t dequant = vld1q_s16(dequant_ptr);
 
   // Process first 8 values which include a dc component.
@@ -289,6 +287,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
 #endif  // __aarch64__
   // Need these here, else the compiler complains about mixing declarations and
   // code in C90
-  (void)n_coeffs;
   (void)scan;
 }
diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
index 5d6ba64a8..212db45c8 100644
--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c
@@ -14,6 +14,7 @@
 #include "vpx_dsp/quantize.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vp9/encoder/vp9_block.h"
 
 void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                      const int16_t *round_ptr, const int16_t quant,
@@ -208,19 +209,21 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
 }
 #endif
 
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            const int16_t *zbin_ptr, const int16_t *round_ptr,
-                            const int16_t *quant_ptr,
-                            const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
+                            const struct macroblock_plane *const mb_plane,
                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
                             const int16_t *scan, const int16_t *iscan) {
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
+  const int n_coeffs = 32 * 32;
+  const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1),
+                         ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) };
   const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  const int16_t *round_ptr = mb_plane->round;
+  const int16_t *quant_ptr = mb_plane->quant;
+  const int16_t *quant_shift_ptr = mb_plane->quant_shift;
 
   int idx = 0;
-  int idx_arr[1024];
+  int idx_arr[32 * 32 /* n_coeffs */];
   int i, eob = -1;
   (void)iscan;
 
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index ad8ff6e18..3baf16cc8 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -17,6 +17,9 @@ print <<EOF
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+#endif
 
 EOF
 }
@@ -721,7 +724,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
   add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/;
 
-  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/;
 
   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c
index 7d8352721..d52f6c664 100644
--- a/vpx_dsp/x86/quantize_avx.c
+++ b/vpx_dsp/x86/quantize_avx.c
@@ -140,15 +140,12 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   *eob_ptr = accumulate_eob(eob);
 }
 
-void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                              const int16_t *zbin_ptr, const int16_t *round_ptr,
-                              const int16_t *quant_ptr,
-                              const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr,
+                              const struct macroblock_plane *const mb_plane,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
                               const int16_t *scan, const int16_t *iscan) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
   const __m256i big_zero = _mm256_setzero_si256();
   int index;
 
@@ -160,26 +157,9 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   __m128i eob = zero, eob0;
 
   (void)scan;
-  (void)n_coeffs;
-
-  // Setup global values.
-  // The 32x32 halves zbin and round.
-  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
-  // Shift with rounding.
-  zbin = _mm_add_epi16(zbin, one);
-  zbin = _mm_srli_epi16(zbin, 1);
-  // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
-  // it is a strict "greater" comparison.
-  zbin = _mm_sub_epi16(zbin, one);
-
-  round = _mm_load_si128((const __m128i *)round_ptr);
-  round = _mm_add_epi16(round, one);
-  round = _mm_srli_epi16(round, 1);
-
-  quant = _mm_load_si128((const __m128i *)quant_ptr);
-  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
-  shift = _mm_slli_epi16(shift, 1);
+
+  load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
+                     &shift);
 
   // Do DC and first 15 AC.
   coeff0 = load_tran_low(coeff_ptr);
diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c
index 28f7c9c7d..a8412c5b8 100644
--- a/vpx_dsp/x86/quantize_avx2.c
+++ b/vpx_dsp/x86/quantize_avx2.c
@@ -13,6 +13,7 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_block.h"
 
 static VPX_FORCE_INLINE void load_b_values_avx2(
     const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr,
@@ -250,23 +251,19 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16(
   }
 }
 
-void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                               const int16_t *zbin_ptr,
-                               const int16_t *round_ptr,
-                               const int16_t *quant_ptr,
-                               const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr,
+                               const struct macroblock_plane *const mb_plane,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
   __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
   __m256i v_eobmax = _mm256_setzero_si256();
   intptr_t count;
-  (void)n_coeffs;
   (void)scan;
 
-  load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
-                     &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
-                     &v_quant_shift, 1);
+  load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round,
+                     mb_plane->quant, &v_quant, dequant_ptr, &v_dequant,
+                     mb_plane->quant_shift, &v_quant_shift, 1);
 
   // Do DC and first 15 AC.
   v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h
index 27bfb4e41..fe42fee01 100644
--- a/vpx_dsp/x86/quantize_sse2.h
+++ b/vpx_dsp/x86/quantize_sse2.h
@@ -15,6 +15,7 @@
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_block.h"
 
 static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
                                  const int16_t *round_ptr, __m128i *round,
@@ -29,6 +30,33 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
   *shift = _mm_load_si128((const __m128i *)shift_ptr);
 }
 
+// Loads the 32x32 quantizer constants: zbin and round are halved (rounding
+// up), quant and dequant are loaded as-is, and quant_shift is doubled.
+static INLINE void load_b_values32x32(
+    const struct macroblock_plane *const mb_plane, __m128i *zbin,
+    __m128i *round, __m128i *quant, const int16_t *dequant_ptr,
+    __m128i *dequant, __m128i *shift) {
+  const __m128i one = _mm_set1_epi16(1);
+  *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin);
+  // Halve with rounding: (zbin + 1) >> 1.
+  *zbin = _mm_add_epi16(*zbin, one);
+  *zbin = _mm_srli_epi16(*zbin, 1);
+  // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
+  // it is a strict "greater" comparison.
+  *zbin = _mm_sub_epi16(*zbin, one);
+
+  *round = _mm_load_si128((const __m128i *)mb_plane->round);
+  *round = _mm_add_epi16(*round, one);
+  *round = _mm_srli_epi16(*round, 1);
+
+  *quant = _mm_load_si128((const __m128i *)mb_plane->quant);
+  *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift);
+  // NOTE(review): suspected overflow - this 16-bit shift drops the top bit of
+  // large quant_shift values. Tests do not cover it; long-standing behavior.
+  *shift = _mm_slli_epi16(*shift, 1);
+}
+
 static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round,
                                   const int16_t *quant_ptr, __m128i *quant,
                                   const int16_t *dequant_ptr,
diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c
index 476230286..6fe54d7d9 100644
--- a/vpx_dsp/x86/quantize_ssse3.c
+++ b/vpx_dsp/x86/quantize_ssse3.c
@@ -16,6 +16,7 @@
 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
 #include "vpx_dsp/x86/quantize_sse2.h"
 #include "vpx_dsp/x86/quantize_ssse3.h"
+#include "vp9/encoder/vp9_block.h"
 
 void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                           const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -107,16 +108,12 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   *eob_ptr = accumulate_eob(eob);
 }
 
-void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                                const int16_t *zbin_ptr,
-                                const int16_t *round_ptr,
-                                const int16_t *quant_ptr,
-                                const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr,
+                                const struct macroblock_plane *const mb_plane,
                                 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                 const int16_t *scan, const int16_t *iscan) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
   int index;
 
   __m128i zbin, round, quant, dequant, shift;
@@ -127,29 +124,9 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   __m128i eob = zero, eob0;
 
   (void)scan;
-  (void)n_coeffs;
-
-  // Setup global values.
-  // The 32x32 halves zbin and round.
-  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
-  // Shift with rounding.
-  zbin = _mm_add_epi16(zbin, one);
-  zbin = _mm_srli_epi16(zbin, 1);
-  // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
-  // it is a strict "greater" comparison.
-  zbin = _mm_sub_epi16(zbin, one);
-
-  round = _mm_load_si128((const __m128i *)round_ptr);
-  round = _mm_add_epi16(round, one);
-  round = _mm_srli_epi16(round, 1);
-
-  quant = _mm_load_si128((const __m128i *)quant_ptr);
-  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
-  // I suspect this is not technically OK because quant_shift can be up
-  // to 1 << 16 and shifting up again will outrange that, but the test is not
-  // comprehensive enough to catch that and "it's been that way forever"
-  shift = _mm_slli_epi16(shift, 1);
+
+  load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
+                     &shift);
 
   // Do DC and first 15 AC.
   coeff0 = load_tran_low(coeff_ptr);
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 26e82f9b7..141614e7a 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -942,19 +942,111 @@ static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr,
   }
 }
 
+// Vertical 8-tap filter for an 8-byte-wide block. Two output rows are
+// produced per loop iteration: the low 128-bit lane carries output row n and
+// the high lane carries row n + 1. Rows 0..6 are loaded up front; each
+// iteration loads two more rows and reuses the interleaved pairs in ss[].
+// output_height must be even.
+static void vpx_filter_block1d8_v8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m256i f[4], ss[4];
+  __m256i r[8];
+  __m128i s[9];
+
+  unsigned int y = output_height;
+  // Each iteration consumes two source rows, so step by twice the pitch.
+  const ptrdiff_t src_stride = src_pitch << 1;
+
+  // The output_height is always a multiple of two.
+  assert(!(output_height & 1));
+
+  shuffle_filter_avx2(filter, f);
+  s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
+  s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
+  s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
+  s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
+  s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+  s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
+  s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
+
+  // Pair each row with the next: low lane = row k, high lane = row k + 1.
+  // r[0]:    0 0 0 0 0 0 0 0 r17 r16 r15 r14 r13 r12 r11 r10 | 0 0 0 0 0 0 0 0
+  // r07 r06 r05 r04 r03 r02 r01 r00
+  r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1);
+
+  // r[1]: high lane = row 2, low lane = row 1 (byte layout as in r[0]).
+  r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1);
+
+  // r[2]: high lane = row 3, low lane = row 2 (byte layout as in r[0]).
+  r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1);
+
+  // r[3]: high lane = row 4, low lane = row 3 (byte layout as in r[0]).
+  r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1);
+
+  // r[4]: high lane = row 5, low lane = row 4 (byte layout as in r[0]).
+  r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1);
+
+  // r[5]: high lane = row 6, low lane = row 5 (byte layout as in r[0]).
+  r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[6], 1);
+
+  // Byte-interleave consecutive row pairs within each lane.
+  // ss[0]: |r27 r17|.......|r21 r11|r20 r10 || r17 r07|.....|r12 r02|r11
+  // r01|r10 r00|
+  ss[0] = _mm256_unpacklo_epi8(r[0], r[1]);
+
+  // ss[1]: |r47 r37|.......|r41 r31|r40 r30 || r37 r27|.....|r32 r22|r31
+  // r21|r30 r20|
+  ss[1] = _mm256_unpacklo_epi8(r[2], r[3]);
+
+  // ss[2]: |r67 r57|.......|r61 r51|r60 r50 || r57 r47|.....|r52 r42|r51
+  // r41|r50 r40|
+  ss[2] = _mm256_unpacklo_epi8(r[4], r[5]);
+
+  // Process 2 rows at a time
+  do {
+    s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+    s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+
+    // r[6]:    0 0 0 0 0 0 0 0 r77 r76 r75 r74 r73 r72 r71 r70 | 0 0 0 0 0 0 0
+    // 0 r67 r66 r65 r64 r63 r62 r61 r60
+    r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[7], 1);
+    // r[7]:    0 0 0 0 0 0 0 0 r87 r86 r85 r84 r83 r82 r81 r80 | 0 0 0 0 0 0 0
+    // 0 r77 r76 r75 r74 r73 r72 r71 r70
+    r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[8], 1);
+
+    // ss[3] : | r87 r77 | .......| r81 r71 | r80 r70 || r77 r67 | .....| r72
+    // r62 | r71 r61|r70 r60|
+    ss[3] = _mm256_unpacklo_epi8(r[6], r[7]);
+    ss[0] = convolve8_16_avx2(ss, f);
+    ss[0] = _mm256_packus_epi16(ss[0], ss[0]);
+    src_ptr += src_stride;
+
+    /* shift down two rows */
+    s[6] = s[8];
+    _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(ss[0]));
+    output_ptr += out_pitch;
+    _mm_storel_epi64((__m128i *)&output_ptr[0],
+                     _mm256_extractf128_si256(ss[0], 1));
+    output_ptr += out_pitch;
+    ss[0] = ss[1];
+    ss[1] = ss[2];
+    ss[2] = ss[3];
+    y -= 2;
+  } while (y > 1);
+}
+
 #if HAVE_AVX2 && HAVE_SSSE3
 filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
 #if VPX_ARCH_X86_64
 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
 filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
 filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
-#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3
 #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3
 #else  // VPX_ARCH_X86
 filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
 filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
 filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
-#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3
 #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
 #endif  // VPX_ARCH_X86_64
 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;