9 files changed, 171 insertions, 674 deletions
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 83c110264..e6c24f05a 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -164,12 +164,12 @@ struct macroblock {
   void (*fwd_txm16x16)(int16_t *input, int16_t *output, int pitch);
   void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
                          int y_blocks);
+#if !CONFIG_SB8X8
   void (*quantize_b_4x4_pair)(MACROBLOCK *x, int b_idx1, int b_idx2,
                               int y_blocks);
-  void (*quantize_b_16x16)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
-                           int y_blocks);
   void (*quantize_b_8x8)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
                          int y_blocks);
+#endif
 };
 
 #endif  // VP9_ENCODER_VP9_BLOCK_H_
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 9c07b9cd6..4117852c1 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -2435,13 +2435,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
 
     vp9_encode_intra4x4mby(x, bsize);
     vp9_build_intra_predictors_sbuv_s(&x->e_mbd, bsize);
-    vp9_subtract_sbuv(x, bsize);
-    vp9_transform_sbuv_4x4(x, bsize);
-    vp9_quantize_sbuv_4x4(x, bsize);
-    if (x->optimize)
-      vp9_optimize_sbuv(cm, x, bsize);
-    vp9_inverse_transform_sbuv_4x4(xd, bsize);
-    vp9_recon_sbuv(xd, bsize);
+    vp9_encode_sbuv(cm, x, bsize);
 
     if (output_enabled)
       sum_intra_stats(cpi, x);
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index c5f29fe7e..d5574db16 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -104,63 +104,16 @@ void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bsize) {
 
 void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x) {
   MACROBLOCKD *xd = &x->e_mbd;
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
   vp9_build_intra_predictors_sby_s(xd, BLOCK_SIZE_MB16X16);
-  vp9_subtract_sby(x, BLOCK_SIZE_MB16X16);
-
-  switch (tx_size) {
-    case TX_16X16:
-      vp9_transform_sby_16x16(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sby_16x16(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sby_16x16(xd, BLOCK_SIZE_MB16X16);
-      break;
-    case TX_8X8:
-      vp9_transform_sby_8x8(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sby_8x8(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sby_8x8(xd, BLOCK_SIZE_MB16X16);
-      break;
-    default:
-      vp9_transform_sby_4x4(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sby_4x4(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sby_4x4(xd, BLOCK_SIZE_MB16X16);
-      break;
-  }
-
-  vp9_recon_sby(xd, BLOCK_SIZE_MB16X16);
+  vp9_encode_sby(cm, x, BLOCK_SIZE_MB16X16);  // subtract, encode_block, recon
 }
 
 void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x) {
   MACROBLOCKD *xd = &x->e_mbd;
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
   vp9_build_intra_predictors_sbuv_s(xd, BLOCK_SIZE_MB16X16);
-  vp9_subtract_sbuv(x, BLOCK_SIZE_MB16X16);
-
-  switch (tx_size) {
-    case TX_4X4:
-      vp9_transform_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sbuv_4x4(xd, BLOCK_SIZE_MB16X16);
-      break;
-    default:  // 16x16 or 8x8
-      vp9_transform_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sbuv_8x8(xd, BLOCK_SIZE_MB16X16);
-      break;
-    }
-
-  vp9_recon_sbuv(xd, BLOCK_SIZE_MB16X16);
+  vp9_encode_sbuv(cm, x, BLOCK_SIZE_MB16X16);  // subtract, encode_block, recon
 }
 
 #if !CONFIG_SB8X8
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 5f00b7063..6e28f90cd 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -67,143 +67,6 @@ void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
 }
 
 
-void vp9_transform_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 3, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 3);
-  const int stride = 32 << bwl;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-
-    vp9_short_fdct32x32(x->plane[0].src_diff + y_idx * stride * 32 + x_idx * 32,
-                        x->plane[0].coeff + n * 1024, stride * 2);
-  }
-}
-
-void vp9_transform_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 2);
-  const int stride = 16 << bwl, bstride = 4 << bwl;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    const TX_TYPE tx_type = get_tx_type_16x16(xd,
-                                              (y_idx * bstride + x_idx) * 4);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht16x16(x->plane[0].src_diff +
-                             y_idx * stride * 16 + x_idx * 16,
-                         x->plane[0].coeff + n * 256, stride, tx_type);
-    } else {
-      x->fwd_txm16x16(x->plane[0].src_diff + y_idx * stride * 16 + x_idx * 16,
-                      x->plane[0].coeff + n * 256, stride * 2);
-    }
-  }
-}
-
-void vp9_transform_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 1);
-  const int stride = 8 << bwl, bstride = 2 << bwl;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * bstride + x_idx) * 2);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(x->plane[0].src_diff + y_idx * stride * 8 + x_idx * 8,
-                       x->plane[0].coeff + n * 64, stride, tx_type);
-    } else {
-      x->fwd_txm8x8(x->plane[0].src_diff + y_idx * stride * 8 + x_idx * 8,
-                    x->plane[0].coeff + n * 64, stride * 2);
-    }
-  }
-}
-
-void vp9_transform_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bw = 1 << bwl;
-  const int bh = 1 << b_height_log2(bsize);
-  const int stride = 4 << bwl;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, n);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(x->plane[0].src_diff + y_idx * stride * 4 + x_idx * 4,
-                       x->plane[0].coeff + n * 16, stride, tx_type);
-    } else {
-      x->fwd_txm4x4(x->plane[0].src_diff + y_idx * stride * 4 + x_idx * 4,
-                    x->plane[0].coeff + n * 16, stride * 2);
-    }
-  }
-}
-
-void vp9_transform_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  assert(bsize == BLOCK_SIZE_SB64X64);
-  vp9_clear_system_state();
-  vp9_short_fdct32x32(x->plane[1].src_diff, x->plane[1].coeff, 64);
-  vp9_short_fdct32x32(x->plane[2].src_diff, x->plane[2].coeff, 64);
-}
-
-void vp9_transform_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2, bhl = b_height_log2(bsize) - 2;
-  const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
-  const int stride = 16 << (bwl - 1);
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-
-    x->fwd_txm16x16(x->plane[1].src_diff + y_idx * stride * 16 + x_idx * 16,
-                    x->plane[1].coeff + n * 256, stride * 2);
-    x->fwd_txm16x16(x->plane[2].src_diff + y_idx * stride * 16 + x_idx * 16,
-                    x->plane[2].coeff + n * 256, stride * 2);
-  }
-}
-
-void vp9_transform_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1, bhl = b_height_log2(bsize) - 1;
-  const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
-  const int stride = 8 << (bwl - 1);
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-
-    x->fwd_txm8x8(x->plane[1].src_diff + y_idx * stride * 8 + x_idx * 8,
-                  x->plane[1].coeff + n * 64, stride * 2);
-    x->fwd_txm8x8(x->plane[2].src_diff + y_idx * stride * 8 + x_idx * 8,
-                  x->plane[2].coeff + n * 64, stride * 2);
-  }
-}
-
-void vp9_transform_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-  const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
-  const int stride = 4 << (bwl - 1);
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-
-    x->fwd_txm4x4(x->plane[1].src_diff + y_idx * stride * 4 + x_idx * 4,
-                  x->plane[1].coeff + n * 16, stride * 2);
-    x->fwd_txm4x4(x->plane[2].src_diff + y_idx * stride * 4 + x_idx * 4,
-                  x->plane[2].coeff + n * 16, stride * 2);
-  }
-}
-
 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 #define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 typedef struct vp9_token_state vp9_token_state;
@@ -561,7 +424,7 @@ struct encode_b_args {
   struct optimize_ctx *ctx;
 };
 
-static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
                          int ss_txfrm_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK* const x = args->x;
@@ -572,9 +435,6 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
   int16_t* const src_diff = raster_block_offset_int16(xd, bsize, plane,
                                                       raster_block,
                                                       x->plane[plane].src_diff);
-  int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
-                                                  raster_block,
-                                                  xd->plane[plane].diff);
   TX_TYPE tx_type = DCT_DCT;
 
   switch (ss_txfrm_size / 2) {
@@ -624,6 +484,23 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
   }
 
   vp9_quantize(x, plane, block, 16 << ss_txfrm_size, tx_type);
+}
+
+static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                         int ss_txfrm_size, void *arg) {
+  struct encode_b_args* const args = arg;
+  MACROBLOCK* const x = args->x;
+  MACROBLOCKD* const xd = &x->e_mbd;
+  const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
+  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
+                                                       block, ss_txfrm_size);
+  int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
+                                                  raster_block,
+                                                  xd->plane[plane].diff);
+  TX_TYPE tx_type = DCT_DCT;
+
+  xform_quant(plane, block, bsize, ss_txfrm_size, arg);
+
   if (x->optimize)
     vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, x, args->ctx);
 
@@ -633,6 +510,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
                           diff, bw * 2);
       break;
     case TX_16X16:
+      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
       if (tx_type == DCT_DCT) {
         vp9_short_idct16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
                             diff, bw * 2);
@@ -642,6 +520,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
       }
       break;
     case TX_8X8:
+      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
       if (tx_type == DCT_DCT) {
         vp9_short_idct8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
                           diff, bw * 2);
@@ -651,6 +530,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
       }
       break;
     case TX_4X4:
+      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
       if (tx_type == DCT_DCT) {
         // this is like vp9_short_idct4x4 but has a special case around eob<=1
         // which is significant (not just an optimization) for the lossless
@@ -665,6 +545,60 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
   }
 }
 
+// Luma-only pass: invokes xform_quant on each transform block of plane 0.
+void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
+                         BLOCK_SIZE_TYPE bsize) {
+  // No optimize_ctx is supplied on this path, hence the NULL third member.
+  struct encode_b_args args = {cm, x, NULL};
+  foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
+#if !CONFIG_SB8X8
+                                     0,
+#endif
+                                     xform_quant, &args);
+}
+
+// Chroma pass: invokes xform_quant on each transform block of the uv planes.
+void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
+                          BLOCK_SIZE_TYPE bsize) {
+  // No optimize_ctx is supplied on this path, hence the NULL third member.
+  struct encode_b_args args = {cm, x, NULL};
+  foreach_transformed_block_uv(&x->e_mbd, bsize, xform_quant, &args);
+}
+
+// Encodes the luma plane: vp9_subtract_sby, then encode_block on every
+// plane-0 transform block, then vp9_recon_sby.
+void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x,
+                    BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD *const mbd = &x->e_mbd;
+  struct optimize_ctx opt_ctx;  // set up below only when x->optimize is on
+  struct encode_b_args args = {cm, x, &opt_ctx};
+
+  vp9_subtract_sby(x, bsize);
+  if (x->optimize) vp9_optimize_init(mbd, bsize, &opt_ctx);
+
+  foreach_transformed_block_in_plane(mbd, bsize, 0,
+#if !CONFIG_SB8X8
+                                     0,
+#endif
+                                     encode_block, &args);
+  vp9_recon_sby(mbd, bsize);
+}
+
+// Encodes the chroma planes: vp9_subtract_sbuv, then encode_block on each
+// uv transform block, then vp9_recon_sbuv.
+void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
+                     BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD *const mbd = &x->e_mbd;
+  struct optimize_ctx opt_ctx;  // set up below only when x->optimize is on
+  struct encode_b_args args = {cm, x, &opt_ctx};
+
+  vp9_subtract_sbuv(x, bsize);
+  if (x->optimize) vp9_optimize_init(mbd, bsize, &opt_ctx);
+
+  foreach_transformed_block_uv(mbd, bsize, encode_block, &args);
+  vp9_recon_sbuv(mbd, bsize);
+}
+
 void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
                    BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 832247940..afbe4466b 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -22,18 +22,6 @@ typedef struct {
   MV_REFERENCE_FRAME second_ref_frame;
 } MODE_DEFINITION;
 
-
-#if !CONFIG_SB8X8
-#endif
-void vp9_transform_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-
 struct optimize_ctx {
   ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
   ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
@@ -49,6 +37,14 @@ void vp9_optimize_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                        BLOCK_SIZE_TYPE bsize);
 
 void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
+                     BLOCK_SIZE_TYPE bsize);
+// Variants that apply the xform_quant callback instead of encode_block.
+void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
+                         BLOCK_SIZE_TYPE bsize);
+void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
+                          BLOCK_SIZE_TYPE bsize);
 
 void vp9_subtract_block(int rows, int cols,
                         int16_t *diff_ptr, int diff_stride,
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 738d6e6dc..300fa32e7 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -867,9 +867,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   }
 
   cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;
+#if !CONFIG_SB8X8
   cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair;
   cpi->mb.quantize_b_8x8      = vp9_regular_quantize_b_8x8;
-  cpi->mb.quantize_b_16x16    = vp9_regular_quantize_b_16x16;
+#endif
 
   vp9_init_quantizer(cpi);
 
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 2de01d09c..e8dd0e982 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -133,6 +133,7 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
            pt_scan, 1);
 }
 
+#if !CONFIG_SB8X8
 void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
                                 int y_blocks) {
   MACROBLOCKD *const xd = &mb->e_mbd;
@@ -154,131 +155,6 @@ void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
            pt_scan, 1);
 }
 
-void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
-                                  int y_blocks) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
-  const int *pt_scan = get_scan_16x16(tx_type);
-
-  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
-           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
-           256, mb->skip_block,
-           mb->plane[pb_idx.plane].zbin,
-           mb->plane[pb_idx.plane].round,
-           mb->plane[pb_idx.plane].quant,
-           mb->plane[pb_idx.plane].quant_shift,
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
-           xd->plane[pb_idx.plane].dequant,
-           mb->plane[pb_idx.plane].zbin_extra,
-           &xd->plane[pb_idx.plane].eobs[pb_idx.block],
-           pt_scan, 1);
-}
-
-void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx, int y_blocks) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
-
-  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
-           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
-           1024, mb->skip_block,
-           mb->plane[pb_idx.plane].zbin,
-           mb->plane[pb_idx.plane].round,
-           mb->plane[pb_idx.plane].quant,
-           mb->plane[pb_idx.plane].quant_shift,
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
-           xd->plane[pb_idx.plane].dequant,
-           mb->plane[pb_idx.plane].zbin_extra,
-           &xd->plane[pb_idx.plane].eobs[pb_idx.block],
-           vp9_default_zig_zag1d_32x32, 2);
-}
-
-void vp9_quantize_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bw = 1 << (b_width_log2(bsize) - 3);
-  const int bh = 1 << (b_height_log2(bsize) - 3);
-  int n;
-
-  for (n = 0; n < bw * bh; n++)
-    vp9_regular_quantize_b_32x32(x, n * 64, bw * bh * 64);
-}
-
-void vp9_quantize_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 2);
-  const int bstride = 16 << bwl;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    TX_TYPE tx_type = get_tx_type_16x16(&x->e_mbd,
-                                        4 * x_idx + y_idx * bstride);
-    x->quantize_b_16x16(x, n * 16, tx_type, 16 * bw * bh);
-  }
-}
-
-void vp9_quantize_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 1);
-  const int bstride = 4 << bwl;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd,
-                                      2 * x_idx + y_idx * bstride);
-    x->quantize_b_8x8(x, n * 4, tx_type, 4 * bw * bh);
-  }
-}
-
-void vp9_quantize_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bw = 1 << bwl;
-  const int bh = 1 << b_height_log2(bsize);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, n);
-    x->quantize_b_4x4(x, n, tx_type, bw * bh);
-  }
-}
-
-void vp9_quantize_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  assert(bsize == BLOCK_SIZE_SB64X64);
-  vp9_regular_quantize_b_32x32(x, 256, 256);
-  vp9_regular_quantize_b_32x32(x, 320, 256);
-}
-
-void vp9_quantize_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2;
-  const int bhl = b_height_log2(bsize) - 2;
-  const int uoff = 16 << (bhl + bwl);
-  int i;
-
-  for (i = uoff; i < ((uoff * 3) >> 1); i += 16)
-    x->quantize_b_16x16(x, i, DCT_DCT, uoff);
-}
-
-void vp9_quantize_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1;
-  const int bhl = b_height_log2(bsize) - 1;
-  const int uoff = 4 << (bhl + bwl);
-  int i;
-
-  for (i = uoff; i < ((uoff * 3) >> 1); i += 4)
-    x->quantize_b_8x8(x, i, DCT_DCT, uoff);
-}
-
-void vp9_quantize_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize);
-  const int bhl = b_height_log2(bsize);
-  const int uoff = 1 << (bhl + bwl);
-  int i;
-
-  for (i = uoff; i < ((uoff * 3) >> 1); i++)
-    x->quantize_b_4x4(x, i, DCT_DCT, uoff);
-}
-
 /* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
  * these two C functions if corresponding optimized routine is not available.
  * NEON optimized version implements currently the fast quantization for pair
@@ -288,6 +164,7 @@ void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *x, int b_idx1, int b_idx2,
   vp9_regular_quantize_b_4x4(x, b_idx1, DCT_DCT, y_blocks);
   vp9_regular_quantize_b_4x4(x, b_idx2, DCT_DCT, y_blocks);
 }
+#endif
 
 static void invert_quant(int16_t *quant, uint8_t *shift, int d) {
   unsigned t;
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 718a1272d..2b1eeabbe 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -31,20 +31,6 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
                                 int y_blocks);
 void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
                                 int y_blocks);
-void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
-                                  int y_blocks);
-void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx,
-                                  int y_blocks);
-
-void vp9_quantize_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-
 struct VP9_COMP;
 
 extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q);
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index cf4b1e8e8..eb0ff9edf 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -293,7 +293,7 @@ int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) {
 }
 
 static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
-                              int ib, PLANE_TYPE type,
+                              int plane, int block, PLANE_TYPE type,
                               ENTROPY_CONTEXT *A,
                               ENTROPY_CONTEXT *L,
                               TX_SIZE tx_size,
@@ -304,10 +304,9 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
   int c = 0;
   int cost = 0, pad;
   const int *scan, *nb;
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, ib);
-  const int eob = xd->plane[pb_idx.plane].eobs[pb_idx.block];
-  const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff,
-                                           pb_idx.block, 16);
+  const int eob = xd->plane[plane].eobs[block];
+  const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff,
+                                           block, 16);
   const int ref = mbmi->ref_frame != INTRA_FRAME;
   unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
       mb->token_costs[tx_size][type][ref];
@@ -334,7 +333,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
 #endif
 
   // Check for consistency of tx_size with mode info
-  assert((!type && !pb_idx.plane) || (type && pb_idx.plane));
+  assert((!type && !plane) || (type && plane));
   if (type == PLANE_TYPE_Y_WITH_DC) {
     assert(xd->mode_info_context->mbmi.txfm_size == tx_size);
   } else {
@@ -345,7 +344,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
   switch (tx_size) {
     case TX_4X4: {
       tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-          get_tx_type_4x4(xd, ib) : DCT_DCT;
+          get_tx_type_4x4(xd, block) : DCT_DCT;
       above_ec = A[0] != 0;
       left_ec = L[0] != 0;
       coef_probs = cm->fc.coef_probs_4x4;
@@ -359,7 +358,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
     case TX_8X8: {
       const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
       const int sz = 1 + b_width_log2(sb_type);
-      const int x = ib & ((1 << sz) - 1), y = ib - x;
+      const int x = block & ((1 << sz) - 1), y = block - x;
       TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
           get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
       above_ec = (A[0] + A[1]) != 0;
@@ -375,7 +374,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
     case TX_16X16: {
       const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
       const int sz = 2 + b_width_log2(sb_type);
-      const int x = ib & ((1 << sz) - 1), y = ib - x;
+      const int x = block & ((1 << sz) - 1), y = block - x;
       TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
           get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
       scan = get_scan_16x16(tx_type);
@@ -615,9 +614,10 @@ static int block_error(int16_t *coeff, int16_t *dqcoeff,
   return error > INT_MAX ? INT_MAX : (int)error;
 }
 
-static int block_error_sby(MACROBLOCK *x, int block_size, int shift) {
+static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
+  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
   return block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
-                     block_size, shift);
+                     16 << (bwl + bhl), shift);  // size callers used to pass in
 }
 
 static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
@@ -635,155 +635,54 @@ static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
   return sum > INT_MAX ? INT_MAX : (int)sum;
 }
 
-static int rdcost_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                          BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bw = 1 << bwl;
-  const int bh = 1 << b_height_log2(bsize);
-  int cost = 0, b;
+static int rdcost_plane(VP9_COMMON *const cm, MACROBLOCK *x,
+                        int plane, BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
+  const int bw = 1 << bwl, bh = 1 << bhl;  // plane extent after subsampling
   ENTROPY_CONTEXT t_above[16], t_left[16];
+  int block, cost;
 
-  vpx_memcpy(&t_above, xd->plane[0].above_context,
+  vpx_memcpy(&t_above, xd->plane[plane].above_context,  // local copy
              sizeof(ENTROPY_CONTEXT) * bw);
-  vpx_memcpy(&t_left,  xd->plane[0].left_context,
+  vpx_memcpy(&t_left,  xd->plane[plane].left_context,  // local copy
              sizeof(ENTROPY_CONTEXT) * bh);
 
-  for (b = 0; b < bw * bh; b++) {
-    const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        t_above + x_idx, t_left + y_idx,
-                        TX_4X4, bw * bh);
-  }
-
-  return cost;
-}
-
-static void super_block_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int *rate, int *distortion, int *skippable,
-                                BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  vp9_transform_sby_4x4(x, bsize);
-  vp9_quantize_sby_4x4(x, bsize);
-
-  *distortion = block_error_sby(x, 16 << (bwl + bhl), 2);
-  *rate       = rdcost_sby_4x4(cm, x, bsize);
-  *skippable  = vp9_sby_is_skippable(xd, bsize);
-}
-
-static int rdcost_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                          BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 1);
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT t_above[16], t_left[16];
-
-  vpx_memcpy(&t_above, xd->plane[0].above_context,
-             sizeof(ENTROPY_CONTEXT) * 2 * bw);
-  vpx_memcpy(&t_left,  xd->plane[0].left_context,
-             sizeof(ENTROPY_CONTEXT) * 2 * bh);
-
-  for (b = 0; b < bw * bh; b++) {
-    const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-    cost += cost_coeffs(cm, x, b * 4, PLANE_TYPE_Y_WITH_DC,
-                        t_above + x_idx * 2, t_left + y_idx * 2,
-                        TX_8X8, 4 * bw * bh);
-  }
-
-  return cost;
-}
-
-static void super_block_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int *rate, int *distortion, int *skippable,
-                                BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-  vp9_transform_sby_8x8(x, bsize);
-  vp9_quantize_sby_8x8(x, bsize);
-
-  *distortion = block_error_sby(x, 16 << (bhl + bwl), 2);
-  *rate       = rdcost_sby_8x8(cm, x, bsize);
-  *skippable  = vp9_sby_is_skippable(xd, bsize);
-}
+  cost = 0;  // summed over every transform block visited below
+  for (block = 0; block < bw * bh; block += 1 << (tx_size * 2)) {
+    int x_idx, y_idx;  // filled in by txfrm_block_to_raster_xy
 
-static int rdcost_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 2);
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT t_above[16], t_left[16];
-
-  vpx_memcpy(&t_above, xd->plane[0].above_context,
-             sizeof(ENTROPY_CONTEXT) * 4 * bw);
-  vpx_memcpy(&t_left,  xd->plane[0].left_context,
-             sizeof(ENTROPY_CONTEXT) * 4 * bh);
+    txfrm_block_to_raster_xy(xd, bsize, plane, block, tx_size * 2,
+                             &x_idx, &y_idx);
 
-  for (b = 0; b < bw * bh; b++) {
-    const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-    cost += cost_coeffs(cm, x, b * 16, PLANE_TYPE_Y_WITH_DC,
-                        t_above + x_idx * 4, t_left + y_idx * 4,
-                        TX_16X16, bw * bh * 16);
+    cost += cost_coeffs(cm, x, plane, block, xd->plane[plane].plane_type,
+                        t_above + x_idx, t_left + y_idx,
+                        tx_size, bw * bh);
   }
 
   return cost;
 }
 
-static void super_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable,
-                                  BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_16X16;
-  vp9_transform_sby_16x16(x, bsize);
-  vp9_quantize_sby_16x16(x, bsize);
-
-  *distortion = block_error_sby(x, 16 << (bwl + bhl), 2);
-  *rate       = rdcost_sby_16x16(cm, x, bsize);
-  *skippable  = vp9_sby_is_skippable(xd, bsize);
-}
-
-static int rdcost_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 3, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 3);
-  int cost = 0, b;
-  MACROBLOCKD * const xd = &x->e_mbd;
-  ENTROPY_CONTEXT t_above[16], t_left[16];
-
-  vpx_memcpy(&t_above, xd->plane[0].above_context,
-             sizeof(ENTROPY_CONTEXT) * 8 * bw);
-  vpx_memcpy(&t_left,  xd->plane[0].left_context,
-             sizeof(ENTROPY_CONTEXT) * 8 * bh);
+static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
+                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+  int cost = 0, plane;
 
-  for (b = 0; b < bw * bh; b++) {
-    const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-    cost += cost_coeffs(cm, x, b * 64, PLANE_TYPE_Y_WITH_DC,
-                        t_above + x_idx * 8, t_left + y_idx * 8,
-                        TX_32X32, bw * bh * 64);
+  for (plane = 1; plane < MAX_MB_PLANE; plane++) {  // skip plane 0 (luma)
+    cost += rdcost_plane(cm, x, plane, bsize, tx_size);
   }
-
   return cost;
 }
 
-static void super_block_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable,
-                                  BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
+                                     int *rate, int *distortion, int *skippable,
+                                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  xd->mode_info_context->mbmi.txfm_size = tx_size;  // asserted in cost_coeffs
+  vp9_xform_quant_sby(cm, x, bsize);
 
-  xd->mode_info_context->mbmi.txfm_size = TX_32X32;
-  vp9_transform_sby_32x32(x, bsize);
-  vp9_quantize_sby_32x32(x, bsize);
-
-  *distortion = block_error_sby(x, 16 << (bwl + bhl), 0);
-  *rate       = rdcost_sby_32x32(cm, x, bsize);
+  *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2);
+  *rate       = rdcost_plane(cm, x, 0, bsize, tx_size);  // plane 0 = luma
   *skippable  = vp9_sby_is_skippable(xd, bsize);
 }
 
@@ -797,13 +696,15 @@ static void super_block_yrd(VP9_COMP *cpi,
   vp9_subtract_sby(x, bs);
 
   if (bs >= BLOCK_SIZE_SB32X32)
-    super_block_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
-                          bs);
+    super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
+                             bs, TX_32X32);
   if (bs >= BLOCK_SIZE_MB16X16)
-    super_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
-                          bs);
-  super_block_yrd_8x8(cm, x,   &r[TX_8X8][0],   &d[TX_8X8],   &s[TX_8X8],   bs);
-  super_block_yrd_4x4(cm, x,   &r[TX_4X4][0],   &d[TX_4X4],   &s[TX_4X4],   bs);
+    super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
+                             bs, TX_16X16);
+  super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], bs,
+                           TX_8X8);
+  super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], bs,
+                           TX_4X4);
 
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
                            TX_32X32 - (bs < BLOCK_SIZE_SB32X32)
@@ -920,7 +821,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
     tempa = ta;
     templ = tl;
 
-    ratey = cost_coeffs(cm, x, ib,
+    ratey = cost_coeffs(cm, x, 0, ib,
                         PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4, 16);
     rate += ratey;
     distortion = vp9_block_error(coeff,
@@ -1147,7 +1048,7 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
       distortion = vp9_block_error_c(coeff,
           BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
 
-      rate_t = cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
+      rate_t = cost_coeffs(cm, x, 0, idx, PLANE_TYPE_Y_WITH_DC,
                            ta_temp, tl_temp, TX_8X8, 16);
 
       rate += rate_t;
@@ -1182,12 +1083,12 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         distortion += vp9_block_error_c(coeff,
             BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[i], 16),
             16 << do_two);
-        rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
+        rate_t += cost_coeffs(cm, x, 0, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
                               &ta_temp[i & 1], &tl_temp[i >> 1],
                               TX_4X4, 16);
         if (do_two) {
           i++;
-          rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
+          rate_t += cost_coeffs(cm, x, 0, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
                                 &ta_temp[i & 1], &tl_temp[i >> 1],
                                 TX_4X4, 16);
         }
@@ -1327,165 +1228,16 @@ static int64_t rd_pick_intra8x8mby_modes_and_txsz(VP9_COMP *cpi, MACROBLOCK *x,
 }
 #endif  // !CONFIG_SB8X8
 
-static int rd_cost_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 1);
-  int yoff = 4 * bw * bh;
-  int p, b, cost = 0;
+static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
+                                      int *rate, int *distortion,
+                                      int *skippable, BLOCK_SIZE_TYPE bsize,
+                                      TX_SIZE uv_tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  vp9_xform_quant_sbuv(cm, x, bsize);  // NOTE(review): uv_tx_size not passed
 
-  for (p = 1; p < MAX_MB_PLANE; p++) {
-    ENTROPY_CONTEXT t_above[8], t_left[8];
-
-    vpx_memcpy(t_above, xd->plane[p].above_context,
-               sizeof(ENTROPY_CONTEXT) * 2 * bw >> xd->plane[p].subsampling_x);
-    vpx_memcpy(t_left, xd->plane[p].left_context,
-               sizeof(ENTROPY_CONTEXT) * 2 * bh >> xd->plane[p].subsampling_y);
-    for (b = 0; b < bw * bh; b++) {
-      const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-      cost += cost_coeffs(cm, x, yoff + b, PLANE_TYPE_UV,
-                          t_above + x_idx, t_left + y_idx,
-                          TX_4X4, bw * bh * 4);
-    }
-    yoff = (yoff * 5) >> 2;  // u -> v
-  }
-
-  return cost;
-}
-
-static void super_block_uvrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                                 int *rate, int *distortion, int *skip,
-                                 BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  vp9_transform_sbuv_4x4(x, bsize);
-  vp9_quantize_sbuv_4x4(x, bsize);
-
-  *rate       = rd_cost_sbuv_4x4(cm, x, bsize);
-  *distortion = block_error_sbuv(x, bsize, 2);
-  *skip       = vp9_sbuv_is_skippable(xd, bsize);
-}
-
-static int rd_cost_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 2);
-  int yoff = 16 * bw * bh;
-  int p, b, cost = 0;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  for (p = 1; p < MAX_MB_PLANE; p++) {
-    ENTROPY_CONTEXT t_above[8], t_left[8];
-
-    vpx_memcpy(t_above, xd->plane[p].above_context,
-               sizeof(ENTROPY_CONTEXT) * 4 * bw >> xd->plane[p].subsampling_x);
-    vpx_memcpy(t_left, xd->plane[p].left_context,
-               sizeof(ENTROPY_CONTEXT) * 4 * bh >> xd->plane[p].subsampling_y);
-    for (b = 0; b < bw * bh; b++) {
-      const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-      cost += cost_coeffs(cm, x, yoff + b * 4, PLANE_TYPE_UV,
-                          t_above + x_idx * 2, t_left + y_idx * 2,
-                          TX_8X8, bw * bh * 16);
-    }
-    yoff = (yoff * 5) >> 2;  // u -> v
-  }
-
-  return cost;
-}
-
-static void super_block_uvrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                                 int *rate, int *distortion, int *skip,
-                                 BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  vp9_transform_sbuv_8x8(x, bsize);
-  vp9_quantize_sbuv_8x8(x, bsize);
-
-  *rate       = rd_cost_sbuv_8x8(cm, x, bsize);
-  *distortion = block_error_sbuv(x, bsize, 2);
-  *skip       = vp9_sbuv_is_skippable(xd, bsize);
-}
-
-static int rd_cost_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                              BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 3, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 3);
-  int yoff = 64 * bw * bh;
-  int p, b, cost = 0;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  for (p = 1; p < MAX_MB_PLANE; p++) {
-    ENTROPY_CONTEXT t_above[8], t_left[8];
-
-    vpx_memcpy(t_above, xd->plane[p].above_context,
-               sizeof(ENTROPY_CONTEXT) * 8 * bw >> xd->plane[p].subsampling_x);
-    vpx_memcpy(t_left, xd->plane[p].left_context,
-               sizeof(ENTROPY_CONTEXT) * 8 * bh >> xd->plane[p].subsampling_y);
-    for (b = 0; b < bw * bh; b++) {
-      const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-      cost += cost_coeffs(cm, x, yoff + b * 16, PLANE_TYPE_UV,
-                          t_above + x_idx * 4, t_left + y_idx * 4,
-                          TX_16X16, bw * bh * 64);
-    }
-    yoff = (yoff * 5) >> 2;  // u -> v
-  }
-
-  return cost;
-}
-
-static void super_block_uvrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                                   int *rate, int *distortion, int *skip,
-                                   BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  vp9_transform_sbuv_16x16(x, bsize);
-  vp9_quantize_sbuv_16x16(x, bsize);
-
-  *rate       = rd_cost_sbuv_16x16(cm, x, bsize);
-  *distortion = block_error_sbuv(x, bsize, 2);
-  *skip       = vp9_sbuv_is_skippable(xd, bsize);
-}
-
-static int rd_cost_sbuv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                              BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 4, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 4);
-  int yoff = 256 * bh * bw;
-  int p, b, cost = 0;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  for (p = 1; p < MAX_MB_PLANE; p++) {
-    ENTROPY_CONTEXT t_above[8], t_left[8];
-
-    vpx_memcpy(t_above, xd->plane[p].above_context,
-               sizeof(ENTROPY_CONTEXT) * 16 * bw >> xd->plane[p].subsampling_x);
-    vpx_memcpy(t_left, xd->plane[p].left_context,
-               sizeof(ENTROPY_CONTEXT) * 16 * bh >> xd->plane[p].subsampling_y);
-    for (b = 0; b < bw * bh; b++) {
-      const int x_idx = b * (bw - 1), y_idx = b >> bwl;
-      cost += cost_coeffs(cm, x, yoff + b * 64, PLANE_TYPE_UV,
-                          t_above + x_idx * 8, t_left + y_idx * 8,
-                          TX_32X32, 256 * bh * bw);
-    }
-    yoff = (yoff * 5) >> 2;  // u -> v
-  }
-
-  return cost;
-}
-#undef UVCTX
-
-static void super_block_uvrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                                   int *rate, int *distortion, int *skip,
-                                   BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  vp9_transform_sbuv_32x32(x, bsize);
-  vp9_quantize_sbuv_32x32(x, bsize);
-
-  *rate       = rd_cost_sbuv_32x32(cm, x, bsize);
-  *distortion = block_error_sbuv(x, bsize, 0);
-  *skip       = vp9_sbuv_is_skippable(xd, bsize);
+  *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2);
+  *rate       = rdcost_uv(cm, x, bsize, uv_tx_size);  // planes >= 1
+  *skippable  = vp9_sbuv_is_skippable(xd, bsize);
 }
 
 static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
@@ -1497,13 +1249,17 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
   vp9_subtract_sbuv(x, bsize);
 
   if (mbmi->txfm_size >= TX_32X32 && bsize >= BLOCK_SIZE_SB64X64) {
-    super_block_uvrd_32x32(cm, x, rate, distortion, skippable, bsize);
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_32X32);
   } else if (mbmi->txfm_size >= TX_16X16 && bsize >= BLOCK_SIZE_SB32X32) {
-    super_block_uvrd_16x16(cm, x, rate, distortion, skippable, bsize);
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_16X16);
   } else if (mbmi->txfm_size >= TX_8X8 && bsize >= BLOCK_SIZE_MB16X16) {
-    super_block_uvrd_8x8(cm, x, rate, distortion, skippable, bsize);
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_8X8);
   } else {
-    super_block_uvrd_4x4(cm, x, rate, distortion, skippable, bsize);
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_4X4);
   }
 }
 
@@ -1740,7 +1496,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
                                        BLOCK_OFFSET(xd->plane[0].dqcoeff,
                                                     i, 16), 16);
       *distortion += thisdistortion;
-      *labelyrate += cost_coeffs(cm, x, i, PLANE_TYPE_Y_WITH_DC,
+      *labelyrate += cost_coeffs(cm, x, 0, i, PLANE_TYPE_Y_WITH_DC,
                                  ta + (i & 1),
                                  tl + (i >> 1), TX_4X4, 16);
     }
@@ -2250,7 +2006,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
       thisdistortion = vp9_block_error(coeff,
           BLOCK_OFFSET(xd->plane[0].dqcoeff, i, 16), 16);
       *distortion += thisdistortion;
-      *labelyrate += cost_coeffs(cm, x, i, PLANE_TYPE_Y_WITH_DC,
+      *labelyrate += cost_coeffs(cm, x, 0, i, PLANE_TYPE_Y_WITH_DC,
                                  ta + (i & 3),
                                  tl + (i >> 2), TX_4X4, 16);
     }
@@ -2333,7 +2089,7 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
               BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
           otherdist += thisdistortion;
           xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-          othercost += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
+          othercost += cost_coeffs(cm, x, 0, idx, PLANE_TYPE_Y_WITH_DC,
                                    tac + (i & 1) * 2,
                                    tlc + (i & 2),
                                    TX_8X8, 16);
@@ -2352,12 +2108,12 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
               BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[j], 16), 32);
           *distortion += thisdistortion;
           *labelyrate +=
-              cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
+              cost_coeffs(cm, x, 0, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
                           ta + (i & 1) * 2,
                           tl + (i & 2) + ((j & 2) >> 1),
                           TX_4X4, 16);
           *labelyrate +=
-              cost_coeffs(cm, x, ib + iblock[j] + 1,
+              cost_coeffs(cm, x, 0, ib + iblock[j] + 1,
                           PLANE_TYPE_Y_WITH_DC,
                           ta + (i & 1) * 2 + 1,
                           tl + (i & 2) + ((j & 2) >> 1),
@@ -2379,12 +2135,12 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
             otherdist += thisdistortion;
             xd->mode_info_context->mbmi.txfm_size = TX_4X4;
             othercost +=
-                cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
+                cost_coeffs(cm, x, 0, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
                             tac + (i & 1) * 2,
                             tlc + (i & 2) + ((j & 2) >> 1),
                             TX_4X4, 16);
             othercost +=
-                cost_coeffs(cm, x, ib + iblock[j] + 1,
+                cost_coeffs(cm, x, 0, ib + iblock[j] + 1,
                             PLANE_TYPE_Y_WITH_DC,
                             tac + (i & 1) * 2 + 1,
                             tlc + (i & 2) + ((j & 2) >> 1),
@@ -2397,7 +2153,7 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
         thisdistortion = vp9_block_error_c(coeff,
             BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
         *distortion += thisdistortion;
-        *labelyrate += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
+        *labelyrate += cost_coeffs(cm, x, 0, idx, PLANE_TYPE_Y_WITH_DC,
                                    ta + (i & 1) * 2,
                                    tl + (i & 2),
                                    TX_8X8, 16);
@@ -4220,8 +3976,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
         vp9_subtract_sbuv(x, BLOCK_SIZE_MB16X16);
 
-        super_block_uvrd_4x4(cm, x, &rate_uv, &distortion_uv,
-                             &uv_skippable, BLOCK_SIZE_MB16X16);
+        super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
+                                  &uv_skippable, BLOCK_SIZE_MB16X16, TX_4X4);
         rate2 += rate_uv;
         distortion2 += distortion_uv;
         skippable = skippable && uv_skippable;
@@ -5240,8 +4996,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
                                       bsize);
       vp9_subtract_sbuv(x, bsize);
-      super_block_uvrd_4x4(cm, x, &rate_uv, &distortion_uv,
-                           &uv_skippable, bsize);
+      super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
+                                &uv_skippable, bsize, TX_4X4);
       rate2 += rate_uv;
       distortion2 += distortion_uv;
       skippable = skippable && uv_skippable;