9 files changed, 156 insertions, 203 deletions
diff --git a/vp9/encoder/vp9_asm_enc_offsets.c b/vp9/encoder/vp9_asm_enc_offsets.c
index e174a894a..1a770dcf7 100644
--- a/vp9/encoder/vp9_asm_enc_offsets.c
+++ b/vp9/encoder/vp9_asm_enc_offsets.c
@@ -29,9 +29,7 @@ DEFINE(vp9_block_zbin_extra,                    offsetof(BLOCK, zbin_extra));
 DEFINE(vp9_block_zrun_zbin_boost,               offsetof(BLOCK, zrun_zbin_boost));
 DEFINE(vp9_block_quant_shift,                   offsetof(BLOCK, quant_shift));
 
-DEFINE(vp9_blockd_qcoeff,                       offsetof(BLOCKD, qcoeff));
 DEFINE(vp9_blockd_dequant,                      offsetof(BLOCKD, dequant));
-DEFINE(vp9_blockd_dqcoeff,                      offsetof(BLOCKD, dqcoeff));
 
 END
 
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index eddacb872..883038b7e 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -16,6 +16,8 @@
 #include "vp9/common/vp9_invtrans.h"
 #include "vp9/encoder/vp9_encodeintra.h"
 
+static void encode_intra4x4block(MACROBLOCK *x, int ib);
+
 int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   (void) cpi;
@@ -31,18 +33,21 @@ int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
 
     for (i = 0; i < 16; i++) {
       x->e_mbd.block[i].bmi.as_mode.first = B_DC_PRED;
-      vp9_encode_intra4x4block(x, i);
+      encode_intra4x4block(x, i);
     }
   }
 
   return vp9_get_mb_ss(x->src_diff);
 }
 
-void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
+static void encode_intra4x4block(MACROBLOCK *x, int ib) {
   BLOCKD *b = &x->e_mbd.block[ib];
   BLOCK *be = &x->block[ib];
+  MACROBLOCKD * const xd = &x->e_mbd;
   TX_TYPE tx_type;
 
+  assert(ib < 16);
+
 #if CONFIG_NEWBINTRAMODES
   b->bmi.as_mode.context = vp9_find_bpred_context(&x->e_mbd, b);
 #endif
@@ -54,12 +59,14 @@ void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
   if (tx_type != DCT_DCT) {
     vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
     vp9_ht_quantize_b_4x4(x, ib, tx_type);
-    vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type);
+    vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
+                     b->diff, 16, tx_type);
   } else {
     x->fwd_txm4x4(be->src_diff, be->coeff, 32);
     x->quantize_b_4x4(x, ib);
     vp9_inverse_transform_b_4x4(&x->e_mbd, x->e_mbd.eobs[ib],
-                                b->dqcoeff, b->diff, 32);
+                                BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
+                                b->diff, 32);
   }
 
   vp9_recon_b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
@@ -69,7 +76,7 @@ void vp9_encode_intra4x4mby(MACROBLOCK *mb) {
   int i;
 
   for (i = 0; i < 16; i++)
-    vp9_encode_intra4x4block(mb, i);
+    encode_intra4x4block(mb, i);
 }
 
 void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x) {
@@ -151,41 +158,47 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
 
   if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
     int idx = (ib & 0x02) ? (ib + 2) : ib;
+    int16_t * const dqcoeff = BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16);
 
+    assert(idx < 16);
     tx_type = get_tx_type_8x8(xd, ib);
     if (tx_type != DCT_DCT) {
       vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);
       x->quantize_b_8x8(x, idx, tx_type);
-      vp9_short_iht8x8(xd->block[idx].dqcoeff, xd->block[ib].diff,
+      vp9_short_iht8x8(dqcoeff, xd->block[ib].diff,
                             16, tx_type);
     } else {
       x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);
       x->quantize_b_8x8(x, idx, DCT_DCT);
-      vp9_short_idct8x8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
+      vp9_short_idct8x8(dqcoeff, xd->block[ib].diff, 32);
     }
   } else {
     for (i = 0; i < 4; i++) {
+      int idx = ib + iblock[i];
+      int16_t * const dqcoeff = BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16);
+
+      assert(idx < 16);
       b = &xd->block[ib + iblock[i]];
       be = &x->block[ib + iblock[i]];
       tx_type = get_tx_type_4x4(xd, ib + iblock[i]);
       if (tx_type != DCT_DCT) {
         vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
         vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type);
-        vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type);
+        vp9_short_iht4x4(dqcoeff, b->diff, 16, tx_type);
       } else if (!(i & 1) &&
                  get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {
         x->fwd_txm8x4(be->src_diff, be->coeff, 32);
         x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1);
         vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i]],
-                                    b->dqcoeff, b->diff, 32);
+                                    dqcoeff, b->diff, 32);
         vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i] + 1],
-                                    (b + 1)->dqcoeff, (b + 1)->diff, 32);
+                                    dqcoeff + 16, (b + 1)->diff, 32);
         i++;
       } else {
         x->fwd_txm4x4(be->src_diff, be->coeff, 32);
         x->quantize_b_4x4(x, ib + iblock[i]);
         vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i]],
-                                    b->dqcoeff, b->diff, 32);
+                                    dqcoeff, b->diff, 32);
       }
     }
   }
@@ -206,9 +219,12 @@ void vp9_encode_intra8x8mby(MACROBLOCK *x) {
 }
 
 static void encode_intra_uv4x4(MACROBLOCK *x, int ib, int mode) {
+  MACROBLOCKD * const xd = &x->e_mbd;
   BLOCKD *b = &x->e_mbd.block[ib];
   BLOCK *be = &x->block[ib];
+  int16_t * const dqcoeff = MB_SUBBLOCK_FIELD(xd, dqcoeff, ib);
 
+  assert(ib >= 16 && ib < 24);
   vp9_intra_uv4x4_predict(&x->e_mbd, b, mode, b->predictor);
 
   vp9_subtract_b(be, b, 8);
@@ -216,7 +232,7 @@ static void encode_intra_uv4x4(MACROBLOCK *x, int ib, int mode) {
   x->fwd_txm4x4(be->src_diff, be->coeff, 16);
   x->quantize_b_4x4(x, ib);
   vp9_inverse_transform_b_4x4(&x->e_mbd, x->e_mbd.eobs[ib],
-                              b->dqcoeff, b->diff, 16);
+                              dqcoeff, b->diff, 16);
 
   vp9_recon_uv_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
                    b->dst_stride);
diff --git a/vp9/encoder/vp9_encodeintra.h b/vp9/encoder/vp9_encodeintra.h
index 0b19b5652..6576c94d2 100644
--- a/vp9/encoder/vp9_encodeintra.h
+++ b/vp9/encoder/vp9_encodeintra.h
@@ -17,7 +17,6 @@ int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
 void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_encode_intra4x4mby(MACROBLOCK *mb);
-void vp9_encode_intra4x4block(MACROBLOCK *x, int ib);
 void vp9_encode_intra8x8mby(MACROBLOCK *x);
 void vp9_encode_intra8x8mbuv(MACROBLOCK *x);
 void vp9_encode_intra8x8(MACROBLOCK *x, int ib);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 27015773f..a30268886 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -549,9 +549,10 @@ static void optimize_b(VP9_COMMON *const cm,
   MACROBLOCKD *const xd = &mb->e_mbd;
   vp9_token_state tokens[1025][2];
   unsigned best_index[1025][2];
+  const struct plane_block_idx pb_idx = plane_block_idx(xd, ib);
   const int16_t *coeff_ptr = mb->coeff + ib * 16;
-  int16_t *qcoeff_ptr = xd->qcoeff + ib * 16;
-  int16_t *dqcoeff_ptr = xd->dqcoeff + ib * 16;
+  int16_t *qcoeff_ptr;
+  int16_t *dqcoeff_ptr;
   int eob = xd->eobs[ib], final_eob, sz = 0;
   const int i0 = 0;
   int rc, x, next, i;
@@ -582,6 +583,8 @@ static void optimize_b(VP9_COMMON *const cm,
   nzc0 = nzc1 = nzc;
 #endif
 
+  dqcoeff_ptr = BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16);
+  qcoeff_ptr = BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16);
   switch (tx_size) {
     default:
     case TX_4X4: {
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 881fce50f..826bee4c3 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -39,8 +39,9 @@ void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
   int zbin;
   int x, y, z, sz;
   int16_t *coeff_ptr       = mb->coeff + b_idx * 16;
-  int16_t *qcoeff_ptr      = xd->qcoeff + b_idx * 16;
-  int16_t *dqcoeff_ptr     = xd->dqcoeff + b_idx * 16;
+  // ht is luma-only, so qcoeff/dqcoeff always come from plane 0
+  int16_t *qcoeff_ptr      = BLOCK_OFFSET(xd->plane[0].qcoeff, b_idx, 16);
+  int16_t *dqcoeff_ptr     = BLOCK_OFFSET(xd->plane[0].dqcoeff, b_idx, 16);
   int16_t *zbin_boost_ptr  = b->zrun_zbin_boost;
   int16_t *zbin_ptr        = b->zbin;
   int16_t *round_ptr       = b->round;
@@ -110,14 +111,17 @@ void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
 void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   const int c_idx = plane_idx(xd, b_idx);
+  const struct plane_block_idx pb_idx = plane_block_idx(xd, b_idx);
   BLOCK *const b = &mb->block[c_idx];
   BLOCKD *const d = &xd->block[c_idx];
   int i, rc, eob;
   int zbin;
   int x, y, z, sz;
   int16_t *coeff_ptr       = mb->coeff + b_idx * 16;
-  int16_t *qcoeff_ptr      = xd->qcoeff + b_idx * 16;
-  int16_t *dqcoeff_ptr     = xd->dqcoeff + b_idx * 16;
+  int16_t *qcoeff_ptr      = BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff,
+                                          pb_idx.block, 16);
+  int16_t *dqcoeff_ptr     = BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff,
+                                          pb_idx.block, 16);
   int16_t *zbin_boost_ptr  = b->zrun_zbin_boost;
   int16_t *zbin_ptr        = b->zbin;
   int16_t *round_ptr       = b->round;
@@ -186,9 +190,13 @@ void vp9_quantize_mby_4x4(MACROBLOCK *x) {
 
 void vp9_quantize_mbuv_4x4(MACROBLOCK *x) {
   int i;
+  const MACROBLOCKD * const xd = &x->e_mbd;
+  const BLOCK_SIZE_TYPE real_sb_type = xd->mode_info_context->mbmi.sb_type;
+  xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_MB16X16;  // temp override
 
   for (i = 16; i < 24; i++)
     x->quantize_b_4x4(x, i);
+  xd->mode_info_context->mbmi.sb_type = real_sb_type;  // restore saved type
 }
 
 void vp9_quantize_mb_4x4(MACROBLOCK *x) {
@@ -198,9 +206,12 @@ void vp9_quantize_mb_4x4(MACROBLOCK *x) {
 
 void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
   MACROBLOCKD *const xd = &mb->e_mbd;
-  int16_t *qcoeff_ptr = xd->qcoeff + 16 * b_idx;
-  int16_t *dqcoeff_ptr = xd->dqcoeff + 16 * b_idx;
+  const struct plane_block_idx pb_idx = plane_block_idx(xd, b_idx);
   const int c_idx = plane_idx(xd, b_idx);
+  int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff,
+                                     pb_idx.block, 16);
+  int16_t *dqcoeff_ptr = BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff,
+                                      pb_idx.block, 16);
   BLOCK *const b = &mb->block[c_idx];
   BLOCKD *const d = &xd->block[c_idx];
   const int *pt_scan;
@@ -323,6 +334,9 @@ void vp9_quantize_mby_8x8(MACROBLOCK *x) {
 
 void vp9_quantize_mbuv_8x8(MACROBLOCK *x) {
   int i;
+  const MACROBLOCKD * const xd = &x->e_mbd;
+  const BLOCK_SIZE_TYPE real_sb_type = xd->mode_info_context->mbmi.sb_type;
+  xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_MB16X16;
 
 #if CONFIG_CODE_NONZEROCOUNT
   for (i = 16; i < 24; i ++) {
@@ -331,6 +345,7 @@ void vp9_quantize_mbuv_8x8(MACROBLOCK *x) {
 #endif
   for (i = 16; i < 24; i += 4)
     x->quantize_b_8x8(x, i, DCT_DCT);
+  xd->mode_info_context->mbmi.sb_type = real_sb_type;
 }
 
 void vp9_quantize_mb_8x8(MACROBLOCK *x) {
@@ -418,6 +433,7 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
 void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   const int c_idx = plane_idx(xd, b_idx);
+  const struct plane_block_idx pb_idx = plane_block_idx(xd, b_idx);
   BLOCK *const b = &mb->block[c_idx];
   BLOCKD *const d = &xd->block[c_idx];
   const int *pt_scan;
@@ -438,8 +454,8 @@ void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
            mb->coeff + 16 * b_idx,
            256, b->skip_block,
            b->zbin, b->round, b->quant, b->quant_shift,
-           xd->qcoeff + 16 * b_idx,
-           xd->dqcoeff + 16 * b_idx,
+           BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
+           BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
            d->dequant,
            b->zbin_extra,
            &xd->eobs[b_idx],
@@ -452,6 +468,7 @@ void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
 void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   const int c_idx = plane_idx(xd, b_idx);
+  const struct plane_block_idx pb_idx = plane_block_idx(xd, b_idx);
   BLOCK *const b = &mb->block[c_idx];
   BLOCKD *const d = &xd->block[c_idx];
 
@@ -460,8 +477,8 @@ void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx) {
            1024, b->skip_block,
            b->zbin,
            b->round, b->quant, b->quant_shift,
-           xd->qcoeff + b_idx * 16,
-           xd->dqcoeff + b_idx * 16,
+           BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
+           BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
            d->dequant,
            b->zbin_extra,
            &xd->eobs[b_idx],
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 34adc9915..82c5b5bcd 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -348,35 +348,36 @@ int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) {
 }
 
 int vp9_mbblock_error_c(MACROBLOCK *mb) {
+  MACROBLOCKD * const xd = &mb->e_mbd;
   BLOCK  *be;
-  BLOCKD *bd;
-  int i, j;
-  int berror, error = 0;
+  int i;
+  int error = 0;  // running total over blocks 0..15
 
   for (i = 0; i < 16; i++) {
     be = &mb->block[i];
-    bd = &mb->e_mbd.block[i];
-    berror = 0;
-    for (j = 0; j < 16; j++) {
-      int this_diff = be->coeff[j] - bd->dqcoeff[j];
-      berror += this_diff * this_diff;
-    }
-    error += berror;
+    error += vp9_block_error(be->coeff,  // block i vs plane[0].dqcoeff
+                             BLOCK_OFFSET(xd->plane[0].dqcoeff, i, 16), 16);
   }
   return error;
 }
 
 int vp9_mbuverror_c(MACROBLOCK *mb) {
+  MACROBLOCKD * const xd = &mb->e_mbd;
   BLOCK  *be;
-  BLOCKD *bd;
 
   int i, error = 0;
 
-  for (i = 16; i < 24; i++) {
+  for (i = 16; i < 20; i++) {  // blocks 16..19 use plane[1].dqcoeff
     be = &mb->block[i];
-    bd = &mb->e_mbd.block[i];
-
-    error += vp9_block_error_c(be->coeff, bd->dqcoeff, 16);
+    error += vp9_block_error(be->coeff,
+                             BLOCK_OFFSET(xd->plane[1].dqcoeff, i - 16, 16),
+                             16);
+  }
+  for (i = 20; i < 24; i++) {  // blocks 20..23 use plane[2].dqcoeff
+    be = &mb->block[i];
+    error += vp9_block_error(be->coeff,
+                             BLOCK_OFFSET(xd->plane[2].dqcoeff, i - 20, 16),
+                             16);
   }
 
   return error;
 }
   int c = 0;
   int cost = 0, pad;
   const int *scan, *nb;
-  const int16_t *qcoeff_ptr = xd->qcoeff + ib * 16;
+  const struct plane_block_idx pb_idx = plane_block_idx(xd, ib);
+  const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff,
+                                           pb_idx.block, 16);
   const int ref = mbmi->ref_frame != INTRA_FRAME;
   unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
       mb->token_costs[tx_size][type][ref];
@@ -858,6 +861,26 @@ static int vp9_sb_block_error_c(int16_t *coeff, int16_t *dqcoeff,
   return error > INT_MAX ? INT_MAX : (int)error;
 }
 
+static int vp9_sb_uv_block_error_c(int16_t *coeff,
+                                   int16_t *dqcoeff0, int16_t *dqcoeff1,
+                                   int block_size, int shift) {
+  int i;
+  int64_t error = 0;  // 64-bit sum of squared differences
+
+  for (i = 0; i < block_size / 2; i++) {  // first half of coeff vs dqcoeff0
+    unsigned int this_diff = coeff[i] - dqcoeff0[i];
+    error += this_diff * this_diff;  // product is formed in unsigned int
+  }
+  coeff += block_size / 2;  // advance to the second half of coeff
+  for (i = 0; i < block_size / 2; i++) {  // second half of coeff vs dqcoeff1
+    unsigned int this_diff = coeff[i] - dqcoeff1[i];
+    error += this_diff * this_diff;  // product is formed in unsigned int
+  }
+  error >>= shift;  // scale the total down by 2^shift
+
+  return error > INT_MAX ? INT_MAX : (int)error;  // saturate at INT_MAX
+}
+
 static int rdcost_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
   int cost = 0, b;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -884,7 +907,7 @@ static void super_block_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
   vp9_transform_sby_4x4(x);
   vp9_quantize_sby_4x4(x);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff, 1024, 2);
   *rate       = rdcost_sby_4x4(cm, x);
   *skippable  = vp9_sby_is_skippable_4x4(xd);
 }
@@ -915,7 +938,7 @@ static void super_block_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
   vp9_transform_sby_8x8(x);
   vp9_quantize_sby_8x8(x);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff, 1024, 2);
   *rate       = rdcost_sby_8x8(cm, x);
   *skippable  = vp9_sby_is_skippable_8x8(xd);
 }
@@ -946,7 +969,7 @@ static void super_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
   vp9_transform_sby_16x16(x);
   vp9_quantize_sby_16x16(x);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff, 1024, 2);
   *rate       = rdcost_sby_16x16(cm, x);
   *skippable  = vp9_sby_is_skippable_16x16(xd);
 }
@@ -971,7 +994,7 @@ static void super_block_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
   vp9_transform_sby_32x32(x);
   vp9_quantize_sby_32x32(x);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 0);
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff, 1024, 0);
   *rate       = rdcost_sby_32x32(cm, x);
   *skippable  = vp9_sby_is_skippable_32x32(xd);
 }
@@ -1022,7 +1045,7 @@ static void super_block64_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
   vp9_transform_sb64y_4x4(x);
   vp9_quantize_sb64y_4x4(x);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff, 4096, 2);
   *rate       = rdcost_sb64y_4x4(cm, x);
   *skippable  = vp9_sb64y_is_skippable_4x4(xd);
 }
@@ -1053,7 +1076,7 @@ static void super_block64_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
   vp9_transform_sb64y_8x8(x);
   vp9_quantize_sb64y_8x8(x);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff, 4096, 2);
   *rate       = rdcost_sb64y_8x8(cm, x);
   *skippable  = vp9_sb64y_is_skippable_8x8(xd);
 }
@@ -1085,7 +1108,7 @@ static void super_block64_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
   vp9_transform_sb64y_16x16(x);
   vp9_quantize_sb64y_16x16(x);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff, 4096, 2);
   *rate       = rdcost_sb64y_16x16(cm, x);
   *skippable  = vp9_sb64y_is_skippable_16x16(xd);
 }
@@ -1117,7 +1140,7 @@ static void super_block64_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
   vp9_transform_sb64y_32x32(x);
   vp9_quantize_sb64y_32x32(x);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 0);
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff, 4096, 0);
   *rate       = rdcost_sb64y_32x32(cm, x);
   *skippable  = vp9_sb64y_is_skippable_32x32(xd);
 }
@@ -1163,8 +1186,8 @@ static void copy_predictor_8x8(uint8_t *dst, const uint8_t *predictor) {
   d[29] = p[29];
 }
 
-static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
-                                     BLOCKD *b, B_PREDICTION_MODE *best_mode,
+static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
+                                     B_PREDICTION_MODE *best_mode,
                                      int *bmode_costs,
                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                                      int *bestrate, int *bestratey,
@@ -1175,6 +1198,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
   int rate = 0;
   int distortion;
   VP9_COMMON *const cm = &cpi->common;
+  BLOCK *be = x->block + ib;
+  BLOCKD *b = xd->block + ib;
 
   ENTROPY_CONTEXT ta = *a, tempa = *a;
   ENTROPY_CONTEXT tl = *l, templ = *l;
@@ -1188,6 +1213,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
   DECLARE_ALIGNED_ARRAY(16, uint8_t, best_predictor, 16 * 4);
   DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16);
 
+  assert(ib < 16);
 #if CONFIG_NEWBINTRAMODES
   b->bmi.as_mode.context = vp9_find_bpred_context(xd, b);
 #endif
@@ -1233,7 +1259,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
     ratey = cost_coeffs(cm, x, b - xd->block,
                         PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
     rate += ratey;
-    distortion = vp9_block_error(be->coeff, b->dqcoeff, 16) >> 2;
+    distortion = vp9_block_error(be->coeff,
+                                 BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
+                                 16) >> 2;
 
     this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
@@ -1247,7 +1275,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
       *a = tempa;
       *l = templ;
       copy_predictor(best_predictor, b->predictor);
-      vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
+      vpx_memcpy(best_dqcoeff, BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16), 32);
     }
   }
   b->bmi.as_mode.first = (B_PREDICTION_MODE)(*best_mode);
@@ -1304,7 +1332,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
 #endif
 
     total_rd += rd_pick_intra4x4block(
-                  cpi, mb, mb->block + i, xd->block + i, &best_mode,
+                  cpi, mb, i, &best_mode,
                   bmode_costs, ta + vp9_block2above[TX_4X4][i],
                   tl + vp9_block2left[TX_4X4][i], &r, &ry, &d);
 
@@ -1504,6 +1532,7 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   // note the input and output index mapping
   int idx = (ib & 0x02) ? (ib + 2) : ib;
 
+  assert(ib < 16);
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     int64_t this_rd;
     int rate_t = 0;
@@ -1526,7 +1555,7 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
 
       // compute quantization mse of 8x8 block
       distortion = vp9_block_error_c((x->block + idx)->coeff,
-                                     (xd->block + idx)->dqcoeff, 64);
+          BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
 
       vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES));
       vpx_memcpy(&tl, l, sizeof(ENTROPY_CONTEXT_PLANES));
@@ -1569,7 +1598,9 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
           x->fwd_txm4x4(be->src_diff, be->coeff, 32);
           x->quantize_b_4x4(x, ib + iblock[i]);
         }
-        distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16 << do_two);
+        distortion += vp9_block_error_c(be->coeff,
+            BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[i], 16),
+            16 << do_two);
         rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
                               i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
                               TX_4X4);
@@ -1598,8 +1629,10 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
       best_rd = this_rd;
       *best_mode = mode;
       copy_predictor_8x8(best_predictor, b->predictor);
-      vpx_memcpy(best_dqcoeff, b->dqcoeff, 64);
-      vpx_memcpy(best_dqcoeff + 32, b->dqcoeff + 64, 64);
+      vpx_memcpy(best_dqcoeff,
+                 BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16), 64);
+      vpx_memcpy(best_dqcoeff + 32,
+                 BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16) + 64, 64);
     }
   }
   b->bmi.as_mode.first = (*best_mode);
@@ -1742,6 +1775,8 @@ static int rd_cost_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
   MACROBLOCKD *xd = &mb->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta, *tl;
+  const BLOCK_SIZE_TYPE real_sb_type = xd->mode_info_context->mbmi.sb_type;
+  xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_MB16X16;
 
   if (backup) {
     vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
@@ -1760,6 +1795,7 @@ static int rd_cost_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
                         tl + vp9_block2left[TX_4X4][b],
                         TX_4X4);
 
+  xd->mode_info_context->mbmi.sb_type = real_sb_type;
   return cost;
 }
 
@@ -1783,6 +1819,8 @@ static int rd_cost_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
   MACROBLOCKD *xd = &mb->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta, *tl;
+  const BLOCK_SIZE_TYPE real_sb_type = xd->mode_info_context->mbmi.sb_type;
+  xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_MB16X16;
 
   if (backup) {
     vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
@@ -1800,6 +1838,7 @@ static int rd_cost_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
                         ta + vp9_block2above[TX_8X8][b],
                         tl + vp9_block2left[TX_8X8][b], TX_8X8);
 
+  xd->mode_info_context->mbmi.sb_type = real_sb_type;
   return cost;
 }
 
@@ -1851,8 +1890,9 @@ static void rd_inter32x32_uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
   vp9_quantize_sbuv_16x16(x);
 
   *rate       = rd_cost_sbuv_16x16(cm, x, backup);
-  *distortion = vp9_sb_block_error_c(x->coeff + 1024,
-                                     xd->dqcoeff + 1024, 512, 2);
+  *distortion = vp9_sb_uv_block_error_c(x->coeff + 1024,
+                                        xd->plane[1].dqcoeff,
+                                        xd->plane[2].dqcoeff, 512, 2);
   *skip       = vp9_sbuv_is_skippable_16x16(xd);
 }
 
@@ -2127,8 +2167,9 @@ static void rd_inter64x64_uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
   vp9_quantize_sb64uv_32x32(x);
 
   *rate       = rd_cost_sb64uv_32x32(cm, x, backup);
-  *distortion = vp9_sb_block_error_c(x->coeff + 4096,
-                                     xd->dqcoeff + 4096, 2048, 0);
+  *distortion = vp9_sb_uv_block_error_c(x->coeff + 4096,
+                                        xd->plane[1].dqcoeff,
+                                        xd->plane[2].dqcoeff, 2048, 0);
   *skip       = vp9_sb64uv_is_skippable_32x32(xd);
 }
 
@@ -2466,7 +2507,8 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
       vp9_subtract_b(be, bd, 16);
       x->fwd_txm4x4(be->src_diff, be->coeff, 32);
       x->quantize_b_4x4(x, i);
-      thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16);
+      thisdistortion = vp9_block_error(be->coeff,
+          BLOCK_OFFSET(xd->plane[0].dqcoeff, i, 16), 16);
       *distortion += thisdistortion;
       *labelyrate += cost_coeffs(cm, x, i, PLANE_TYPE_Y_WITH_DC,
                                  ta + vp9_block2above[TX_4X4][i],
@@ -2508,11 +2550,12 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
       const int use_second_ref =
           xd->mode_info_context->mbmi.second_ref_frame > 0;
       int which_mv;
-      int idx = (ib & 8) + ((ib & 2) << 1);
-      BLOCKD *bd = &xd->block[ib], *bd2 = &xd->block[idx];
+      const int idx = (ib & 8) + ((ib & 2) << 1);
+      BLOCKD *bd = &xd->block[ib];
       BLOCK *be = &x->block[ib], *be2 = &x->block[idx];
       int thisdistortion;
 
+      assert(idx < 16);
       for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
         uint8_t **base_pre = which_mv ? bd->base_second_pre : bd->base_pre;
 
@@ -2532,7 +2575,8 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
         if (otherrd) {
           x->fwd_txm8x8(be->src_diff, be2->coeff, 32);
           x->quantize_b_8x8(x, idx, DCT_DCT);
-          thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
+          thisdistortion = vp9_block_error_c(be2->coeff,
+              BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
           otherdist += thisdistortion;
           xd->mode_info_context->mbmi.txfm_size = TX_8X8;
           othercost += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
@@ -2546,7 +2590,8 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
           be = &x->block[ib + iblock[j]];
           x->fwd_txm8x4(be->src_diff, be->coeff, 32);
           x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1);
-          thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
+          thisdistortion = vp9_block_error_c(be->coeff,
+              BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[j], 16), 32);
           *distortion += thisdistortion;
           *labelyrate +=
               cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
@@ -2563,11 +2608,11 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
       } else /* 8x8 */ {
         if (otherrd) {
           for (j = 0; j < 4; j += 2) {
-            BLOCKD *bd = &xd->block[ib + iblock[j]];
             BLOCK *be = &x->block[ib + iblock[j]];
             x->fwd_txm8x4(be->src_diff, be->coeff, 32);
             x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1);
-            thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
+            thisdistortion = vp9_block_error_c(be->coeff,
+                BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[j], 16), 32);
             otherdist += thisdistortion;
             xd->mode_info_context->mbmi.txfm_size = TX_4X4;
             othercost +=
@@ -2586,7 +2631,8 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
         }
         x->fwd_txm8x8(be->src_diff, be2->coeff, 32);
         x->quantize_b_8x8(x, idx, DCT_DCT);
-        thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
+        thisdistortion = vp9_block_error_c(be2->coeff,
+            BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
         *distortion += thisdistortion;
         *labelyrate += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
                                    ta + vp9_block2above[TX_8X8][idx],
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 8f9e9da69..ab286fd8a 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -123,7 +123,9 @@ static void tokenize_b(VP9_COMP *cpi,
   int c = 0;
   const int eob = xd->eobs[ib];     /* one beyond last nonzero coeff */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
-  int16_t *qcoeff_ptr = xd->qcoeff + 16 * ib;
+  const struct plane_block_idx pb_idx = plane_block_idx(xd, ib);
+  const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff,
+                                           pb_idx.block, 16);
   int seg_eob, default_eob, pad;
   const int segment_id = mbmi->segment_id;
   const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
diff --git a/vp9/encoder/x86/vp9_encodeopt.asm b/vp9/encoder/x86/vp9_encodeopt.asm
index 90c793d4f..51314a7a8 100644
--- a/vp9/encoder/x86/vp9_encodeopt.asm
+++ b/vp9/encoder/x86/vp9_encodeopt.asm
@@ -260,117 +260,3 @@ sym(vp9_mbblock_error_xmm_impl):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-
-;int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
-global sym(vp9_mbuverror_mmx_impl) PRIVATE
-sym(vp9_mbuverror_mmx_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;s_ptr
-        mov             rdi,        arg(1) ;d_ptr
-
-        mov             rcx,        16
-        pxor            mm7,        mm7
-
-.mbuverror_loop_mmx:
-
-        movq            mm1,        [rsi]
-        movq            mm2,        [rdi]
-
-        psubw           mm1,        mm2
-        pmaddwd         mm1,        mm1
-
-
-        movq            mm3,        [rsi+8]
-        movq            mm4,        [rdi+8]
-
-        psubw           mm3,        mm4
-        pmaddwd         mm3,        mm3
-
-
-        paddd           mm7,        mm1
-        paddd           mm7,        mm3
-
-
-        add             rsi,        16
-        add             rdi,        16
-
-        dec             rcx
-        jnz             .mbuverror_loop_mmx
-
-        movq            mm0,        mm7
-        psrlq           mm7,        32
-
-        paddd           mm0,        mm7
-        movq            rax,        mm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
-global sym(vp9_mbuverror_xmm_impl) PRIVATE
-sym(vp9_mbuverror_xmm_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;s_ptr
-        mov             rdi,        arg(1) ;d_ptr
-
-        mov             rcx,        16
-        pxor            xmm3,       xmm3
-
-.mbuverror_loop:
-
-        movdqa          xmm1,       [rsi]
-        movdqa          xmm2,       [rdi]
-
-        psubw           xmm1,       xmm2
-        pmaddwd         xmm1,       xmm1
-
-        paddd           xmm3,       xmm1
-
-        add             rsi,        16
-        add             rdi,        16
-
-        dec             rcx
-        jnz             .mbuverror_loop
-
-        pxor        xmm0,           xmm0
-        movdqa      xmm1,           xmm3
-
-        movdqa      xmm2,           xmm1
-        punpckldq   xmm1,           xmm0
-
-        punpckhdq   xmm2,           xmm0
-        paddd       xmm1,           xmm2
-
-        movdqa      xmm2,           xmm1
-
-        psrldq      xmm1,           8
-        paddd       xmm1,           xmm2
-
-        movq            rax,            xmm1
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/vp9/encoder/x86/vp9_x86_csystemdependent.c b/vp9/encoder/x86/vp9_x86_csystemdependent.c
index 2bf32c569..9557af119 100644
--- a/vp9/encoder/x86/vp9_x86_csystemdependent.c
+++ b/vp9/encoder/x86/vp9_x86_csystemdependent.c
@@ -26,17 +26,10 @@ void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
 int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr);
 int vp9_mbblock_error_mmx(MACROBLOCK *mb) {
   short *coeff_ptr =  mb->block[0].coeff;
-  short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
+  short *dcoef_ptr =  mb->e_mbd.plane[0].dqcoeff;
   return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr);
 }
 
-int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
-int vp9_mbuverror_mmx(MACROBLOCK *mb) {
-  short *s_ptr = &mb->coeff[256];
-  short *d_ptr = &mb->e_mbd.dqcoeff[256];
-  return vp9_mbuverror_mmx_impl(s_ptr, d_ptr);
-}
-
 void vp9_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
                              short *diff, unsigned char *predictor,
                              int pitch);
@@ -54,17 +47,10 @@ void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) {
 int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr);
 int vp9_mbblock_error_xmm(MACROBLOCK *mb) {
   short *coeff_ptr =  mb->block[0].coeff;
-  short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
+  short *dcoef_ptr =  mb->e_mbd.plane[0].dqcoeff;
   return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr);
 }
 
-int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
-int vp9_mbuverror_xmm(MACROBLOCK *mb) {
-  short *s_ptr = &mb->coeff[256];
-  short *d_ptr = &mb->e_mbd.dqcoeff[256];
-  return vp9_mbuverror_xmm_impl(s_ptr, d_ptr);
-}
-
 void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
                               short *diff, unsigned char *predictor,
                               int pitch);