4 files changed, 193 insertions, 291 deletions
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 0db4d7046..1e95e2207 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -227,10 +227,10 @@ static void decode_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd,
                          BOOL_DECODER* const bc) {
   const TX_TYPE tx_type = get_tx_type_16x16(xd, 0);
   if (tx_type != DCT_DCT) {
-    vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->plane[0].qcoeff,
-                                    xd->block[0].dequant, xd->predictor,
-                                    xd->dst.y_buffer, 16, xd->dst.y_stride,
-                                    xd->plane[0].eobs[0]);
+    vp9_dequant_iht_add_16x16_c(tx_type, xd->plane[0].qcoeff,
+                                xd->block[0].dequant, xd->predictor,
+                                xd->dst.y_buffer, 16, xd->dst.y_stride,
+                                xd->plane[0].eobs[0]);
   } else {
     vp9_dequant_idct_add_16x16(xd->plane[0].qcoeff, xd->block[0].dequant,
                                xd->predictor, xd->dst.y_buffer,
@@ -269,8 +269,8 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
       }
       tx_type = get_tx_type_8x8(xd, ib);
       if (tx_type != DCT_DCT) {
-        vp9_ht_dequant_idct_add_8x8_c(tx_type, q, dq, pre, dst, 16, stride,
-                                      xd->plane[0].eobs[idx]);
+        vp9_dequant_iht_add_8x8_c(tx_type, q, dq, pre, dst, 16, stride,
+                                  xd->plane[0].eobs[idx]);
       } else {
         vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride,
                                    xd->plane[0].eobs[idx]);
@@ -342,7 +342,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
         b = &xd->block[ib + iblock[j]];
         tx_type = get_tx_type_4x4(xd, ib + iblock[j]);
         if (tx_type != DCT_DCT) {
-          vp9_ht_dequant_idct_add_c(tx_type,
+          vp9_dequant_iht_add_c(tx_type,
               BLOCK_OFFSET(xd->plane[0].qcoeff, ib + iblock[j], 16),
                                     b->dequant, b->predictor,
                                     *(b->base_dst) + b->dst, 16,
@@ -381,11 +381,11 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
       vp9_intra4x4_predict(xd, b, b_mode, b->predictor);
       tx_type = get_tx_type_4x4(xd, i);
       if (tx_type != DCT_DCT) {
-        vp9_ht_dequant_idct_add_c(tx_type,
-                                  BLOCK_OFFSET(xd->plane[0].qcoeff, i, 16),
-                                  b->dequant, b->predictor,
-                                  *(b->base_dst) + b->dst, 16, b->dst_stride,
-                                  xd->plane[0].eobs[i]);
+        vp9_dequant_iht_add_c(tx_type,
+                              BLOCK_OFFSET(xd->plane[0].qcoeff, i, 16),
+                              b->dequant, b->predictor,
+                              *(b->base_dst) + b->dst, 16, b->dst_stride,
+                               xd->plane[0].eobs[i]);
       } else {
         xd->itxm_add(BLOCK_OFFSET(xd->plane[0].qcoeff, i, 16),
                      b->dequant, b->predictor,
@@ -422,11 +422,11 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
       BLOCKD *b = &xd->block[i];
       tx_type = get_tx_type_4x4(xd, i);
       if (tx_type != DCT_DCT) {
-        vp9_ht_dequant_idct_add_c(tx_type,
-                                  BLOCK_OFFSET(xd->plane[0].qcoeff, i, 16),
-                                  b->dequant, b->predictor,
-                                  *(b->base_dst) + b->dst, 16,
-                                  b->dst_stride, xd->plane[0].eobs[i]);
+        vp9_dequant_iht_add_c(tx_type,
+                              BLOCK_OFFSET(xd->plane[0].qcoeff, i, 16),
+                              b->dequant, b->predictor,
+                              *(b->base_dst) + b->dst, 16,
+                              b->dst_stride, xd->plane[0].eobs[i]);
       } else {
         xd->itxm_add(BLOCK_OFFSET(xd->plane[0].qcoeff, i, 16),
                      b->dequant, b->predictor,
@@ -463,13 +463,13 @@ static void decode_sb_16x16(MACROBLOCKD *mb, int y_size) {
                                  mb->dst.y_stride, mb->dst.y_stride,
                                  mb->plane[0].eobs[n * 16]);
     } else {
-      vp9_ht_dequant_idct_add_16x16_c(tx_type,
-                                      BLOCK_OFFSET(mb->plane[0].qcoeff, n, 256),
-                                      mb->block[0].dequant,
-                                      mb->dst.y_buffer + y_offset,
-                                      mb->dst.y_buffer + y_offset,
-                                      mb->dst.y_stride, mb->dst.y_stride,
-                                      mb->plane[0].eobs[n * 16]);
+      vp9_dequant_iht_add_16x16_c(tx_type,
+                                  BLOCK_OFFSET(mb->plane[0].qcoeff, n, 256),
+                                  mb->block[0].dequant,
+                                  mb->dst.y_buffer + y_offset,
+                                  mb->dst.y_buffer + y_offset,
+                                  mb->dst.y_stride, mb->dst.y_stride,
+                                  mb->plane[0].eobs[n * 16]);
     }
   }
 
@@ -513,13 +513,13 @@ static INLINE void decode_sb_8x8(MACROBLOCKD *xd, int y_size) {
                                  xd->dst.y_stride, xd->dst.y_stride,
                                  xd->plane[0].eobs[n * 4]);
     } else {
-      vp9_ht_dequant_idct_add_8x8_c(tx_type,
-                                    BLOCK_OFFSET(xd->plane[0].qcoeff, n, 64),
-                                    xd->block[0].dequant,
-                                    xd->dst.y_buffer + y_offset,
-                                    xd->dst.y_buffer + y_offset,
-                                    xd->dst.y_stride, xd->dst.y_stride,
-                                    xd->plane[0].eobs[n * 4]);
+      vp9_dequant_iht_add_8x8_c(tx_type,
+                                BLOCK_OFFSET(xd->plane[0].qcoeff, n, 64),
+                                xd->block[0].dequant,
+                                xd->dst.y_buffer + y_offset,
+                                xd->dst.y_buffer + y_offset,
+                                xd->dst.y_stride, xd->dst.y_stride,
+                                xd->plane[0].eobs[n * 4]);
     }
   }
 
@@ -563,14 +563,14 @@ static void decode_sb_4x4(MACROBLOCKD *xd, int y_size) {
                    xd->dst.y_stride, xd->dst.y_stride,
                    xd->plane[0].eobs[n]);
     } else {
-      vp9_ht_dequant_idct_add_c(tx_type,
-                                BLOCK_OFFSET(xd->plane[0].qcoeff, n, 16),
-                                xd->block[0].dequant,
-                                xd->dst.y_buffer + y_offset,
-                                xd->dst.y_buffer + y_offset,
-                                xd->dst.y_stride,
-                                xd->dst.y_stride,
-                                xd->plane[0].eobs[n]);
+      vp9_dequant_iht_add_c(tx_type,
+                            BLOCK_OFFSET(xd->plane[0].qcoeff, n, 16),
+                            xd->block[0].dequant,
+                            xd->dst.y_buffer + y_offset,
+                            xd->dst.y_buffer + y_offset,
+                            xd->dst.y_stride,
+                            xd->dst.y_stride,
+                            xd->plane[0].eobs[n]);
     }
   }
 
@@ -608,7 +608,7 @@ static void decode_sb64(VP9D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col,
     mb_init_dequantizer(pbi, xd);
 
   if (mi->mbmi.mb_skip_coeff) {
-    vp9_reset_sb64_tokens_context(xd);
+    vp9_reset_sb_tokens_context(xd, BLOCK_SIZE_SB64X64);
 
     // Special case:  Force the loopfilter to skip when eobtotal and
     // mb_skip_coeff are zero.
@@ -686,7 +686,7 @@ static void decode_sb32(VP9D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col,
     mb_init_dequantizer(pbi, xd);
 
   if (mi->mbmi.mb_skip_coeff) {
-    vp9_reset_sb_tokens_context(xd);
+    vp9_reset_sb_tokens_context(xd, BLOCK_SIZE_SB32X32);
 
     // Special case:  Force the loopfilter to skip when eobtotal and
     // mb_skip_coeff are zero.
@@ -877,31 +877,26 @@ static void set_offsets(VP9D_COMP *pbi, int block_size,
                         int mb_row, int mb_col) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  const int mis = cm->mode_info_stride;
-  const int idx = mis * mb_row + mb_col;
-  const int dst_fb_idx = cm->new_fb_idx;
-  const int recon_y_stride = cm->yv12_fb[dst_fb_idx].y_stride;
-  const int recon_uv_stride = cm->yv12_fb[dst_fb_idx].uv_stride;
-  const int recon_yoffset = mb_row * 16 * recon_y_stride + 16 * mb_col;
-  const int recon_uvoffset = mb_row * 8 * recon_uv_stride + 8 * mb_col;
-
-  xd->mode_info_context = cm->mi + idx;
-  xd->mode_info_context->mbmi.sb_type = block_size >> 5;
-  xd->prev_mode_info_context = cm->prev_mi + idx;
-  xd->above_context = cm->above_context + mb_col;
-  xd->left_context = cm->left_context + (mb_row & 3);
 
-  // Distance of Mb to the various image edges.
-  // These are specified to 8th pel as they are always compared to
-  // values that are in 1/8th pel units
-  block_size >>= 4;  // in mb units
+  const int mb_idx = mb_row * cm->mode_info_stride + mb_col;
+  const YV12_BUFFER_CONFIG *dst_fb = &cm->yv12_fb[cm->new_fb_idx];
+  const int recon_yoffset = (16 * mb_row) * dst_fb->y_stride + (16 * mb_col);
+  const int recon_uvoffset = (8 * mb_row) * dst_fb->uv_stride + (8 * mb_col);
+
+  xd->mode_info_context = cm->mi + mb_idx;
+  xd->mode_info_context->mbmi.sb_type = (BLOCK_SIZE_TYPE)(block_size / 32);
+  xd->prev_mode_info_context = cm->prev_mi + mb_idx;
+  xd->above_context = cm->above_context + mb_col;
+  xd->left_context = cm->left_context + mb_row % 4;
 
-  set_mb_row(cm, xd, mb_row, block_size);
-  set_mb_col(cm, xd, mb_col, block_size);
+  // Distance of MB to the various image edges. These are specified in 1/8th
+  // pel units, as they are always compared to values in 1/8th pel units.
+  set_mb_row(cm, xd, mb_row, block_size / 16);
+  set_mb_col(cm, xd, mb_col, block_size / 16);
 
-  xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-  xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-  xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+  xd->dst.y_buffer = dst_fb->y_buffer + recon_yoffset;
+  xd->dst.u_buffer = dst_fb->u_buffer + recon_uvoffset;
+  xd->dst.v_buffer = dst_fb->v_buffer + recon_uvoffset;
 }
 
 static void set_refs(VP9D_COMP *pbi, int block_size, int mb_row, int mb_col) {
@@ -923,6 +918,8 @@ static void set_refs(VP9D_COMP *pbi, int block_size, int mb_row, int mb_col) {
       // Select the appropriate reference frame for this MB
       const int second_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1];
       const YV12_BUFFER_CONFIG *second_cfg = &cm->yv12_fb[second_fb_idx];
+      xd->scale_factor[1]    = cm->active_ref_scale[mbmi->second_ref_frame - 1];
+      xd->scale_factor_uv[1] = cm->active_ref_scale[mbmi->second_ref_frame - 1];
       setup_pred_block(&xd->second_pre, second_cfg, mb_row, mb_col,
                        &xd->scale_factor[1], &xd->scale_factor_uv[1]);
       xd->corrupted |= second_cfg->corrupted;
@@ -1637,15 +1634,14 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
   // For all non key frames the GF and ARF refresh flags and sign bias
   // flags must be set explicitly.
   if (pc->frame_type == KEY_FRAME) {
-    pc->active_ref_idx[0] = pc->new_fb_idx;
-    pc->active_ref_idx[1] = pc->new_fb_idx;
-    pc->active_ref_idx[2] = pc->new_fb_idx;
+    for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
+      pc->active_ref_idx[i] = pc->new_fb_idx;
   } else {
     // Should the GF or ARF be updated from the current frame
     pbi->refresh_frame_flags = vp9_read_literal(&header_bc, NUM_REF_FRAMES);
 
     // Select active reference frames
-    for (i = 0; i < 3; i++) {
+    for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
       int ref_frame_num = vp9_read_literal(&header_bc, NUM_REF_FRAMES_LG2);
       pc->active_ref_idx[i] = pc->ref_frame_map[ref_frame_num];
     }
@@ -1665,19 +1661,18 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
     pc->use_interintra = vp9_read_bit(&header_bc);
 #endif
 
-    /* Calculate scaling factors for each of the 3 available references */
-    for (i = 0; i < 3; ++i) {
-      if (pc->active_ref_idx[i] >= NUM_YV12_BUFFERS) {
-        memset(&pc->active_ref_scale[i], 0, sizeof(pc->active_ref_scale[i]));
-        continue;
-      }
-
-      vp9_setup_scale_factors_for_frame(&pc->active_ref_scale[i],
-                                        &pc->yv12_fb[pc->active_ref_idx[i]],
-                                        pc->width, pc->height);
+    // Calculate scaling factors for each active reference frame
+    for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+      const int idx = pc->active_ref_idx[i];
+      struct scale_factors *sf = &pc->active_ref_scale[i];
+      if (idx >= NUM_YV12_BUFFERS)
+        memset(sf, 0, sizeof(*sf));
+      else
+        vp9_setup_scale_factors_for_frame(sf, &pc->yv12_fb[idx],
+                                          pc->width, pc->height);
     }
 
-    // To enable choice of different interploation filters
+    // To enable choice of different interpolation filters
     vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
   }
 
diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c
index c0d1e2adb..1539ee7d1 100644
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -79,10 +79,10 @@ void vp9_add_constant_residual_32x32_c(const int16_t diff, const uint8_t *pred,
   add_constant_residual(diff, pred, pitch, dest, stride, 32, 32);
 }
 
-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
-                               const int16_t *dq,
-                               uint8_t *pred, uint8_t *dest,
-                               int pitch, int stride, int eob) {
+void vp9_dequant_iht_add_c(TX_TYPE tx_type, int16_t *input,
+                           const int16_t *dq,
+                           uint8_t *pred, uint8_t *dest,
+                           int pitch, int stride, int eob) {
   int i;
   DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
 
@@ -94,10 +94,10 @@ void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
   vp9_add_residual_4x4(output, pred, pitch, dest, stride);
 }
 
-void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
-                                   const int16_t *dq,
-                                   uint8_t *pred, uint8_t *dest,
-                                   int pitch, int stride, int eob) {
+void vp9_dequant_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input,
+                               const int16_t *dq,
+                               uint8_t *pred, uint8_t *dest,
+                               int pitch, int stride, int eob) {
   DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
 
   if (eob == 0) {
@@ -253,10 +253,10 @@ void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq,
   }
 }
 
-void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
-                                     const int16_t *dq, uint8_t *pred,
-                                     uint8_t *dest, int pitch, int stride,
-                                     int eob) {
+void vp9_dequant_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input,
+                                 const int16_t *dq, uint8_t *pred,
+                                 uint8_t *dest, int pitch, int stride,
+                                 int eob) {
   DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
 
   if (eob == 0) {
diff --git a/vp9/decoder/vp9_dequantize.h b/vp9/decoder/vp9_dequantize.h
index bb72bb294..da9e2b72f 100644
--- a/vp9/decoder/vp9_dequantize.h
+++ b/vp9/decoder/vp9_dequantize.h
@@ -44,18 +44,18 @@ void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,
                                               int stride,
                                               uint16_t *eobs);
 
-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq,
-                                    unsigned char *pred, unsigned char *dest,
-                                    int pitch, int stride, int eob);
-
-void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
-                                   const int16_t *dq, unsigned char *pred,
-                                   unsigned char *dest, int pitch, int stride,
-                                   int eob);
-
-void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
-                                     const int16_t *dq, unsigned char *pred,
-                                     unsigned char *dest,
-                                     int pitch, int stride, int eob);
+void vp9_dequant_iht_add_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq,
+                           unsigned char *pred, unsigned char *dest,
+                           int pitch, int stride, int eob);
+
+void vp9_dequant_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input,
+                               const int16_t *dq, unsigned char *pred,
+                               unsigned char *dest, int pitch, int stride,
+                               int eob);
+
+void vp9_dequant_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input,
+                                 const int16_t *dq, unsigned char *pred,
+                                 unsigned char *dest,
+                                 int pitch, int stride, int eob);
 
 #endif  // VP9_DECODER_VP9_DEQUANTIZE_H_
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index a4ada2b7e..9077fcde1 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -380,182 +380,122 @@ static int get_eob(MACROBLOCKD* const xd, int segment_id, int eob_max) {
   return vp9_get_segdata(xd, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
-static INLINE int decode_sb(VP9D_COMP* const pbi,
-                            MACROBLOCKD* const xd,
-                            BOOL_DECODER* const bc,
-                            int offset, int count, int inc,
-                            int eob_max, TX_SIZE tx_size) {
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int seg_eob = get_eob(xd, segment_id, eob_max);
+/* TODO(jkoleszar): Probably best to remove instances that require this,
+ * as the data will likely become per-plane and be stored in the per-plane
+ * structures. This is a stub to work with the existing code.
+ */
+static INLINE int block_idx_4x4(MACROBLOCKD* const xd, int block_size_b,
+                                int plane, int i) {
+  const int luma_blocks = 1 << block_size_b;
+  assert(xd->plane[0].subsampling_x == 0);
+  assert(xd->plane[0].subsampling_y == 0);
+  assert(xd->plane[1].subsampling_x == 1);
+  assert(xd->plane[1].subsampling_y == 1);
+  assert(xd->plane[2].subsampling_x == 1);
+  assert(xd->plane[2].subsampling_y == 1);
+  return plane == 0 ? i :
+         plane == 1 ? luma_blocks + i :
+                      luma_blocks * 5 / 4 + i;
+}
+
+static INLINE int decode_block_plane(VP9D_COMP* const pbi,
+                                     MACROBLOCKD* const xd,
+                                     BOOL_DECODER* const bc,
+                                     BLOCK_SIZE_LG2 block_size,
+                                     int segment_id,
+                                     int plane,
+                                     int is_split) {
+  // block and transform sizes, as log2 of the number of 4x4 blocks ("*_b")
+  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
+  const TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+  const BLOCK_SIZE_LG2 block_size_b = block_size;
+  const BLOCK_SIZE_LG2 txfrm_size_b = tx_size * 2;
+
+  // subsampled size of the block
+  const int ss_sum = xd->plane[plane].subsampling_x +
+                     xd->plane[plane].subsampling_y;
+  const BLOCK_SIZE_LG2 ss_block_size = block_size_b - ss_sum;
+
+  // size of the transform to use. scale the transform down if it's larger
+  // than the size of the subsampled data, or forced externally by the mb mode.
+  const int ss_max = MAX(xd->plane[plane].subsampling_x,
+                         xd->plane[plane].subsampling_y);
+  const BLOCK_SIZE_LG2 ss_txfrm_size = txfrm_size_b > ss_block_size || is_split
+                                       ? txfrm_size_b - ss_max * 2
+                                       : txfrm_size_b;
+  const TX_SIZE ss_tx_size = ss_txfrm_size / 2;
+
+  // TODO(jkoleszar): 1 may not be correct here with larger chroma planes.
+  const int inc = is_split ? 1 : (1 << ss_txfrm_size);
+
+  // find the maximum eob for this transform size, adjusted by segment
+  const int seg_eob = get_eob(xd, segment_id, 16 << ss_txfrm_size);
+
   int i, eobtotal = 0;
 
-  assert(count == offset * 3 / 2);
+  assert(txfrm_size_b <= block_size_b);
+  assert(ss_txfrm_size <= ss_block_size);
 
-  // luma blocks
-  for (i = 0; i < offset; i += inc) {
-    const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC, seg_eob,
-                               BLOCK_OFFSET(xd->plane[0].qcoeff, i, 16),
-                               tx_size);
-    xd->plane[0].eobs[i] = c;
-    eobtotal += c;
-  }
+  // step through the block by the size of the transform in use.
+  for (i = 0; i < (1 << ss_block_size); i += inc) {
+    const int block_idx = block_idx_4x4(xd, block_size_b, plane, i);
 
-  // chroma blocks
-  for (i = offset; i < offset * 5 / 4; i += inc) {
-    const int b = i - offset;
-    const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,
-                               BLOCK_OFFSET(xd->plane[1].qcoeff, b, 16),
-                               tx_size);
-    xd->plane[1].eobs[b] = c;
+    const int c = decode_coefs(pbi, xd, bc, block_idx,
+                               xd->plane[plane].plane_type, seg_eob,
+                               BLOCK_OFFSET(xd->plane[plane].qcoeff, i, 16),
+                               ss_tx_size);
+    xd->plane[plane].eobs[i] = c;
     eobtotal += c;
   }
-  for (i = offset * 5 / 4; i < count; i += inc) {
-    const int b = i - offset * 5 / 4;
-    const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,
-                               BLOCK_OFFSET(xd->plane[2].qcoeff, b, 16),
-                               tx_size);
-    xd->plane[2].eobs[b] = c;
-    eobtotal += c;
-  }
-
   return eobtotal;
 }
 
-int vp9_decode_sb_tokens(VP9D_COMP* const pbi,
-                         MACROBLOCKD* const xd,
-                         BOOL_DECODER* const bc) {
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_32X32: {
-      // 32x32 luma block
-      const int segment_id = xd->mode_info_context->mbmi.segment_id;
-      int eobtotal = 0, seg_eob;
-      int c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC,
-                           get_eob(xd, segment_id, 1024),
-                           xd->plane[0].qcoeff, TX_32X32);
-      xd->plane[0].eobs[0] = c;
-      eobtotal += c;
-
-      // 16x16 chroma blocks
-      seg_eob = get_eob(xd, segment_id, 256);
-
-      c = decode_coefs(pbi, xd, bc, 64, PLANE_TYPE_UV, seg_eob,
-                       xd->plane[1].qcoeff, TX_16X16);
-      xd->plane[1].eobs[0] = c;
-      eobtotal += c;
-      c = decode_coefs(pbi, xd, bc, 80, PLANE_TYPE_UV, seg_eob,
-                       xd->plane[2].qcoeff, TX_16X16);
-      xd->plane[2].eobs[0] = c;
-      eobtotal += c;
-      return eobtotal;
-    }
-    case TX_16X16:
-      return decode_sb(pbi, xd, bc, 64, 96, 16, 16 * 16, TX_16X16);
-    case TX_8X8:
-      return decode_sb(pbi, xd, bc, 64, 96, 4, 8 * 8, TX_8X8);
-    case TX_4X4:
-      return decode_sb(pbi, xd, bc, 64, 96, 1, 4 * 4, TX_4X4);
-    default:
-      assert(0);
-      return 0;
+static INLINE int decode_blocks_helper(VP9D_COMP* const pbi,
+                                       MACROBLOCKD* const xd,
+                                       BOOL_DECODER* const bc,
+                                       int block_size,
+                                       int is_split_chroma) {
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+  int plane, eobtotal = 0;
+
+  for (plane = 0; plane < MAX_MB_PLANE; plane++) {
+    const int is_split = is_split_chroma &&
+                         xd->plane[plane].plane_type == PLANE_TYPE_UV;
+    eobtotal += decode_block_plane(pbi, xd, bc, block_size, segment_id,
+                                   plane, is_split);
   }
+  return eobtotal;
+}
+
+static INLINE int decode_blocks(VP9D_COMP* const pbi,
+                                MACROBLOCKD* const xd,
+                                BOOL_DECODER* const bc,
+                                int block_size) {
+  const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
+  const TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+  return decode_blocks_helper(pbi, xd, bc, block_size,
+      tx_size == TX_8X8 && (mode == I8X8_PRED || mode == SPLITMV));
 }
 
 int vp9_decode_sb64_tokens(VP9D_COMP* const pbi,
                            MACROBLOCKD* const xd,
                            BOOL_DECODER* const bc) {
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_32X32:
-      return decode_sb(pbi, xd, bc, 256, 384, 64, 32 * 32, TX_32X32);
-    case TX_16X16:
-      return decode_sb(pbi, xd, bc, 256, 384, 16, 16 * 16, TX_16X16);
-    case TX_8X8:
-      return decode_sb(pbi, xd, bc, 256, 384, 4, 8 * 8, TX_8X8);
-    case TX_4X4:
-      return decode_sb(pbi, xd, bc, 256, 384, 1, 4 * 4, TX_4X4);
-    default:
-      assert(0);
-      return 0;
-  }
+  return decode_blocks(pbi, xd, bc, BLOCK_64X64_LG2);
 }
 
-static int vp9_decode_mb_tokens_16x16(VP9D_COMP* const pbi,
-                                      MACROBLOCKD* const xd,
-                                      BOOL_DECODER* const bc) {
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  int eobtotal = 0, seg_eob;
-
-  // Luma block
-  int c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC,
-                       get_eob(xd, segment_id, 256),
-                       xd->plane[0].qcoeff, TX_16X16);
-  xd->plane[0].eobs[0] = c;
-  eobtotal += c;
-
-  // 8x8 chroma blocks
-  seg_eob = get_eob(xd, segment_id, 64);
-
-  c = decode_coefs(pbi, xd, bc, 16, PLANE_TYPE_UV,
-                   seg_eob, xd->plane[1].qcoeff, TX_8X8);
-  xd->plane[1].eobs[0] = c;
-  eobtotal += c;
-  c = decode_coefs(pbi, xd, bc, 20, PLANE_TYPE_UV,
-                   seg_eob, xd->plane[2].qcoeff, TX_8X8);
-  xd->plane[2].eobs[0] = c;
-  eobtotal += c;
-  return eobtotal;
+int vp9_decode_sb_tokens(VP9D_COMP* const pbi,
+                         MACROBLOCKD* const xd,
+                         BOOL_DECODER* const bc) {
+  return decode_blocks(pbi, xd, bc, BLOCK_32X32_LG2);
 }
 
-static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi,
-                                    MACROBLOCKD* const xd,
-                                    BOOL_DECODER* const bc) {
-  int i, eobtotal = 0;
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-
-  // luma blocks
-  int seg_eob = get_eob(xd, segment_id, 64);
-  for (i = 0; i < 16; i += 4) {
-    const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC, seg_eob,
-                               BLOCK_OFFSET(xd->plane[0].qcoeff, i, 16),
-                               TX_8X8);
-    xd->plane[0].eobs[i] = c;
-    eobtotal += c;
-  }
-
-  // chroma blocks
-  if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-      xd->mode_info_context->mbmi.mode == SPLITMV) {
-    // use 4x4 transform for U, V components in I8X8/splitmv prediction mode
-    seg_eob = get_eob(xd, segment_id, 16);
-    for (i = 16; i < 20; i++) {
-      const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,
-                                 BLOCK_OFFSET(xd->plane[1].qcoeff, i - 16, 16),
-                                 TX_4X4);
-      xd->plane[1].eobs[i - 16] = c;
-      eobtotal += c;
-    }
-    for (i = 20; i < 24; i++) {
-      const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,
-                                 BLOCK_OFFSET(xd->plane[2].qcoeff, i - 20, 16),
-                                 TX_4X4);
-      xd->plane[2].eobs[i - 20] = c;
-      eobtotal += c;
-    }
-  } else {
-    int c;
-
-    c = decode_coefs(pbi, xd, bc, 16, PLANE_TYPE_UV, seg_eob,
-                     xd->plane[1].qcoeff, TX_8X8);
-    xd->plane[1].eobs[0] = c;
-    eobtotal += c;
-    c = decode_coefs(pbi, xd, bc, 20, PLANE_TYPE_UV, seg_eob,
-                     xd->plane[2].qcoeff, TX_8X8);
-    xd->plane[2].eobs[0] = c;
-    eobtotal += c;
-  }
-
-  return eobtotal;
+int vp9_decode_mb_tokens(VP9D_COMP* const pbi,
+                         MACROBLOCKD* const xd,
+                         BOOL_DECODER* const bc) {
+  return decode_blocks(pbi, xd, bc, BLOCK_16X16_LG2);
 }
 
+#if CONFIG_NEWBINTRAMODES
 static int decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
                             BOOL_DECODER* const bc,
                             PLANE_TYPE type, int i, int seg_eob) {
@@ -588,39 +528,6 @@ int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx,
   return decode_mb_tokens_4x4_uv(dx, xd, bc, seg_eob);
 }
 
-static int vp9_decode_mb_tokens_4x4(VP9D_COMP* const dx,
-                                    MACROBLOCKD* const xd,
-                                    BOOL_DECODER* const bc) {
-  int i, eobtotal = 0;
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int seg_eob = get_eob(xd, segment_id, 16);
-
-  // luma blocks
-  for (i = 0; i < 16; ++i)
-    eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_Y_WITH_DC, i, seg_eob);
-
-  // chroma blocks
-  eobtotal += decode_mb_tokens_4x4_uv(dx, xd, bc, seg_eob);
-
-  return eobtotal;
-}
-
-int vp9_decode_mb_tokens(VP9D_COMP* const dx,
-                         MACROBLOCKD* const xd,
-                         BOOL_DECODER* const bc) {
-  const TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-  switch (tx_size) {
-    case TX_16X16:
-      return vp9_decode_mb_tokens_16x16(dx, xd, bc);
-    case TX_8X8:
-      return vp9_decode_mb_tokens_8x8(dx, xd, bc);
-    default:
-      assert(tx_size == TX_4X4);
-      return vp9_decode_mb_tokens_4x4(dx, xd, bc);
-  }
-}
-
-#if CONFIG_NEWBINTRAMODES
 int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
                          BOOL_DECODER* const bc,
                          PLANE_TYPE type, int i) {