26 files changed, 1537 insertions, 5141 deletions
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 3ab67cd8c..6624f07eb 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -281,10 +281,6 @@ static void sb_kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
   write_token(bc, vp9_uv_mode_tree, p, vp9_sb_kf_ymode_encodings + m);
 }
 
-static void write_i8x8_mode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_i8x8_mode_tree, p, vp9_i8x8_mode_encodings + m);
-}
-
 static void write_uv_mode(vp9_writer *bc, int m, const vp9_prob *p) {
   write_token(bc, vp9_uv_mode_tree, p, vp9_uv_mode_encodings + m);
 }
@@ -302,10 +298,6 @@ static void write_kf_bmode(vp9_writer *bc, int m, const vp9_prob *p) {
   write_token(bc, vp9_kf_bmode_tree, p, vp9_kf_bmode_encodings + m);
 }
 
-static void write_split(vp9_writer *bc, int x, const vp9_prob *p) {
-  write_token(bc, vp9_mbsplit_tree, p, vp9_mbsplit_encodings + x);
-}
-
 static int prob_update_savings(const unsigned int *ct,
                                const vp9_prob oldp, const vp9_prob newp,
                                const vp9_prob upd) {
@@ -508,60 +500,9 @@ static void write_nmv(VP9_COMP *cpi, vp9_writer *bc,
 // It should only be called if a segment map update is indicated.
 static void write_mb_segid(vp9_writer *bc,
                            const MB_MODE_INFO *mi, const MACROBLOCKD *xd) {
-  // Encode the MB segment id.
-  int seg_id = mi->segment_id;
-
-  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
-    switch (seg_id) {
-      case 0:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[2]);
-        break;
-      case 1:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[2]);
-        break;
-      case 2:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[1]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[3]);
-        break;
-      case 3:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[1]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[3]);
-        break;
-      case 4:
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[4]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[5]);
-        break;
-      case 5:
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[4]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[5]);
-        break;
-      case 6:
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[4]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[6]);
-        break;
-      case 7:
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[4]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[6]);
-        break;
-
-        // TRAP.. This should not happen
-      default:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[2]);
-        break;
-    }
-  }
+  if (xd->segmentation_enabled && xd->update_mb_segmentation_map)
+    treed_write(bc, vp9_segment_tree, xd->mb_segment_tree_probs,
+                mi->segment_id, 3);
 }
 
 // This function encodes the reference frame
@@ -718,7 +659,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
     active_section = 6;
 #endif
 
-    if (m->mbmi.sb_type > BLOCK_SIZE_MB16X16)
+    if (m->mbmi.sb_type > BLOCK_SIZE_SB8X8)
       write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
     else
       write_ymode(bc, mode, pc->fc.ymode_prob);
@@ -728,21 +669,10 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
       do {
         write_bmode(bc, m->bmi[j].as_mode.first,
                     pc->fc.bmode_prob);
-      } while (++j < 16);
-    }
-    if (mode == I8X8_PRED) {
-      write_i8x8_mode(bc, m->bmi[0].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-      write_i8x8_mode(bc, m->bmi[2].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-      write_i8x8_mode(bc, m->bmi[8].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-      write_i8x8_mode(bc, m->bmi[10].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-    } else {
-      write_uv_mode(bc, mi->uv_mode,
-                    pc->fc.uv_mode_prob[mode]);
+      } while (++j < 4);
     }
+    write_uv_mode(bc, mi->uv_mode,
+                  pc->fc.uv_mode_prob[mode]);
   } else {
     vp9_prob mv_ref_p[VP9_MVREFS - 1];
 
@@ -754,7 +684,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
 
     // If segment skip is not enabled code the mode.
     if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
-      if (mi->sb_type > BLOCK_SIZE_MB16X16) {
+      if (mi->sb_type > BLOCK_SIZE_SB8X8) {
         write_sb_mv_ref(bc, mode, mv_ref_p);
       } else {
         write_mv_ref(bc, mode, mv_ref_p);
@@ -824,26 +754,16 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
         ++count_mb_seg[mi->partitioning];
 #endif
 
-        write_split(bc, mi->partitioning, cpi->common.fc.mbsplit_prob);
-        cpi->mbsplit_count[mi->partitioning]++;
-
         do {
           B_PREDICTION_MODE blockmode;
           int_mv blockmv;
-          const int *const  L = vp9_mbsplits[mi->partitioning];
           int k = -1;  /* first block in subset j */
           int mv_contz;
           int_mv leftmv, abovemv;
 
           blockmode = cpi->mb.partition_info->bmi[j].mode;
           blockmv = cpi->mb.partition_info->bmi[j].mv;
-#if CONFIG_DEBUG
-          while (j != L[++k])
-            if (k >= 16)
-              assert(0);
-#else
-          while (j != L[++k]);
-#endif
+          k = j;
           leftmv.as_int = left_block_mv(xd, m, k);
           abovemv.as_int = above_block_mv(m, k, mis);
           mv_contz = vp9_mv_cont(&leftmv, &abovemv);
@@ -875,16 +795,15 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
     }
   }
 
-  if (((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
-       (rf != INTRA_FRAME && !(mode == SPLITMV &&
-                               mi->partitioning == PARTITIONING_4X4))) &&
+  if (((rf == INTRA_FRAME && mode != I4X4_PRED) ||
+       (rf != INTRA_FRAME && mode != SPLITMV)) &&
       pc->txfm_mode == TX_MODE_SELECT &&
-          !(skip_coeff || vp9_segfeature_active(xd, segment_id,
-                                                SEG_LVL_SKIP))) {
+      !(skip_coeff || vp9_segfeature_active(xd, segment_id,
+                                            SEG_LVL_SKIP))) {
     TX_SIZE sz = mi->txfm_size;
     // FIXME(rbultje) code ternary symbol once all experiments are merged
     vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
-    if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV) {
+    if (mi->sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) {
       vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
       if (mi->sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
         vp9_write(bc, sz != TX_16X16, pc->prob_tx[2]);
@@ -912,7 +831,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi,
     vp9_write(bc, skip_coeff, vp9_get_pred_prob(c, xd, PRED_MBSKIP));
   }
 
-  if (m->mbmi.sb_type > BLOCK_SIZE_MB16X16)
+  if (m->mbmi.sb_type > BLOCK_SIZE_SB8X8)
     sb_kfwrite_ymode(bc, ym, c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
   else
     kfwrite_ymode(bc, ym, c->kf_ymode_prob[c->kf_ymode_probs_index]);
@@ -921,35 +840,26 @@ static void write_mb_modes_kf(const VP9_COMP *cpi,
     int i = 0;
     do {
       const B_PREDICTION_MODE a = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE l = (xd->left_available || (i & 3)) ?
+      const B_PREDICTION_MODE l = (xd->left_available ||
+                                  (i & 1)) ?
                                   left_block_mode(m, i) : B_DC_PRED;
       const int bm = m->bmi[i].as_mode.first;
 
 #ifdef ENTROPY_STATS
       ++intra_mode_stats [A] [L] [bm];
 #endif
-
       write_kf_bmode(bc, bm, c->kf_bmode_prob[a][l]);
-    } while (++i < 16);
+    } while (++i < 4);
   }
-  if (ym == I8X8_PRED) {
-    write_i8x8_mode(bc, m->bmi[0].as_mode.first, c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[0].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[2].as_mode.first, c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[2].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[8].as_mode.first, c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[8].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[10].as_mode.first, c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[10].as_mode.first); fflush(stdout);
-  } else
-    write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
 
-  if (ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
+  write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
+
+  if (ym != I4X4_PRED && c->txfm_mode == TX_MODE_SELECT &&
       !(skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))) {
     TX_SIZE sz = m->mbmi.txfm_size;
     // FIXME(rbultje) code ternary symbol once all experiments are merged
     vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
-    if (sz != TX_4X4 && ym <= TM_PRED) {
+    if (m->mbmi.sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) {
       vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);
       if (m->mbmi.sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
         vp9_write(bc, sz != TX_16X16, c->prob_tx[2]);
@@ -1162,45 +1072,34 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
   else
     assert(0);
 
-  if (bsize > BLOCK_SIZE_MB16X16) {
+  if (bsize > BLOCK_SIZE_SB8X8) {
     int pl;
     xd->left_seg_context =
-        cm->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);
-    xd->above_seg_context = cm->above_seg_context + (mi_col >> CONFIG_SB8X8);
+        cm->left_seg_context + ((mi_row >> 1) & 3);
+    xd->above_seg_context = cm->above_seg_context + (mi_col >> 1);
     pl = partition_plane_context(xd, bsize);
     // encode the partition information
     write_token(bc, vp9_partition_tree, cm->fc.partition_prob[pl],
                 vp9_partition_encodings + partition);
   }
 
+  subsize = get_subsize(bsize, partition);
+
   switch (partition) {
     case PARTITION_NONE:
-      subsize = bsize;
       write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col);
       break;
     case PARTITION_HORZ:
-      subsize = (bsize == BLOCK_SIZE_SB64X64) ? BLOCK_SIZE_SB64X32 :
-                                                BLOCK_SIZE_SB32X16;
       write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col);
       if ((mi_row + bh) < cm->mi_rows)
         write_modes_b(cpi, m + bh * mis, bc, tok, tok_end, mi_row + bh, mi_col);
       break;
     case PARTITION_VERT:
-      subsize = (bsize == BLOCK_SIZE_SB64X64) ? BLOCK_SIZE_SB32X64 :
-                                                BLOCK_SIZE_SB16X32;
       write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col);
       if ((mi_col + bw) < cm->mi_cols)
         write_modes_b(cpi, m + bw, bc, tok, tok_end, mi_row, mi_col + bw);
       break;
     case PARTITION_SPLIT:
-      // TODO(jingning): support recursive partitioning down to 16x16 as for
-      // now. need to merge in 16x8, 8x16, 8x8, and smaller partitions.
-      if (bsize == BLOCK_SIZE_SB64X64)
-        subsize = BLOCK_SIZE_SB32X32;
-      else if (bsize == BLOCK_SIZE_SB32X32)
-        subsize = BLOCK_SIZE_MB16X16;
-      else
-        assert(0);
       for (n = 0; n < 4; n++) {
         int j = n >> 1, i = n & 0x01;
         write_modes_sb(cpi, m + j * bs * mis + i * bs, bc, tok, tok_end,
@@ -1212,11 +1111,11 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
   }
 
   // update partition context
-  if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_SB32X32))
+  if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_MB16X16))
     return;
 
-  xd->left_seg_context = cm->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);
-  xd->above_seg_context = cm->above_seg_context + (mi_col >> CONFIG_SB8X8);
+  xd->left_seg_context = cm->left_seg_context + ((mi_row >> 1) & 3);
+  xd->above_seg_context = cm->above_seg_context + (mi_col >> 1);
   update_partition_context(xd, subsize, bsize);
 }
 
@@ -1233,12 +1132,12 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
 
   for (mi_row = c->cur_tile_mi_row_start;
        mi_row < c->cur_tile_mi_row_end;
-       mi_row += (4 << CONFIG_SB8X8), m_ptr += (4 << CONFIG_SB8X8) * mis) {
+       mi_row += 8, m_ptr += 8 * mis) {
     m = m_ptr;
     vpx_memset(c->left_seg_context, 0, sizeof(c->left_seg_context));
     for (mi_col = c->cur_tile_mi_col_start;
          mi_col < c->cur_tile_mi_col_end;
-         mi_col += (4 << CONFIG_SB8X8), m += (4 << CONFIG_SB8X8))
+         mi_col += 8, m += 8)
       write_modes_sb(cpi, m, bc, tok, tok_end, mi_row, mi_col,
                      BLOCK_SIZE_SB64X64);
   }
@@ -1719,16 +1618,147 @@ static void segment_reference_frames(VP9_COMP *cpi) {
   }
 }
 
-void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
-                        unsigned long *size) {
+static void encode_loopfilter(MACROBLOCKD *xd, vp9_writer *w) {
+  int i;
+
+  // Code the mode/ref loop filter deltas into w: an enabled flag, an update
+  // flag and, per delta, a changed bit plus 6-bit magnitude and sign bit.
+  vp9_write_bit(w, xd->mode_ref_lf_delta_enabled);
+
+  if (xd->mode_ref_lf_delta_enabled) {
+    // Do the deltas need to be updated
+    vp9_write_bit(w, xd->mode_ref_lf_delta_update);
+    if (xd->mode_ref_lf_delta_update) {
+      // Reference frame deltas
+      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
+        const int delta = xd->ref_lf_deltas[i];
+
+        // Only a delta that differs from the last one recorded is sent.
+        if (delta != xd->last_ref_lf_deltas[i]) {
+          xd->last_ref_lf_deltas[i] = delta;
+          vp9_write_bit(w, 1);
+          // Zero takes the non-negative path so a delta can be reset to 0.
+          if (delta >= 0) {
+            vp9_write_literal(w, delta & 0x3F, 6);
+            vp9_write_bit(w, 0);  // sign
+          } else {
+            // Negative: 6-bit magnitude followed by a set sign bit.
+            vp9_write_literal(w, (-delta) & 0x3F, 6);
+            vp9_write_bit(w, 1);  // sign
+          }
+        } else {
+          vp9_write_bit(w, 0);
+        }
+      }
+
+      // Mode deltas, coded the same way
+      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+        const int delta = xd->mode_lf_deltas[i];
+        if (delta != xd->last_mode_lf_deltas[i]) {
+          xd->last_mode_lf_deltas[i] = delta;
+          vp9_write_bit(w, 1);
+          // Zero takes the non-negative path so a delta can be reset to 0.
+          if (delta >= 0) {
+            vp9_write_literal(w, delta & 0x3F, 6);
+            vp9_write_bit(w, 0);  // sign
+          } else {
+            // Negative: 6-bit magnitude followed by a set sign bit.
+            vp9_write_literal(w, (-delta) & 0x3F, 6);
+            vp9_write_bit(w, 1);  // sign
+          }
+        } else {
+          vp9_write_bit(w, 0);
+        }
+      }
+    }
+  }
+}
+
+static void encode_segmentation(VP9_COMP *cpi, vp9_writer *w) {
   int i, j;
+  VP9_COMMON *const pc = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  // Enabled flag first; when segmentation is off nothing else is written.
+  vp9_write_bit(w, xd->segmentation_enabled);
+  if (!xd->segmentation_enabled)
+    return;
+
+  // Segmentation map: update flag, then (if set) the probabilities for it.
+  vp9_write_bit(w, xd->update_mb_segmentation_map);
+#if CONFIG_IMPLICIT_SEGMENTATION
+  vp9_write_bit(w, xd->allow_implicit_segment_update);
+#endif
+  if (xd->update_mb_segmentation_map) {
+    // Select the coding strategy (temporal or spatial)
+    vp9_choose_segmap_coding_method(cpi);
+    // Write out probabilities used to decode unpredicted macro-block segments
+    for (i = 0; i < MB_SEG_TREE_PROBS; i++) {
+      const int prob = xd->mb_segment_tree_probs[i];
+      if (prob != MAX_PROB) {
+        vp9_write_bit(w, 1);
+        vp9_write_prob(w, prob);
+      } else {
+        vp9_write_bit(w, 0);  // prob == MAX_PROB: no value is sent
+      }
+    }
+
+    // Write out the chosen coding method.
+    vp9_write_bit(w, pc->temporal_update);
+    if (pc->temporal_update) {
+      for (i = 0; i < PREDICTION_PROBS; i++) {
+        const int prob = pc->segment_pred_probs[i];
+        if (prob != MAX_PROB) {
+          vp9_write_bit(w, 1);
+          vp9_write_prob(w, prob);
+        } else {
+          vp9_write_bit(w, 0);  // prob == MAX_PROB: no value is sent
+        }
+      }
+    }
+  }
+
+  // Segmentation data: update flag, then (if set) every segment's features.
+  vp9_write_bit(w, xd->update_mb_segmentation_data);
+  // segment_reference_frames(cpi);
+  if (xd->update_mb_segmentation_data) {
+    vp9_write_bit(w, xd->mb_segment_abs_delta);
+
+    for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+      for (j = 0; j < SEG_LVL_MAX; j++) {
+        const int data = vp9_get_segdata(xd, i, j);
+        const int data_max = vp9_seg_feature_data_max(j);
+
+        if (vp9_segfeature_active(xd, i, j)) {
+          vp9_write_bit(w, 1);  // feature j is active for segment i
+
+          if (vp9_is_segfeature_signed(j)) {
+            if (data < 0) {
+              vp9_encode_unsigned_max(w, -data, data_max);
+              vp9_write_bit(w, 1);  // sign: negative
+            } else {
+              vp9_encode_unsigned_max(w, data, data_max);
+              vp9_write_bit(w, 0);  // sign: non-negative
+            }
+          } else {
+            vp9_encode_unsigned_max(w, data, data_max);  // no sign bit
+          }
+        } else {
+          vp9_write_bit(w, 0);  // feature j is not active for segment i
+        }
+      }
+    }
+  }
+}
+
+void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
+  int i;
   VP9_HEADER oh;
   VP9_COMMON *const pc = &cpi->common;
   vp9_writer header_bc, residual_bc;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   int extra_bytes_packed = 0;
 
-  unsigned char *cx_data = dest;
+  uint8_t *cx_data = dest;
 
   oh.show_frame = (int) pc->show_frame;
   oh.type = (int)pc->frame_type;
@@ -1798,60 +1828,7 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
   }
 #endif
 
-  // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled).
-  vp9_write_bit(&header_bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0);
-
-  if (xd->mode_ref_lf_delta_enabled) {
-    // Do the deltas need to be updated
-    vp9_write_bit(&header_bc, xd->mode_ref_lf_delta_update);
-    if (xd->mode_ref_lf_delta_update) {
-      // Send update
-      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
-        const int delta = xd->ref_lf_deltas[i];
-
-        // Frame level data
-        if (delta != xd->last_ref_lf_deltas[i]) {
-          xd->last_ref_lf_deltas[i] = delta;
-          vp9_write_bit(&header_bc, 1);
-
-          if (delta > 0) {
-            vp9_write_literal(&header_bc, delta & 0x3F, 6);
-            vp9_write_bit(&header_bc, 0);  // sign
-          } else {
-            assert(delta < 0);
-            vp9_write_literal(&header_bc, (-delta) & 0x3F, 6);
-            vp9_write_bit(&header_bc, 1);  // sign
-          }
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
-
-      // Send update
-      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
-        const int delta = xd->mode_lf_deltas[i];
-
-        if (delta != xd->last_mode_lf_deltas[i]) {
-          xd->last_mode_lf_deltas[i] = delta;
-          vp9_write_bit(&header_bc, 1);
-
-          if (delta > 0) {
-            vp9_write_literal(&header_bc, delta & 0x3F, 6);
-            vp9_write_bit(&header_bc, 0);  // sign
-          } else {
-            assert(delta < 0);
-            vp9_write_literal(&header_bc, (-delta) & 0x3F, 6);
-            vp9_write_bit(&header_bc, 1);  // sign
-          }
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
-    }
-  }
-
-  // TODO(jkoleszar): remove these unused bits
-  vp9_write_literal(&header_bc, 0, 2);
+  encode_loopfilter(xd, &header_bc);
 
   // Frame Q baseline quantizer index
   vp9_write_literal(&header_bc, pc->base_qindex, QINDEX_BITS);
@@ -1904,9 +1881,10 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
     vp9_write_literal(&header_bc, cpi->gld_fb_idx, NUM_REF_FRAMES_LG2);
     vp9_write_literal(&header_bc, cpi->alt_fb_idx, NUM_REF_FRAMES_LG2);
 
-    // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer)
-    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
-    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
+    // Indicate the sign bias for each reference frame buffer.
+    for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+      vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[LAST_FRAME + i]);
+    }
 
     // Signal whether to allow high MV precision
     vp9_write_bit(&header_bc, (xd->allow_high_precision_mv) ? 1 : 0);
@@ -1960,87 +1938,7 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
     active_section = 7;
 #endif
 
-  // Signal whether or not Segmentation is enabled
-  vp9_write_bit(&header_bc, (xd->segmentation_enabled) ? 1 : 0);
-
-  // Indicate which features are enabled
-  if (xd->segmentation_enabled) {
-    // Indicate whether or not the segmentation map is being updated.
-    vp9_write_bit(&header_bc, (xd->update_mb_segmentation_map) ? 1 : 0);
-#if CONFIG_IMPLICIT_SEGMENTATION
-    vp9_write_bit(&header_bc, (xd->allow_implicit_segment_update) ? 1 : 0);
-#endif
-
-    // If it is, then indicate the method that will be used.
-    if (xd->update_mb_segmentation_map) {
-      // Select the coding strategy (temporal or spatial)
-      vp9_choose_segmap_coding_method(cpi);
-      // Send the tree probabilities used to decode unpredicted
-      // macro-block segments
-      for (i = 0; i < MB_SEG_TREE_PROBS; i++) {
-        const int prob = xd->mb_segment_tree_probs[i];
-        if (prob != 255) {
-          vp9_write_bit(&header_bc, 1);
-          vp9_write_prob(&header_bc, prob);
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
-
-      // Write out the chosen coding method.
-      vp9_write_bit(&header_bc, (pc->temporal_update) ? 1 : 0);
-      if (pc->temporal_update) {
-        for (i = 0; i < PREDICTION_PROBS; i++) {
-          const int prob = pc->segment_pred_probs[i];
-          if (prob != 255) {
-            vp9_write_bit(&header_bc, 1);
-            vp9_write_prob(&header_bc, prob);
-          } else {
-            vp9_write_bit(&header_bc, 0);
-          }
-        }
-      }
-    }
-
-    vp9_write_bit(&header_bc, (xd->update_mb_segmentation_data) ? 1 : 0);
-
-    // segment_reference_frames(cpi);
-
-    if (xd->update_mb_segmentation_data) {
-      vp9_write_bit(&header_bc, (xd->mb_segment_abs_delta) ? 1 : 0);
-
-      // For each segments id...
-      for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-        // For each segmentation codable feature...
-        for (j = 0; j < SEG_LVL_MAX; j++) {
-          const int8_t data = vp9_get_segdata(xd, i, j);
-          const int data_max = vp9_seg_feature_data_max(j);
-
-          // If the feature is enabled...
-          if (vp9_segfeature_active(xd, i, j)) {
-            vp9_write_bit(&header_bc, 1);
-
-            // Is the segment data signed..
-            if (vp9_is_segfeature_signed(j)) {
-              // Encode the relevant feature data
-              if (data < 0) {
-                vp9_encode_unsigned_max(&header_bc, -data, data_max);
-                vp9_write_bit(&header_bc, 1);
-              } else {
-                vp9_encode_unsigned_max(&header_bc, data, data_max);
-                vp9_write_bit(&header_bc, 0);
-              }
-            } else {
-              // Unsigned data element so no sign bit needed
-              vp9_encode_unsigned_max(&header_bc, data, data_max);
-            }
-          } else {
-            vp9_write_bit(&header_bc, 0);
-          }
-        }
-      }
-    }
-  }
+  encode_segmentation(cpi, &header_bc);
 
   // Encode the common prediction model status flag probability updates for
   // the reference frame
@@ -2153,15 +2051,12 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
   vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);
   vp9_copy(cpi->common.fc.pre_bmode_prob, cpi->common.fc.bmode_prob);
   vp9_copy(cpi->common.fc.pre_sub_mv_ref_prob, cpi->common.fc.sub_mv_ref_prob);
-  vp9_copy(cpi->common.fc.pre_mbsplit_prob, cpi->common.fc.mbsplit_prob);
-  vp9_copy(cpi->common.fc.pre_i8x8_mode_prob, cpi->common.fc.i8x8_mode_prob);
   vp9_copy(cpi->common.fc.pre_partition_prob, cpi->common.fc.partition_prob);
   cpi->common.fc.pre_nmvc = cpi->common.fc.nmvc;
 #if CONFIG_COMP_INTERINTRA_PRED
   cpi->common.fc.pre_interintra_prob = cpi->common.fc.interintra_prob;
 #endif
   vp9_zero(cpi->sub_mv_ref_count);
-  vp9_zero(cpi->mbsplit_count);
   vp9_zero(cpi->common.fc.mv_ref_ct);
 
   update_coef_probs(cpi, &header_bc);
@@ -2173,9 +2068,6 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
   active_section = 2;
 #endif
 
-  // TODO(jkoleszar): remove this unused bit
-  vp9_write_bit(&header_bc, 1);
-
   vp9_update_skip_probs(cpi);
   for (i = 0; i < MBSKIP_CONTEXTS; ++i) {
     vp9_write_prob(&header_bc, pc->mbskip_pred_probs[i]);
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 2c06457e7..6bc42c7ff 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -29,7 +29,7 @@ typedef struct {
     B_PREDICTION_MODE mode;
     int_mv mv;
     int_mv second_mv;
-  } bmi[16];
+  } bmi[4];
 } PARTITION_INFO;
 
 // Structure to hold snapshot of coding context during the mode picking process
@@ -117,7 +117,6 @@ struct macroblock {
   int mbmode_cost[2][MB_MODE_COUNT];
   int intra_uv_mode_cost[2][MB_MODE_COUNT];
   int bmode_costs[VP9_KF_BINTRAMODES][VP9_KF_BINTRAMODES][VP9_KF_BINTRAMODES];
-  int i8x8_mode_costs[MB_MODE_COUNT];
   int inter_bmode_costs[B_MODE_COUNT];
   int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
                              [VP9_SWITCHABLE_FILTERS];
@@ -141,6 +140,9 @@ struct macroblock {
 
   // Structure to hold context for each of the 4 MBs within a SB:
   // when encoded as 4 independent MBs:
+  PICK_MODE_CONTEXT sb8_context[4][4][4];
+  PICK_MODE_CONTEXT sb8x16_context[4][4][2];
+  PICK_MODE_CONTEXT sb16x8_context[4][4][2];
   PICK_MODE_CONTEXT mb_context[4][4];
   PICK_MODE_CONTEXT sb32x16_context[4][2];
   PICK_MODE_CONTEXT sb16x32_context[4][2];
@@ -157,12 +159,6 @@ struct macroblock {
   void (*fwd_txm16x16)(int16_t *input, int16_t *output, int pitch);
   void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
                          int y_blocks);
-  void (*quantize_b_4x4_pair)(MACROBLOCK *x, int b_idx1, int b_idx2,
-                              int y_blocks);
-  void (*quantize_b_16x16)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
-                           int y_blocks);
-  void (*quantize_b_8x8)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
-                         int y_blocks);
 };
 
 #endif  // VP9_ENCODER_VP9_BLOCK_H_
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 52065df52..6366d382f 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -20,7 +20,6 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/encoder/vp9_segmentation.h"
-#include "vp9/common/vp9_setupintrarecon.h"
 #include "vp9/encoder/vp9_encodeintra.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_invtrans.h"
@@ -47,9 +46,6 @@ int enc_debug = 0;
 
 void vp9_select_interp_filter_type(VP9_COMP *cpi);
 
-static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
-                              int output_enabled, int mi_row, int mi_col);
-
 static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
                               int output_enabled, int mi_row, int mi_col,
                               BLOCK_SIZE_TYPE bsize);
@@ -380,6 +376,8 @@ static void update_state(VP9_COMP *cpi,
     }
   }
   if (bsize < BLOCK_SIZE_SB32X32) {
+    if (bsize < BLOCK_SIZE_MB16X16)
+      ctx->txfm_rd_diff[ALLOW_16X16] = ctx->txfm_rd_diff[ALLOW_8X8];
     ctx->txfm_rd_diff[ALLOW_32X32] = ctx->txfm_rd_diff[ALLOW_16X16];
   }
 
@@ -387,19 +385,10 @@ static void update_state(VP9_COMP *cpi,
     vpx_memcpy(x->partition_info, &ctx->partition_info,
                sizeof(PARTITION_INFO));
 
-    mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
-    mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
-#if CONFIG_SB8X8
-    vpx_memcpy(x->partition_info + mis, &ctx->partition_info,
-               sizeof(PARTITION_INFO));
-    vpx_memcpy(x->partition_info + 1, &ctx->partition_info,
-               sizeof(PARTITION_INFO));
-    vpx_memcpy(x->partition_info + mis + 1, &ctx->partition_info,
-               sizeof(PARTITION_INFO));
-    xd->mode_info_context[1].mbmi =
-    xd->mode_info_context[mis].mbmi =
-    xd->mode_info_context[1 + mis].mbmi = *mbmi;
-#endif
+    mbmi->mv[0].as_int =
+        x->partition_info->bmi[3].mv.as_int;
+    mbmi->mv[1].as_int =
+        x->partition_info->bmi[3].second_mv.as_int;
   }
 
   x->skip = ctx->skip;
@@ -453,7 +442,6 @@ static void update_state(VP9_COMP *cpi,
       THR_D27_PRED /*D27_PRED*/,
       THR_D63_PRED /*D63_PRED*/,
       THR_TM /*TM_PRED*/,
-      THR_I8X8_PRED /*I8X8_PRED*/,
       THR_B_PRED /*I4X4_PRED*/,
     };
     cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++;
@@ -491,11 +479,13 @@ static void update_state(VP9_COMP *cpi,
       mbmi->best_mv.as_int = best_mv.as_int;
       mbmi->best_second_mv.as_int = best_second_mv.as_int;
       vp9_update_nmv_count(cpi, x, &best_mv, &best_second_mv);
-#if CONFIG_SB8X8
-      xd->mode_info_context[1].mbmi =
-      xd->mode_info_context[mis].mbmi =
-      xd->mode_info_context[1 + mis].mbmi = *mbmi;
-#endif
+    }
+
+    if (bsize > BLOCK_SIZE_SB8X8 && mbmi->mode == NEWMV) {
+      int i, j;
+      for (j = 0; j < bh; ++j)
+        for (i = 0; i < bw; ++i)
+          xd->mode_info_context[mis * j + i].mbmi = *mbmi;
     }
 #if CONFIG_COMP_INTERINTRA_PRED
     if (mbmi->mode >= NEARESTMV && mbmi->mode < SPLITMV &&
@@ -567,8 +557,8 @@ static INLINE void set_partition_seg_context(VP9_COMP *cpi,
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
-  xd->above_seg_context = cm->above_seg_context + (mi_col >> CONFIG_SB8X8);
-  xd->left_seg_context  = cm->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);
+  xd->above_seg_context = cm->above_seg_context + (mi_col >> 1);
+  xd->left_seg_context  = cm->left_seg_context + ((mi_row >> 1) & 3);
 }
 
 static void set_offsets(VP9_COMP *cpi,
@@ -580,17 +570,17 @@ static void set_offsets(VP9_COMP *cpi,
   const int dst_fb_idx = cm->new_fb_idx;
   const int idx_str = xd->mode_info_stride * mi_row + mi_col;
   const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
-  const int mb_row = mi_row >> CONFIG_SB8X8;
-  const int mb_col = mi_col >> CONFIG_SB8X8;
+  const int mb_row = mi_row >> 1;
+  const int mb_col = mi_col >> 1;
   const int idx_map = mb_row * cm->mb_cols + mb_col;
   int i;
 
   // entropy context structures
   for (i = 0; i < MAX_MB_PLANE; i++) {
     xd->plane[i].above_context = cm->above_context[i] +
-        (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[i].subsampling_x));
+        (mi_col * 2 >>  xd->plane[i].subsampling_x);
     xd->plane[i].left_context = cm->left_context[i] +
-        (((mi_row * 4 >> CONFIG_SB8X8) & 15) >> xd->plane[i].subsampling_y);
+        (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y);
   }
 
   // partition contexts
@@ -631,13 +621,11 @@ static void set_offsets(VP9_COMP *cpi,
 
   /* segment ID */
   if (xd->segmentation_enabled) {
-    if (xd->update_mb_segmentation_map) {
-      mbmi->segment_id = find_seg_id(cpi->segmentation_map, bsize,
-                                     mi_row, cm->mi_rows, mi_col, cm->mi_cols);
-    } else {
-      mbmi->segment_id = find_seg_id(cm->last_frame_seg_map, bsize,
-                                     mi_row, cm->mi_rows, mi_col, cm->mi_cols);
-    }
+    uint8_t *map = xd->update_mb_segmentation_map ? cpi->segmentation_map
+                                                  : cm->last_frame_seg_map;
+    mbmi->segment_id = find_seg_id(map, bsize, mi_row,
+                                   cm->mi_rows, mi_col, cm->mi_cols);
+
     assert(mbmi->segment_id <= (MAX_MB_SEGMENTS-1));
     vp9_mb_init_quantizer(cpi, x);
 
@@ -655,9 +643,9 @@ static void set_offsets(VP9_COMP *cpi,
       const int p16 = ((mb_row & 1) << 1) +  (mb_col & 1);
       const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1);
       const int tile_progress =
-          cm->cur_tile_mi_col_start * cm->mb_rows >> CONFIG_SB8X8;
+          cm->cur_tile_mi_col_start * cm->mb_rows >> 1;
       const int mb_cols =
-          (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) >> CONFIG_SB8X8;
+          (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) >> 1;
 
       cpi->seg0_progress =
           ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs;
@@ -667,47 +655,6 @@ static void set_offsets(VP9_COMP *cpi,
   }
 }
 
-static int pick_mb_mode(VP9_COMP *cpi,
-                        int mi_row,
-                        int mi_col,
-                        TOKENEXTRA **tp,
-                        int *totalrate,
-                        int *totaldist) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int splitmodes_used = 0;
-  MB_MODE_INFO *mbmi;
-
-  set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_MB16X16);
-
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-    vp9_activity_masking(cpi, x);
-
-  mbmi = &xd->mode_info_context->mbmi;
-  mbmi->sb_type = BLOCK_SIZE_MB16X16;
-
-  // Find best coding mode & reconstruct the MB so it is available
-  // as a predictor for MBs that follow in the SB
-  if (cm->frame_type == KEY_FRAME) {
-    vp9_rd_pick_intra_mode(cpi, x, totalrate, totaldist);
-
-    // Save the coding context
-    vpx_memcpy(&x->mb_context[xd->sb_index][xd->mb_index].mic,
-               xd->mode_info_context, sizeof(MODE_INFO));
-  } else {
-    vp9_pick_mode_inter_macroblock(cpi, x, mi_row, mi_col,
-                                   totalrate, totaldist);
-    splitmodes_used += (mbmi->mode == SPLITMV);
-
-    if (cpi->mb.e_mbd.segmentation_enabled && mbmi->segment_id == 0) {
-      cpi->seg0_idx++;
-    }
-  }
-
-  return splitmodes_used;
-}
-
 static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
                           TOKENEXTRA **tp, int *totalrate, int *totaldist,
                           BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) {
@@ -790,11 +737,10 @@ static void set_block_index(MACROBLOCKD *xd, int idx,
                             BLOCK_SIZE_TYPE bsize) {
   if (bsize >= BLOCK_SIZE_SB32X32) {
     xd->sb_index = idx;
-  } else {
-#if CONFIG_SB8X8
-    assert(bsize >= BLOCK_SIZE_MB16X16);
-#endif
+  } else if (bsize >= BLOCK_SIZE_MB16X16) {
     xd->mb_index = idx;
+  } else {
+    xd->b_index = idx;
   }
 }
 
@@ -817,6 +763,12 @@ static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
       return &x->sb16x32_context[xd->sb_index][xd->mb_index];
     case BLOCK_SIZE_MB16X16:
       return &x->mb_context[xd->sb_index][xd->mb_index];
+    case BLOCK_SIZE_SB16X8:
+      return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_SIZE_SB8X16:
+      return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_SIZE_SB8X8:
+      return &x->sb8_context[xd->sb_index][xd->mb_index][xd->b_index];
     default:
       assert(0);
       return NULL;
@@ -837,14 +789,7 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
     set_block_index(xd, sub_index, bsize);
   set_offsets(cpi, mi_row, mi_col, bsize);
   update_state(cpi, get_block_context(x, bsize), bsize, output_enabled);
-  if (bsize == BLOCK_SIZE_MB16X16) {
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-      vp9_activity_masking(cpi, x);
-
-    encode_macroblock(cpi, tp, output_enabled, mi_row, mi_col);
-  } else {
-    encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
-  }
+  encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
 
   if (output_enabled) {
     update_stats(cpi, mi_row, mi_col);
@@ -857,22 +802,26 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
 static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
                       int mi_row, int mi_col, int output_enabled,
                       BLOCK_SIZE_TYPE level,
-                      BLOCK_SIZE_TYPE c1, BLOCK_SIZE_TYPE c2[4]) {
+                      BLOCK_SIZE_TYPE c1, BLOCK_SIZE_TYPE c2[4],
+                      BLOCK_SIZE_TYPE c3[4][4]
+                      ) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int bsl = mi_width_log2(level), bs = 1 << (bsl - 1);
   const int bwl = mi_width_log2(c1), bhl = mi_height_log2(c1);
-  int pl;
+  int UNINITIALIZED_IS_SAFE(pl);
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  set_partition_seg_context(cpi, mi_row, mi_col);
-  pl = partition_plane_context(xd, level);
+  if (level > BLOCK_SIZE_SB8X8) {
+    set_partition_seg_context(cpi, mi_row, mi_col);
+    pl = partition_plane_context(xd, level);
+  }
 
   if (bsl == bwl && bsl == bhl) {
-    if (output_enabled && level > BLOCK_SIZE_MB16X16)
+    if (output_enabled && level > BLOCK_SIZE_SB8X8)
       cpi->partition_count[pl][PARTITION_NONE]++;
     encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
   } else if (bsl == bhl && bsl > bwl) {
@@ -892,9 +841,11 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
     assert(bwl < bsl && bhl < bsl);
     if (level == BLOCK_SIZE_SB64X64) {
       subsize = BLOCK_SIZE_SB32X32;
-    } else {
-      assert(level == BLOCK_SIZE_SB32X32);
+    } else if (level == BLOCK_SIZE_SB32X32) {
       subsize = BLOCK_SIZE_MB16X16;
+    } else {
+      assert(level == BLOCK_SIZE_MB16X16);
+      subsize = BLOCK_SIZE_SB8X8;
     }
 
     if (output_enabled)
@@ -906,12 +857,12 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
       set_block_index(xd, i, subsize);
       encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
                 output_enabled, subsize,
-                subsize == BLOCK_SIZE_MB16X16 ? c1 : c2[i], c2);
+                c2 ? c2[i] : c1, c3 ? c3[i] : NULL, NULL);
     }
   }
 
-  if (level > BLOCK_SIZE_MB16X16 &&
-      (level == BLOCK_SIZE_SB32X32 || bsl == bwl || bsl == bhl)) {
+  if (level > BLOCK_SIZE_SB8X8 &&
+      (level == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl)) {
     set_partition_seg_context(cpi, mi_row, mi_col);
     update_partition_context(xd, c1, level);
   }
@@ -932,9 +883,11 @@ static void encode_sb_row(VP9_COMP *cpi,
 
   // Code each SB in the row
   for (mi_col = cm->cur_tile_mi_col_start;
-       mi_col < cm->cur_tile_mi_col_end; mi_col += (4 << CONFIG_SB8X8)) {
+       mi_col < cm->cur_tile_mi_col_end; mi_col += 8) {
     int i, p;
+    BLOCK_SIZE_TYPE mb_partitioning[4][4];
     BLOCK_SIZE_TYPE sb_partitioning[4];
+    BLOCK_SIZE_TYPE sb64_partitioning = BLOCK_SIZE_SB32X32;
     int sb64_rate = 0, sb64_dist = 0;
     int sb64_skip = 0;
     ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
@@ -943,23 +896,27 @@ static void encode_sb_row(VP9_COMP *cpi,
 
     for (p = 0; p < MAX_MB_PLANE; p++) {
       memcpy(a + 16 * p, cm->above_context[p] +
-                 (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[p].subsampling_x)),
+                 (mi_col * 2 >> xd->plane[p].subsampling_x),
              sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
       memcpy(l + 16 * p, cm->left_context[p],
              sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
     }
-    memcpy(&seg_a, cm->above_seg_context + (mi_col >> CONFIG_SB8X8),
+    memcpy(&seg_a, cm->above_seg_context + (mi_col >> 1),
            sizeof(seg_a));
     memcpy(&seg_l, cm->left_seg_context, sizeof(seg_l));
+
+    // FIXME(rbultje): this function should probably be rewritten to be
+    // recursive at some point in the future.
     for (i = 0; i < 4; i++) {
-      const int x_idx = (i & 1) << (1 + CONFIG_SB8X8);
-      const int y_idx = (i & 2) << CONFIG_SB8X8;
+      const int x_idx = (i & 1) << 2;
+      const int y_idx = (i & 2) << 1;
       int sb32_rate = 0, sb32_dist = 0;
       int splitmodes_used = 0;
       int sb32_skip = 0;
       int j;
       ENTROPY_CONTEXT l2[8 * MAX_MB_PLANE], a2[8 * MAX_MB_PLANE];
 
+      sb_partitioning[i] = BLOCK_SIZE_MB16X16;
       if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
         continue;
 
@@ -969,22 +926,23 @@ static void encode_sb_row(VP9_COMP *cpi,
       for (p = 0; p < MAX_MB_PLANE; p++) {
         vpx_memcpy(l2 + 8 * p,
                    cm->left_context[p] +
-                       (y_idx * 4 >> (CONFIG_SB8X8 +
-                                      xd->plane[p].subsampling_y)),
+                       (y_idx * 2 >> xd->plane[p].subsampling_y),
                    sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
         vpx_memcpy(a2 + 8 * p,
                    cm->above_context[p] +
-                       ((mi_col + x_idx) * 4 >> (CONFIG_SB8X8 +
-                                                 xd->plane[p].subsampling_x)),
+                       ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
                    sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
       }
 
       /* Encode MBs in raster order within the SB */
-      sb_partitioning[i] = BLOCK_SIZE_MB16X16;
       for (j = 0; j < 4; j++) {
-        const int x_idx_m = x_idx + ((j & 1) << CONFIG_SB8X8);
-        const int y_idx_m = y_idx + ((j >> 1) << CONFIG_SB8X8);
+        const int x_idx_m = x_idx + ((j & 1) << 1);
+        const int y_idx_m = y_idx + ((j >> 1) << 1);
         int r, d;
+        int r2, d2, mb16_rate = 0, mb16_dist = 0, k;
+        ENTROPY_CONTEXT l3[4 * MAX_MB_PLANE], a3[4 * MAX_MB_PLANE];
+
+        mb_partitioning[i][j] = BLOCK_SIZE_SB8X8;
 
         if (mi_row + y_idx_m >= cm->mi_rows ||
             mi_col + x_idx_m >= cm->mi_cols) {
@@ -995,30 +953,165 @@ static void encode_sb_row(VP9_COMP *cpi,
         // Index of the MB in the SB 0..3
         xd->mb_index = j;
 
-        splitmodes_used += pick_mb_mode(cpi, mi_row + y_idx_m,
-                                        mi_col + x_idx_m, tp, &r, &d);
-        sb32_rate += r;
-        sb32_dist += d;
+        for (p = 0; p < MAX_MB_PLANE; p++) {
+          vpx_memcpy(l3 + 4 * p,
+                     cm->left_context[p] +
+                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
+                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
+          vpx_memcpy(a3 + 4 * p,
+                     cm->above_context[p] +
+                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
+                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
+        }
+
+        for (k = 0; k < 4; k++) {
+          xd->b_index = k;
+
+          // try 8x8 coding
+          pick_sb_modes(cpi, mi_row + y_idx_m + (k >> 1),
+                        mi_col + x_idx_m + (k & 1),
+                        tp, &r, &d, BLOCK_SIZE_SB8X8,
+                        &x->sb8_context[xd->sb_index][xd->mb_index]
+                                       [xd->b_index]);
+          mb16_rate += r;
+          mb16_dist += d;
+          update_state(cpi, &x->sb8_context[xd->sb_index][xd->mb_index]
+                                           [xd->b_index],
+                       BLOCK_SIZE_SB8X8, 0);
+          encode_superblock(cpi, tp,
+                            0, mi_row + y_idx_m + (k >> 1),
+                            mi_col + x_idx_m + (k & 1),
+                            BLOCK_SIZE_SB8X8);
+        }
+        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
+        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
+        mb16_rate += x->partition_cost[pl][PARTITION_SPLIT];
+        for (p = 0; p < MAX_MB_PLANE; p++) {
+          vpx_memcpy(cm->left_context[p] +
+                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
+                     l3 + 4 * p,
+                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
+          vpx_memcpy(cm->above_context[p] +
+                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
+                     a3 + 4 * p,
+                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
+        }
+
+        // try 8x16 coding
+        r2 = 0;
+        d2 = 0;
+        xd->b_index = 0;
+        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
+                      tp, &r, &d, BLOCK_SIZE_SB8X16,
+                      &x->sb8x16_context[xd->sb_index][xd->mb_index]
+                                        [xd->b_index]);
+        r2 += r;
+        d2 += d;
+        update_state(cpi, &x->sb8x16_context[xd->sb_index][xd->mb_index]
+                                            [xd->b_index],
+                     BLOCK_SIZE_SB8X16, 0);
+        encode_superblock(cpi, tp,
+                          0, mi_row + y_idx_m, mi_col + x_idx_m,
+                          BLOCK_SIZE_SB8X16);
+        xd->b_index = 1;
+        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m + 1,
+                      tp, &r, &d, BLOCK_SIZE_SB8X16,
+                      &x->sb8x16_context[xd->sb_index][xd->mb_index]
+                                        [xd->b_index]);
+        r2 += r;
+        d2 += d;
+        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
+        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
+        r2 += x->partition_cost[pl][PARTITION_VERT];
+        if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
+                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
+          mb16_rate = r2;
+          mb16_dist = d2;
+          mb_partitioning[i][j] = BLOCK_SIZE_SB8X16;
+        }
+        for (p = 0; p < MAX_MB_PLANE; p++) {
+          vpx_memcpy(cm->left_context[p] +
+                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
+                     l3 + 4 * p,
+                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
+          vpx_memcpy(cm->above_context[p] +
+                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
+                     a3 + 4 * p,
+                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
+        }
+
+        // try 16x8 coding
+        r2 = 0;
+        d2 = 0;
+        xd->b_index = 0;
+        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
+                      tp, &r, &d, BLOCK_SIZE_SB16X8,
+                      &x->sb16x8_context[xd->sb_index][xd->mb_index]
+                                        [xd->b_index]);
+        r2 += r;
+        d2 += d;
+        update_state(cpi, &x->sb16x8_context[xd->sb_index][xd->mb_index]
+                                            [xd->b_index],
+                     BLOCK_SIZE_SB16X8, 0);
+        encode_superblock(cpi, tp,
+                          0, mi_row + y_idx_m, mi_col + x_idx_m,
+                          BLOCK_SIZE_SB16X8);
+        xd->b_index = 1;
+        pick_sb_modes(cpi, mi_row + y_idx_m + 1, mi_col + x_idx_m,
+                      tp, &r, &d, BLOCK_SIZE_SB16X8,
+                      &x->sb16x8_context[xd->sb_index][xd->mb_index]
+                                        [xd->b_index]);
+        r2 += r;
+        d2 += d;
+        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
+        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
+        r2 += x->partition_cost[pl][PARTITION_HORZ];
+        if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
+                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
+          mb16_rate = r2;
+          mb16_dist = d2;
+          mb_partitioning[i][j] = BLOCK_SIZE_SB16X8;
+        }
+        for (p = 0; p < MAX_MB_PLANE; p++) {
+          vpx_memcpy(cm->left_context[p] +
+                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
+                     l3 + 4 * p,
+                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
+          vpx_memcpy(cm->above_context[p] +
+                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
+                     a3 + 4 * p,
+                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
+        }
+
+        // try as 16x16
+        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
+                      tp, &r, &d, BLOCK_SIZE_MB16X16,
+                      &x->mb_context[xd->sb_index][xd->mb_index]);
+        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
+        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
+        r += x->partition_cost[pl][PARTITION_NONE];
+        if (RDCOST(x->rdmult, x->rddiv, r, d) <
+                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
+          mb16_rate = r;
+          mb16_dist = d;
+          mb_partitioning[i][j] = BLOCK_SIZE_MB16X16;
+        }
+        sb32_rate += mb16_rate;
+        sb32_dist += mb16_dist;
 
         // Dummy encode, do not do the tokenization
-#if CONFIG_SB8X8
-        update_state(cpi, &x->mb_context[xd->sb_index][xd->mb_index],
-                     BLOCK_SIZE_MB16X16, 0);
-#endif
-        encode_macroblock(cpi, tp, 0, mi_row + y_idx_m,
-                          mi_col + x_idx_m);
+        encode_sb(cpi, tp, mi_row + y_idx_m, mi_col + x_idx_m, 0,
+                  BLOCK_SIZE_MB16X16, mb_partitioning[i][j], NULL, NULL);
       }
 
       /* Restore L & A coding context to those in place on entry */
       for (p = 0; p < MAX_MB_PLANE; p++) {
         vpx_memcpy(cm->left_context[p] +
-                       (y_idx * 4 >> (CONFIG_SB8X8 +
-                                      xd->plane[p].subsampling_y)),
+                       (y_idx * 2 >> xd->plane[p].subsampling_y),
                    l2 + 8 * p,
                    sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
         vpx_memcpy(cm->above_context[p] +
-                       ((mi_col + x_idx) * 4 >> (CONFIG_SB8X8 +
-                                                 xd->plane[p].subsampling_x)),
+                       ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
                    a2 + 8 * p,
                    sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
       }
@@ -1033,14 +1126,14 @@ static void encode_sb_row(VP9_COMP *cpi,
       }
 
       // check 32x16
-      if (mi_col + x_idx + (2 << CONFIG_SB8X8) <= cm->mi_cols) {
+      if (mi_col + x_idx + 4 <= cm->mi_cols) {
         int r, d;
 
         xd->mb_index = 0;
         pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
                       tp, &r, &d, BLOCK_SIZE_SB32X16,
                       &x->sb32x16_context[xd->sb_index][xd->mb_index]);
-        if (mi_row + y_idx + (1 << CONFIG_SB8X8) < cm->mi_rows) {
+        if (mi_row + y_idx + 2 < cm->mi_rows) {
           int r2, d2;
 
           update_state(cpi, &x->sb32x16_context[xd->sb_index][xd->mb_index],
@@ -1049,7 +1142,7 @@ static void encode_sb_row(VP9_COMP *cpi,
                             0, mi_row + y_idx, mi_col + x_idx,
                             BLOCK_SIZE_SB32X16);
           xd->mb_index = 1;
-          pick_sb_modes(cpi, mi_row + y_idx + (1 << CONFIG_SB8X8),
+          pick_sb_modes(cpi, mi_row + y_idx + 2,
                         mi_col + x_idx, tp, &r2, &d2, BLOCK_SIZE_SB32X16,
                         &x->sb32x16_context[xd->sb_index][xd->mb_index]);
           r += r2;
@@ -1070,27 +1163,25 @@ static void encode_sb_row(VP9_COMP *cpi,
 
         for (p = 0; p < MAX_MB_PLANE; p++) {
           vpx_memcpy(cm->left_context[p] +
-                         (y_idx * 4 >> (CONFIG_SB8X8 +
-                                        xd->plane[p].subsampling_y)),
+                         (y_idx * 2 >> xd->plane[p].subsampling_y),
                      l2 + 8 * p,
                      sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
           vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx) * 4 >> (CONFIG_SB8X8 +
-                                                   xd->plane[p].subsampling_x)),
+                         ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
                      a2 + 8 * p,
                      sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
         }
       }
 
       // check 16x32
-      if (mi_row + y_idx + (2 << CONFIG_SB8X8) <= cm->mi_rows) {
+      if (mi_row + y_idx + 4 <= cm->mi_rows) {
         int r, d;
 
         xd->mb_index = 0;
         pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
                       tp, &r, &d, BLOCK_SIZE_SB16X32,
                       &x->sb16x32_context[xd->sb_index][xd->mb_index]);
-        if (mi_col + x_idx + (1 << CONFIG_SB8X8) < cm->mi_cols) {
+        if (mi_col + x_idx + 2 < cm->mi_cols) {
           int r2, d2;
 
           update_state(cpi, &x->sb16x32_context[xd->sb_index][xd->mb_index],
@@ -1100,7 +1191,7 @@ static void encode_sb_row(VP9_COMP *cpi,
                             BLOCK_SIZE_SB16X32);
           xd->mb_index = 1;
           pick_sb_modes(cpi, mi_row + y_idx,
-                        mi_col + x_idx + (1 << CONFIG_SB8X8),
+                        mi_col + x_idx + 2,
                         tp, &r2, &d2, BLOCK_SIZE_SB16X32,
                         &x->sb16x32_context[xd->sb_index][xd->mb_index]);
           r += r2;
@@ -1121,21 +1212,19 @@ static void encode_sb_row(VP9_COMP *cpi,
 
         for (p = 0; p < MAX_MB_PLANE; p++) {
           vpx_memcpy(cm->left_context[p] +
-                         (y_idx * 4 >> (CONFIG_SB8X8 +
-                                        xd->plane[p].subsampling_y)),
+                         (y_idx * 2 >> xd->plane[p].subsampling_y),
                      l2 + 8 * p,
                      sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
           vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx) * 4 >> (CONFIG_SB8X8 +
-                                                   xd->plane[p].subsampling_x)),
+                         ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
                      a2 + 8 * p,
                      sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
         }
       }
 
       if (!sb32_skip &&
-          mi_col + x_idx + (2 << CONFIG_SB8X8) <= cm->mi_cols &&
-          mi_row + y_idx + (2 << CONFIG_SB8X8) <= cm->mi_rows) {
+          mi_col + x_idx + 4 <= cm->mi_cols &&
+          mi_row + y_idx + 4 <= cm->mi_rows) {
         int r, d;
 
         /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
@@ -1170,18 +1259,19 @@ static void encode_sb_row(VP9_COMP *cpi,
       // instead of small->big) means we can use as threshold for small, which
       // may enable breakouts if RD is not good enough (i.e. faster)
       encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
-                BLOCK_SIZE_SB32X32, sb_partitioning[i], sb_partitioning);
+                BLOCK_SIZE_SB32X32, sb_partitioning[i], mb_partitioning[i],
+                NULL);
     }
 
     for (p = 0; p < MAX_MB_PLANE; p++) {
       memcpy(cm->above_context[p] +
-                 (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[p].subsampling_x)),
+                 (mi_col * 2 >> xd->plane[p].subsampling_x),
              a + 16 * p,
              sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
       memcpy(cm->left_context[p], l + 16 * p,
              sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
     }
-    memcpy(cm->above_seg_context + (mi_col >> CONFIG_SB8X8), &seg_a,
+    memcpy(cm->above_seg_context + (mi_col >> 1), &seg_a,
            sizeof(seg_a));
     memcpy(cm->left_seg_context, &seg_l, sizeof(seg_l));
 
@@ -1190,14 +1280,14 @@ static void encode_sb_row(VP9_COMP *cpi,
     sb64_rate += x->partition_cost[pl][PARTITION_SPLIT];
 
     // check 64x32
-    if (mi_col + (4 << CONFIG_SB8X8) <= cm->mi_cols && !(cm->mb_rows & 1)) {
+    if (mi_col + 8 <= cm->mi_cols && !(cm->mb_rows & 1)) {
       int r, d;
 
       xd->sb_index = 0;
       pick_sb_modes(cpi, mi_row, mi_col,
                     tp, &r, &d, BLOCK_SIZE_SB64X32,
                     &x->sb64x32_context[xd->sb_index]);
-      if (mi_row + (2 << CONFIG_SB8X8) != cm->mi_rows) {
+      if (mi_row + 4 != cm->mi_rows) {
         int r2, d2;
 
         update_state(cpi, &x->sb64x32_context[xd->sb_index],
@@ -1205,7 +1295,7 @@ static void encode_sb_row(VP9_COMP *cpi,
         encode_superblock(cpi, tp,
                           0, mi_row, mi_col, BLOCK_SIZE_SB64X32);
         xd->sb_index = 1;
-        pick_sb_modes(cpi, mi_row + (2 << CONFIG_SB8X8), mi_col,
+        pick_sb_modes(cpi, mi_row + 4, mi_col,
                       tp, &r2, &d2, BLOCK_SIZE_SB64X32,
                       &x->sb64x32_context[xd->sb_index]);
         r += r2;
@@ -1221,12 +1311,12 @@ static void encode_sb_row(VP9_COMP *cpi,
               RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
         sb64_rate = r;
         sb64_dist = d;
-        sb_partitioning[0] = BLOCK_SIZE_SB64X32;
+        sb64_partitioning = BLOCK_SIZE_SB64X32;
       }
 
       for (p = 0; p < MAX_MB_PLANE; p++) {
         memcpy(cm->above_context[p] +
-                   (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[p].subsampling_x)),
+                   (mi_col * 2 >> xd->plane[p].subsampling_x),
                a + 16 * p,
                sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
         memcpy(cm->left_context[p], l + 16 * p,
@@ -1235,14 +1325,14 @@ static void encode_sb_row(VP9_COMP *cpi,
     }
 
     // check 32x64
-    if (mi_row + (4 << CONFIG_SB8X8) <= cm->mi_rows && !(cm->mb_cols & 1)) {
+    if (mi_row + 8 <= cm->mi_rows && !(cm->mb_cols & 1)) {
       int r, d;
 
       xd->sb_index = 0;
       pick_sb_modes(cpi, mi_row, mi_col,
                     tp, &r, &d, BLOCK_SIZE_SB32X64,
                     &x->sb32x64_context[xd->sb_index]);
-      if (mi_col + (2 << CONFIG_SB8X8) != cm->mi_cols) {
+      if (mi_col + 4 != cm->mi_cols) {
         int r2, d2;
 
         update_state(cpi, &x->sb32x64_context[xd->sb_index],
@@ -1250,7 +1340,7 @@ static void encode_sb_row(VP9_COMP *cpi,
         encode_superblock(cpi, tp,
                           0, mi_row, mi_col, BLOCK_SIZE_SB32X64);
         xd->sb_index = 1;
-        pick_sb_modes(cpi, mi_row, mi_col + (2 << CONFIG_SB8X8),
+        pick_sb_modes(cpi, mi_row, mi_col + 4,
                       tp, &r2, &d2, BLOCK_SIZE_SB32X64,
                       &x->sb32x64_context[xd->sb_index]);
         r += r2;
@@ -1266,12 +1356,12 @@ static void encode_sb_row(VP9_COMP *cpi,
               RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
         sb64_rate = r;
         sb64_dist = d;
-        sb_partitioning[0] = BLOCK_SIZE_SB32X64;
+        sb64_partitioning = BLOCK_SIZE_SB32X64;
       }
 
       for (p = 0; p < MAX_MB_PLANE; p++) {
         memcpy(cm->above_context[p] +
-                   (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[p].subsampling_x)),
+                   (mi_col * 2 >> xd->plane[p].subsampling_x),
                a + 16 * p,
                sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
         memcpy(cm->left_context[p], l + 16 * p,
@@ -1280,8 +1370,8 @@ static void encode_sb_row(VP9_COMP *cpi,
     }
 
     if (!sb64_skip &&
-        mi_col + (4 << CONFIG_SB8X8) <= cm->mi_cols &&
-        mi_row + (4 << CONFIG_SB8X8) <= cm->mi_rows) {
+        mi_col + 8 <= cm->mi_cols &&
+        mi_row + 8 <= cm->mi_rows) {
       int r, d;
 
       pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d,
@@ -1295,13 +1385,13 @@ static void encode_sb_row(VP9_COMP *cpi,
               RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
         sb64_rate = r;
         sb64_dist = d;
-        sb_partitioning[0] = BLOCK_SIZE_SB64X64;
+        sb64_partitioning = BLOCK_SIZE_SB64X64;
       }
     }
 
     assert(tp_orig == *tp);
-    encode_sb(cpi, tp, mi_row, mi_col, 1,
-              BLOCK_SIZE_SB64X64, sb_partitioning[0], sb_partitioning);
+    encode_sb(cpi, tp, mi_row, mi_col, 1, BLOCK_SIZE_SB64X64,
+              sb64_partitioning, sb_partitioning, mb_partitioning);
     assert(tp_orig < *tp);
   }
 }
@@ -1333,9 +1423,6 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
                    0, 0, NULL, NULL);
   setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0);
 
-  // set up frame for intra coded blocks
-  vp9_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);
-
   vp9_build_block_offsets(x);
 
   vp9_setup_block_dptrs(&x->e_mbd);
@@ -1346,10 +1433,8 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
   vp9_zero(cpi->count_mb_ref_frame_usage)
   vp9_zero(cpi->bmode_count)
   vp9_zero(cpi->ymode_count)
-  vp9_zero(cpi->i8x8_mode_count)
   vp9_zero(cpi->y_uv_mode_count)
   vp9_zero(cpi->sub_mv_ref_count)
-  vp9_zero(cpi->mbsplit_count)
   vp9_zero(cpi->common.fc.mv_ref_ct)
   vp9_zero(cpi->sb_ymode_count)
   vp9_zero(cpi->partition_count);
@@ -1487,7 +1572,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
           vp9_get_tile_col_offsets(cm, tile_col);
           for (mi_row = cm->cur_tile_mi_row_start;
                mi_row < cm->cur_tile_mi_row_end;
-               mi_row += (4 << CONFIG_SB8X8)) {
+               mi_row += 8) {
             encode_sb_row(cpi, mi_row, &tp, &totalrate);
           }
           cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
@@ -1616,9 +1701,11 @@ static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
     assert(bwl < bsl && bhl < bsl);
     if (bsize == BLOCK_SIZE_SB64X64) {
       subsize = BLOCK_SIZE_SB32X32;
-    } else {
-      assert(bsize == BLOCK_SIZE_SB32X32);
+    } else if (bsize == BLOCK_SIZE_SB32X32) {
       subsize = BLOCK_SIZE_MB16X16;
+    } else {
+      assert(bsize == BLOCK_SIZE_MB16X16);
+      subsize = BLOCK_SIZE_SB8X8;
     }
 
     for (n = 0; n < 4; n++) {
@@ -1638,10 +1725,10 @@ static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
   MODE_INFO *mi, *mi_ptr = cm->mi;
 
   for (mi_row = 0; mi_row < cm->mi_rows;
-       mi_row += (4 << CONFIG_SB8X8), mi_ptr += (4 << CONFIG_SB8X8) * mis) {
+       mi_row += 8, mi_ptr += 8 * mis) {
     mi = mi_ptr;
     for (mi_col = 0; mi_col < cm->mi_cols;
-         mi_col += (4 << CONFIG_SB8X8), mi += (4 << CONFIG_SB8X8)) {
+         mi_col += 8, mi += 8) {
       reset_skip_txfm_size_sb(cpi, mi, txfm_max,
                               mi_row, mi_col, BLOCK_SIZE_SB64X64);
     }
@@ -1823,30 +1910,16 @@ static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
 
     do {
       ++ bct[xd->block[b].bmi.as_mode.first];
-    } while (++b < 16);
-  }
-
-  if (m == I8X8_PRED) {
-    i8x8_modes[xd->block[0].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[2].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[8].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[10].bmi.as_mode.first]++;
+    } while (++b < 4);
   }
 #endif
 
-  if (xd->mode_info_context->mbmi.sb_type > BLOCK_SIZE_MB16X16) {
+  if (xd->mode_info_context->mbmi.sb_type > BLOCK_SIZE_SB8X8) {
     ++cpi->sb_ymode_count[m];
   } else {
     ++cpi->ymode_count[m];
   }
-  if (m != I8X8_PRED)
     ++cpi->y_uv_mode_count[m][uvm];
-  else {
-    cpi->i8x8_mode_count[xd->mode_info_context->bmi[0].as_mode.first]++;
-    cpi->i8x8_mode_count[xd->mode_info_context->bmi[2].as_mode.first]++;
-    cpi->i8x8_mode_count[xd->mode_info_context->bmi[8].as_mode.first]++;
-    cpi->i8x8_mode_count[xd->mode_info_context->bmi[10].as_mode.first]++;
-  }
   if (m == I4X4_PRED) {
     int b = 0;
     do {
@@ -1855,7 +1928,7 @@ static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
       if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
 #endif
       ++cpi->bmode_count[m];
-    } while (++b < 16);
+    } while (++b < 4);
   }
 }
 
@@ -1880,254 +1953,6 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) {
 #endif
 }
 
-static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
-                              int output_enabled,
-                              int mi_row, int mi_col) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *mi = xd->mode_info_context;
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const int mis = cm->mode_info_stride;
-#if CONFIG_SB8X8
-  int n;
-#endif
-
-  assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_MB16X16);
-
-#ifdef ENC_DEBUG
-  enc_debug = (cpi->common.current_video_frame == 11 && cm->show_frame &&
-               mb_row == 8 && mb_col == 0 && output_enabled);
-  if (enc_debug)
-    printf("Encode MB %d %d output %d\n", mb_row, mb_col, output_enabled);
-#endif
-  if (cm->frame_type == KEY_FRAME) {
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM && output_enabled) {
-      // Adjust the zbin based on this MB rate.
-      adjust_act_zbin(cpi, x);
-      vp9_update_zbin_extra(cpi, x);
-    }
-  } else {
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
-
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-      // Adjust the zbin based on this MB rate.
-      adjust_act_zbin(cpi, x);
-    }
-
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    cpi->zbin_mode_boost = 0;
-    if (cpi->zbin_mode_boost_enabled) {
-      if (mbmi->ref_frame != INTRA_FRAME) {
-        if (mbmi->mode == ZEROMV) {
-          if (mbmi->ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (mbmi->mode == SPLITMV)
-          cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST;
-        else
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-      } else {
-        cpi->zbin_mode_boost = INTRA_ZBIN_BOOST;
-      }
-    }
-
-    vp9_update_zbin_extra(cpi, x);
-  }
-
-  if (mbmi->ref_frame == INTRA_FRAME) {
-#if 0  // def ENC_DEBUG
-    if (enc_debug) {
-      printf("Mode %d skip %d tx_size %d\n", mbmi->mode, x->skip,
-             mbmi->txfm_size);
-    }
-#endif
-    if (mbmi->mode == I4X4_PRED) {
-      vp9_encode_intra16x16mbuv(cm, x);
-      vp9_encode_intra4x4mby(x, BLOCK_SIZE_MB16X16);
-    } else if (mbmi->mode == I8X8_PRED) {
-      vp9_encode_intra8x8mby(x);
-      vp9_encode_intra8x8mbuv(x);
-    } else {
-      vp9_encode_intra16x16mbuv(cm, x);
-      vp9_encode_intra16x16mby(cm, x);
-    }
-
-    if (output_enabled)
-      sum_intra_stats(cpi, x);
-  } else {
-    int ref_fb_idx, second_ref_fb_idx;
-#ifdef ENC_DEBUG
-    if (enc_debug)
-      printf("Mode %d skip %d tx_size %d ref %d ref2 %d mv %d %d interp %d\n",
-             mbmi->mode, x->skip, mbmi->txfm_size,
-             mbmi->ref_frame, mbmi->second_ref_frame,
-             mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
-             mbmi->interp_filter);
-#endif
-
-    assert(cm->frame_type != KEY_FRAME);
-
-    if (mbmi->ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-    else if (mbmi->ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-    else
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-
-    if (mbmi->second_ref_frame > 0) {
-      if (mbmi->second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-      else if (mbmi->second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-      else
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-    }
-
-    setup_pre_planes(xd,
-        &cpi->common.yv12_fb[ref_fb_idx],
-        mbmi->second_ref_frame > 0 ? &cpi->common.yv12_fb[second_ref_fb_idx]
-                                   : NULL,
-        mi_row, mi_col, xd->scale_factor, xd->scale_factor_uv);
-
-    if (!x->skip) {
-      vp9_encode_inter16x16(cm, x, mi_row, mi_col);
-    } else {
-      vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_SIZE_MB16X16);
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
-        vp9_build_interintra_predictors(xd,
-                                        xd->plane[0].dst.buf,
-                                        xd->plane[1].dst.buf,
-                                        xd->plane[2].dst.buf,
-                                        xd->plane[0].dst.stride,
-                                        xd->plane[1].dst.stride,
-                                        BLOCK_SIZE_MB16X16);
-      }
-#endif
-    }
-  }
-
-  if (!x->skip) {
-#ifdef ENC_DEBUG
-    if (enc_debug) {
-      int i, j;
-      printf("\n");
-      printf("qcoeff\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", xd->qcoeff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("predictor\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", xd->predictor[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("src_diff\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", x->src_diff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("diff\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", xd->block[0].diff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("final y\n");
-      for (i = 0; i < 16; i++) {
-        for (j = 0; j < 16; j++)
-          printf("%3d ", xd->plane[0].dst.buf[i * xd->plane[0].dst.stride + j]);
-        printf("\n");
-      }
-      printf("\n");
-      printf("final u\n");
-      for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++)
-          printf("%3d ", xd->plane[1].dst.buf[i * xd->plane[1].dst.stride + j]);
-        printf("\n");
-      }
-      printf("\n");
-      printf("final v\n");
-      for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++)
-          printf("%3d ", xd->plane[2].dst.buf[i * xd->plane[1].dst.stride + j]);
-        printf("\n");
-      }
-      fflush(stdout);
-    }
-#endif
-
-    vp9_tokenize_sb(cpi, xd, t, !output_enabled, BLOCK_SIZE_MB16X16);
-  } else {
-    // FIXME(rbultje): not tile-aware (mi - 1)
-    int mb_skip_context =
-      (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff;
-
-    mbmi->mb_skip_coeff = 1;
-    if (output_enabled)
-      cpi->skip_true_count[mb_skip_context]++;
-    vp9_reset_sb_tokens_context(xd, BLOCK_SIZE_MB16X16);
-  }
-
-#if CONFIG_SB8X8
-  // copy skip flag on all mb_mode_info contexts in this SB
-  // if this was a skip at this txfm size
-  for (n = 1; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-    if (mi_col + x_idx < cm->mi_cols && mi_row + y_idx < cm->mi_rows)
-      mi[x_idx + y_idx * mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
-  }
-#endif
-
-  if (output_enabled) {
-    int segment_id = mbmi->segment_id;
-    if (cpi->common.txfm_mode == TX_MODE_SELECT &&
-        !(mbmi->mb_skip_coeff ||
-          vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_SKIP))) {
-      assert(mbmi->txfm_size <= TX_16X16);
-      if (mbmi->mode != I4X4_PRED && mbmi->mode != I8X8_PRED &&
-          mbmi->mode != SPLITMV) {
-        cpi->txfm_count_16x16p[mbmi->txfm_size]++;
-      } else if (mbmi->mode == I8X8_PRED ||
-                 (mbmi->mode == SPLITMV &&
-                  mbmi->partitioning != PARTITIONING_4X4)) {
-        cpi->txfm_count_8x8p[mbmi->txfm_size]++;
-      }
-    } else {
-#if CONFIG_SB8X8
-      int y, x;
-#endif
-      if (mbmi->mode != I4X4_PRED && mbmi->mode != I8X8_PRED &&
-          mbmi->mode != SPLITMV && cpi->common.txfm_mode >= ALLOW_16X16) {
-        mbmi->txfm_size = TX_16X16;
-      } else if (mbmi->mode != I4X4_PRED &&
-                 !(mbmi->mode == SPLITMV &&
-                   mbmi->partitioning == PARTITIONING_4X4) &&
-                 cpi->common.txfm_mode >= ALLOW_8X8) {
-        mbmi->txfm_size = TX_8X8;
-      } else {
-        mbmi->txfm_size = TX_4X4;
-      }
-
-#if CONFIG_SB8X8
-      for (y = 0; y < 2; y++) {
-        for (x = !y; x < 2; x++) {
-          if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) {
-            mi[mis * y + x].mbmi.txfm_size = mbmi->txfm_size;
-          }
-        }
-      }
-#endif
-    }
-  }
-}
-
 static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
                               int output_enabled, int mi_row, int mi_col,
                               BLOCK_SIZE_TYPE bsize) {
@@ -2177,7 +2002,17 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
     vp9_update_zbin_extra(cpi, x);
   }
 
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+  if (xd->mode_info_context->mbmi.mode == I4X4_PRED) {
+    assert(bsize == BLOCK_SIZE_SB8X8 &&
+           xd->mode_info_context->mbmi.txfm_size == TX_4X4);
+
+    vp9_encode_intra4x4mby(x, bsize);
+    vp9_build_intra_predictors_sbuv_s(&x->e_mbd, bsize);
+    vp9_encode_sbuv(cm, x, bsize);
+
+    if (output_enabled)
+      sum_intra_stats(cpi, x);
+  } else if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
     vp9_build_intra_predictors_sby_s(&x->e_mbd, bsize);
     vp9_build_intra_predictors_sbuv_s(&x->e_mbd, bsize);
     if (output_enabled)
@@ -2212,83 +2047,11 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
   }
 
-  if (!x->skip) {
-    vp9_subtract_sb(x, bsize);
-
-    switch (xd->mode_info_context->mbmi.txfm_size) {
-      case TX_32X32:
-        vp9_transform_sby_32x32(x, bsize);
-        vp9_quantize_sby_32x32(x, bsize);
-        if (bsize == BLOCK_SIZE_SB64X64) {
-          vp9_transform_sbuv_32x32(x, bsize);
-          vp9_quantize_sbuv_32x32(x, bsize);
-        } else {
-          vp9_transform_sbuv_16x16(x, bsize);
-          vp9_quantize_sbuv_16x16(x, bsize);
-        }
-        if (x->optimize) {
-          vp9_optimize_sby_32x32(cm, x, bsize);
-          if (bsize == BLOCK_SIZE_SB64X64)
-            vp9_optimize_sbuv_32x32(cm, x, bsize);
-          else
-            vp9_optimize_sbuv_16x16(cm, x, bsize);
-        }
-        vp9_inverse_transform_sby_32x32(xd, bsize);
-        if (bsize == BLOCK_SIZE_SB64X64)
-          vp9_inverse_transform_sbuv_32x32(xd, bsize);
-        else
-          vp9_inverse_transform_sbuv_16x16(xd, bsize);
-        break;
-      case TX_16X16:
-        vp9_transform_sby_16x16(x, bsize);
-        vp9_quantize_sby_16x16(x, bsize);
-        if (bsize >= BLOCK_SIZE_SB32X32) {
-          vp9_transform_sbuv_16x16(x, bsize);
-          vp9_quantize_sbuv_16x16(x, bsize);
-        } else {
-          vp9_transform_sbuv_8x8(x, bsize);
-          vp9_quantize_sbuv_8x8(x, bsize);
-        }
-        if (x->optimize) {
-          vp9_optimize_sby_16x16(cm, x, bsize);
-          if (bsize >= BLOCK_SIZE_SB32X32)
-            vp9_optimize_sbuv_16x16(cm, x, bsize);
-          else
-            vp9_optimize_sbuv_8x8(cm, x, bsize);
-        }
-        vp9_inverse_transform_sby_16x16(xd, bsize);
-        if (bsize >= BLOCK_SIZE_SB32X32)
-          vp9_inverse_transform_sbuv_16x16(xd, bsize);
-        else
-          vp9_inverse_transform_sbuv_8x8(xd, bsize);
-        break;
-      case TX_8X8:
-        vp9_transform_sby_8x8(x, bsize);
-        vp9_transform_sbuv_8x8(x, bsize);
-        vp9_quantize_sby_8x8(x, bsize);
-        vp9_quantize_sbuv_8x8(x, bsize);
-        if (x->optimize) {
-          vp9_optimize_sby_8x8(cm, x, bsize);
-          vp9_optimize_sbuv_8x8(cm, x, bsize);
-        }
-        vp9_inverse_transform_sby_8x8(xd, bsize);
-        vp9_inverse_transform_sbuv_8x8(xd, bsize);
-        break;
-      case TX_4X4:
-        vp9_transform_sby_4x4(x, bsize);
-        vp9_transform_sbuv_4x4(x, bsize);
-        vp9_quantize_sby_4x4(x, bsize);
-        vp9_quantize_sbuv_4x4(x, bsize);
-        if (x->optimize) {
-          vp9_optimize_sby_4x4(cm, x, bsize);
-          vp9_optimize_sbuv_4x4(cm, x, bsize);
-        }
-        vp9_inverse_transform_sby_4x4(xd, bsize);
-        vp9_inverse_transform_sbuv_4x4(xd, bsize);
-        break;
-      default: assert(0);
-    }
-    vp9_recon_sb_c(xd, bsize);
+  if (xd->mode_info_context->mbmi.mode == I4X4_PRED) {
+    assert(bsize == BLOCK_SIZE_SB8X8);
+    vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled, bsize);
+  } else if (!x->skip) {
+    vp9_encode_sb(cm, x, bsize);
     vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled, bsize);
   } else {
     // FIXME(rbultje): not tile-aware (mi - 1)
@@ -2315,8 +2078,10 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
           vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))) {
       if (bsize >= BLOCK_SIZE_SB32X32) {
         cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
-      } else {
+      } else if (bsize >= BLOCK_SIZE_MB16X16) {
         cpi->txfm_count_16x16p[mi->mbmi.txfm_size]++;
+      } else {
+        cpi->txfm_count_8x8p[mi->mbmi.txfm_size]++;
       }
     } else {
       int x, y;
@@ -2324,6 +2089,11 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
 
       if (sz == TX_32X32 && bsize < BLOCK_SIZE_SB32X32)
         sz = TX_16X16;
+      if (sz == TX_16X16 && bsize < BLOCK_SIZE_MB16X16)
+        sz = TX_8X8;
+      if (sz == TX_8X8 && (xd->mode_info_context->mbmi.mode == SPLITMV ||
+                           xd->mode_info_context->mbmi.mode == I4X4_PRED))
+        sz = TX_4X4;
 
       for (y = 0; y < bh; y++) {
         for (x = 0; x < bw; x++) {
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index f6ddca8f4..72a6603f8 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -57,36 +57,37 @@ static void encode_intra4x4block(MACROBLOCK *x, int ib,
       raster_block_offset_int16(xd, bsize, 0, ib,
                                 xd->plane[0].diff);
   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
+  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
 
-  assert(ib < 16);
+  assert(ib < (1 << (bwl + bhl)));
 
 #if CONFIG_NEWBINTRAMODES
   xd->mode_info_context->bmi[ib].as_mode.context =
     vp9_find_bpred_context(&x->e_mbd, ib, dst, xd->plane[0].dst.stride);
 #endif
 
-  vp9_intra4x4_predict(&x->e_mbd, ib,
+  vp9_intra4x4_predict(&x->e_mbd, ib, bsize,
                        xd->mode_info_context->bmi[ib].as_mode.first,
                        dst, xd->plane[0].dst.stride);
-  vp9_subtract_block(4, 4, src_diff, 16,
+  vp9_subtract_block(4, 4, src_diff, 4 << bwl,
                      src, x->plane[0].src.stride,
                      dst, xd->plane[0].dst.stride);
 
   tx_type = get_tx_type_4x4(&x->e_mbd, ib);
   if (tx_type != DCT_DCT) {
-    vp9_short_fht4x4(src_diff, coeff, 16, tx_type);
+    vp9_short_fht4x4(src_diff, coeff, 4 << bwl, tx_type);
     x->quantize_b_4x4(x, ib, tx_type, 16);
     vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                     diff, 16, tx_type);
+                     diff, 4 << bwl, tx_type);
   } else {
-    x->fwd_txm4x4(src_diff, coeff, 32);
+    x->fwd_txm4x4(src_diff, coeff, 8 << bwl);
     x->quantize_b_4x4(x, ib, tx_type, 16);
     vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[0].eobs[ib],
                                 BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                                diff, 32);
+                                diff, 8 << bwl);
   }
 
-  vp9_recon_b(dst, diff, dst, xd->plane[0].dst.stride);
+  vp9_recon_b(dst, diff, 4 << bwl, dst, xd->plane[0].dst.stride);
 }
 
 void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bsize) {
@@ -100,207 +101,14 @@ void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bsize) {
 
 void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x) {
   MACROBLOCKD *xd = &x->e_mbd;
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
   vp9_build_intra_predictors_sby_s(xd, BLOCK_SIZE_MB16X16);
-  vp9_subtract_sby(x, BLOCK_SIZE_MB16X16);
-
-  switch (tx_size) {
-    case TX_16X16:
-      vp9_transform_sby_16x16(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sby_16x16(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sby_16x16(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sby_16x16(xd, BLOCK_SIZE_MB16X16);
-      break;
-    case TX_8X8:
-      vp9_transform_sby_8x8(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sby_8x8(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sby_8x8(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sby_8x8(xd, BLOCK_SIZE_MB16X16);
-      break;
-    default:
-      vp9_transform_sby_4x4(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sby_4x4(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sby_4x4(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sby_4x4(xd, BLOCK_SIZE_MB16X16);
-      break;
-  }
-
-  vp9_recon_sby(xd, BLOCK_SIZE_MB16X16);
+  vp9_encode_sby(cm, x, BLOCK_SIZE_MB16X16);
 }
 
 void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x) {
   MACROBLOCKD *xd = &x->e_mbd;
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
   vp9_build_intra_predictors_sbuv_s(xd, BLOCK_SIZE_MB16X16);
-  vp9_subtract_sbuv(x, BLOCK_SIZE_MB16X16);
-
-  switch (tx_size) {
-    case TX_4X4:
-      vp9_transform_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sbuv_4x4(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sbuv_4x4(xd, BLOCK_SIZE_MB16X16);
-      break;
-    default:  // 16x16 or 8x8
-      vp9_transform_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sbuv_8x8(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sbuv_8x8(xd, BLOCK_SIZE_MB16X16);
-      break;
-    }
-
-  vp9_recon_sbuv(xd, BLOCK_SIZE_MB16X16);
-}
-
-void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  uint8_t* const src =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                x->plane[0].src.buf, x->plane[0].src.stride);
-  int16_t* const src_diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                x->plane[0].src_diff);
-  int16_t* const diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                xd->plane[0].diff);
-  uint8_t* const dst =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                xd->plane[0].dst.buf, xd->plane[0].dst.stride);
-  const int iblock[4] = {0, 1, 4, 5};
-  int i;
-  TX_TYPE tx_type;
-
-  vp9_intra8x8_predict(xd, ib, xd->mode_info_context->bmi[ib].as_mode.first,
-                       dst, xd->plane[0].dst.stride);
-  // generate residual blocks
-  vp9_subtract_block(8, 8, src_diff, 16,
-                     src, x->plane[0].src.stride,
-                     dst, xd->plane[0].dst.stride);
-
-  if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-    int idx = (ib & 0x02) ? (ib + 2) : ib;
-    int16_t* const dqcoeff = BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16);
-    int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, idx, 16);
-
-    assert(idx < 16);
-    tx_type = get_tx_type_8x8(xd, ib);
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(src_diff, coeff, 16, tx_type);
-      x->quantize_b_8x8(x, idx, tx_type, 16);
-      vp9_short_iht8x8(dqcoeff, diff, 16, tx_type);
-    } else {
-      x->fwd_txm8x8(src_diff, coeff, 32);
-      x->quantize_b_8x8(x, idx, DCT_DCT, 16);
-      vp9_short_idct8x8(dqcoeff, diff, 32);
-    }
-  } else {
-    for (i = 0; i < 4; i++) {
-      int idx = ib + iblock[i];
-      int16_t* const dqcoeff = BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16);
-      int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, idx, 16);
-      int16_t* const src_diff =
-          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, idx,
-                                    x->plane[0].src_diff);
-      int16_t* const diff =
-          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, idx,
-                                    xd->plane[0].diff);
-
-      assert(idx < 16);
-      tx_type = get_tx_type_4x4(xd, ib + iblock[i]);
-      if (tx_type != DCT_DCT) {
-        vp9_short_fht4x4(src_diff, coeff, 16, tx_type);
-        x->quantize_b_4x4(x, ib + iblock[i], tx_type, 16);
-        vp9_short_iht4x4(dqcoeff, diff, 16, tx_type);
-      } else if (!(i & 1) &&
-                 get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {
-        x->fwd_txm8x4(src_diff, coeff, 32);
-        x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1, 16);
-        vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[ib + iblock[i]],
-                                    dqcoeff, diff, 32);
-        vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[ib + iblock[i] + 1],
-                                    dqcoeff + 16, diff + 4, 32);
-        i++;
-      } else {
-        x->fwd_txm4x4(src_diff, coeff, 32);
-        x->quantize_b_4x4(x, ib + iblock[i], tx_type, 16);
-        vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[ib + iblock[i]],
-                                    dqcoeff, diff, 32);
-      }
-    }
-  }
-
-  // reconstruct submacroblock
-  for (i = 0; i < 4; i++) {
-    int16_t* const diff =
-        raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib + iblock[i],
-                                  xd->plane[0].diff);
-    uint8_t* const dst =
-        raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib + iblock[i],
-                                  xd->plane[0].dst.buf,
-                                  xd->plane[0].dst.stride);
-    vp9_recon_b_c(dst, diff, dst, xd->plane[0].dst.stride);
-  }
-}
-
-void vp9_encode_intra8x8mby(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 4; i++)
-    vp9_encode_intra8x8(x, vp9_i8x8_block[i]);
-}
-
-static void encode_intra_uv4x4(MACROBLOCK *x, int ib, int mode) {
-  MACROBLOCKD * const xd = &x->e_mbd;
-  int16_t * const dqcoeff = MB_SUBBLOCK_FIELD(xd, dqcoeff, ib);
-  int16_t* const coeff = MB_SUBBLOCK_FIELD(x, coeff, ib);
-  const int plane = ib < 20 ? 1 : 2;
-  const int block = ib < 20 ? ib - 16 : ib - 20;
-  uint8_t* const src =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, plane, block,
-                                x->plane[plane].src.buf,
-                                x->plane[plane].src.stride);
-  int16_t* const src_diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, plane, block,
-                                x->plane[plane].src_diff);
-  int16_t* const diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, plane, block,
-                                xd->plane[plane].diff);
-  uint8_t* const dst =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, plane, block,
-                                xd->plane[plane].dst.buf,
-                                xd->plane[plane].dst.stride);
-
-  assert(ib >= 16 && ib < 24);
-  vp9_intra_uv4x4_predict(&x->e_mbd, ib, mode,
-                          dst, xd->plane[plane].dst.stride);
-
-  assert(xd->plane[1].subsampling_x == 1);
-  vp9_subtract_block(4, 4, src_diff, 8,
-                     src, x->plane[plane].src.stride,
-                     dst, xd->plane[plane].dst.stride);
-
-  x->fwd_txm4x4(src_diff, coeff, 16);
-  x->quantize_b_4x4(x, ib, DCT_DCT, 16);
-  vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[plane].eobs[block],
-                              dqcoeff, diff, 16);
-
-  vp9_recon_uv_b_c(dst, diff, dst, xd->plane[plane].dst.stride);
-}
-
-void vp9_encode_intra8x8mbuv(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    int mode = x->e_mbd.mode_info_context->bmi[vp9_i8x8_block[i]].as_mode.first;
-
-    encode_intra_uv4x4(x, i + 16, mode);  // u
-    encode_intra_uv4x4(x, i + 20, mode);  // v
-  }
+  vp9_encode_sbuv(cm, x, BLOCK_SIZE_MB16X16);
 }
diff --git a/vp9/encoder/vp9_encodeintra.h b/vp9/encoder/vp9_encodeintra.h
index 7ec2f11d4..c26200494 100644
--- a/vp9/encoder/vp9_encodeintra.h
+++ b/vp9/encoder/vp9_encodeintra.h
@@ -17,8 +17,4 @@ int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
 void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bs);
-void vp9_encode_intra8x8mby(MACROBLOCK *x);
-void vp9_encode_intra8x8mbuv(MACROBLOCK *x);
-void vp9_encode_intra8x8(MACROBLOCK *x, int ib);
-
 #endif  // VP9_ENCODER_VP9_ENCODEINTRA_H_
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 0cb1ae958..421052753 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -67,143 +67,6 @@ void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
 }
 
 
-void vp9_transform_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 3, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 3);
-  const int stride = 32 << bwl;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-
-    vp9_short_fdct32x32(x->plane[0].src_diff + y_idx * stride * 32 + x_idx * 32,
-                        x->plane[0].coeff + n * 1024, stride * 2);
-  }
-}
-
-void vp9_transform_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 2);
-  const int stride = 16 << bwl, bstride = 4 << bwl;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    const TX_TYPE tx_type = get_tx_type_16x16(xd,
-                                              (y_idx * bstride + x_idx) * 4);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht16x16(x->plane[0].src_diff +
-                             y_idx * stride * 16 + x_idx * 16,
-                         x->plane[0].coeff + n * 256, stride, tx_type);
-    } else {
-      x->fwd_txm16x16(x->plane[0].src_diff + y_idx * stride * 16 + x_idx * 16,
-                      x->plane[0].coeff + n * 256, stride * 2);
-    }
-  }
-}
-
-void vp9_transform_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 1);
-  const int stride = 8 << bwl, bstride = 2 << bwl;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * bstride + x_idx) * 2);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(x->plane[0].src_diff + y_idx * stride * 8 + x_idx * 8,
-                       x->plane[0].coeff + n * 64, stride, tx_type);
-    } else {
-      x->fwd_txm8x8(x->plane[0].src_diff + y_idx * stride * 8 + x_idx * 8,
-                    x->plane[0].coeff + n * 64, stride * 2);
-    }
-  }
-}
-
-void vp9_transform_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bw = 1 << bwl;
-  const int bh = 1 << b_height_log2(bsize);
-  const int stride = 4 << bwl;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, n);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(x->plane[0].src_diff + y_idx * stride * 4 + x_idx * 4,
-                       x->plane[0].coeff + n * 16, stride, tx_type);
-    } else {
-      x->fwd_txm4x4(x->plane[0].src_diff + y_idx * stride * 4 + x_idx * 4,
-                    x->plane[0].coeff + n * 16, stride * 2);
-    }
-  }
-}
-
-void vp9_transform_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  assert(bsize == BLOCK_SIZE_SB64X64);
-  vp9_clear_system_state();
-  vp9_short_fdct32x32(x->plane[1].src_diff, x->plane[1].coeff, 64);
-  vp9_short_fdct32x32(x->plane[2].src_diff, x->plane[2].coeff, 64);
-}
-
-void vp9_transform_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2, bhl = b_height_log2(bsize) - 2;
-  const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
-  const int stride = 16 << (bwl - 1);
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-
-    x->fwd_txm16x16(x->plane[1].src_diff + y_idx * stride * 16 + x_idx * 16,
-                    x->plane[1].coeff + n * 256, stride * 2);
-    x->fwd_txm16x16(x->plane[2].src_diff + y_idx * stride * 16 + x_idx * 16,
-                    x->plane[2].coeff + n * 256, stride * 2);
-  }
-}
-
-void vp9_transform_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1, bhl = b_height_log2(bsize) - 1;
-  const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
-  const int stride = 8 << (bwl - 1);
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-
-    x->fwd_txm8x8(x->plane[1].src_diff + y_idx * stride * 8 + x_idx * 8,
-                  x->plane[1].coeff + n * 64, stride * 2);
-    x->fwd_txm8x8(x->plane[2].src_diff + y_idx * stride * 8 + x_idx * 8,
-                  x->plane[2].coeff + n * 64, stride * 2);
-  }
-}
-
-void vp9_transform_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-  const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
-  const int stride = 4 << (bwl - 1);
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-
-    x->fwd_txm4x4(x->plane[1].src_diff + y_idx * stride * 4 + x_idx * 4,
-                  x->plane[1].coeff + n * 16, stride * 2);
-    x->fwd_txm4x4(x->plane[2].src_diff + y_idx * stride * 4 + x_idx * 4,
-                  x->plane[2].coeff + n * 16, stride * 2);
-  }
-}
-
 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 #define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 typedef struct vp9_token_state vp9_token_state;
@@ -249,57 +112,53 @@ static int trellis_get_coeff_context(const int *scan,
   return pt;
 }
 
-static void optimize_b(VP9_COMMON *const cm,
-                       MACROBLOCK *mb, int ib, PLANE_TYPE type,
-                       const int16_t *dequant_ptr,
+static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
+                       int plane, int block, BLOCK_SIZE_TYPE bsize,
                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                       int tx_size, int y_blocks) {
+                       TX_SIZE tx_size) {
   const int ref = mb->e_mbd.mode_info_context->mbmi.ref_frame != INTRA_FRAME;
   MACROBLOCKD *const xd = &mb->e_mbd;
   vp9_token_state tokens[1025][2];
   unsigned best_index[1025][2];
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, ib);
-  const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff,
-                                          pb_idx.block, 16);
+  const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff,
+                                          block, 16);
   int16_t *qcoeff_ptr;
   int16_t *dqcoeff_ptr;
-  int eob = xd->plane[pb_idx.plane].eobs[pb_idx.block], final_eob, sz = 0;
+  int eob = xd->plane[plane].eobs[block], final_eob, sz = 0;
   const int i0 = 0;
   int rc, x, next, i;
   int64_t rdmult, rddiv, rd_cost0, rd_cost1;
   int rate0, rate1, error0, error1, t0, t1;
   int best, band, pt;
+  PLANE_TYPE type = xd->plane[plane].plane_type;
   int err_mult = plane_rd_mult[type];
   int default_eob, pad;
   int const *scan, *nb;
   const int mul = 1 + (tx_size == TX_32X32);
   uint8_t token_cache[1024];
+  const int ib = txfrm_block_to_raster_block(xd, bsize, plane,
+                                             block, 2 * tx_size);
+  const int16_t *dequant_ptr = xd->plane[plane].dequant;
 
-  assert((!type && !pb_idx.plane) || (type && pb_idx.plane));
-  dqcoeff_ptr = BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16);
-  qcoeff_ptr = BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16);
+  assert((!type && !plane) || (type && plane));
+  dqcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16);
+  qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
   switch (tx_size) {
     default:
     case TX_4X4: {
-      const TX_TYPE tx_type = get_tx_type_4x4(xd, ib);
+      const TX_TYPE tx_type = plane == 0 ? get_tx_type_4x4(xd, ib) : DCT_DCT;
       default_eob = 16;
       scan = get_scan_4x4(tx_type);
       break;
     }
     case TX_8X8: {
-      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-      const int sz = 1 + b_width_log2(sb_type);
-      const int x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = get_tx_type_8x8(xd, y + (x >> 1));
+      const TX_TYPE tx_type = plane == 0 ? get_tx_type_8x8(xd, ib) : DCT_DCT;
       scan = get_scan_8x8(tx_type);
       default_eob = 64;
       break;
     }
     case TX_16X16: {
-      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-      const int sz = 2 + b_width_log2(sb_type);
-      const int x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = get_tx_type_16x16(xd, y + (x >> 2));
+      const TX_TYPE tx_type = plane == 0 ? get_tx_type_16x16(xd, ib) : DCT_DCT;
       scan = get_scan_16x16(tx_type);
       default_eob = 256;
       break;
@@ -480,272 +339,268 @@ static void optimize_b(VP9_COMMON *const cm,
   }
   final_eob++;
 
-  xd->plane[pb_idx.plane].eobs[pb_idx.block] = final_eob;
+  xd->plane[plane].eobs[block] = final_eob;
   *a = *l = (final_eob > 0);
 }
 
-void vp9_optimize_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT *a = xd->plane[0].above_context;
-  ENTROPY_CONTEXT *l = xd->plane[0].left_context;
-  const int bwl = b_width_log2(bsize) - 3, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 3);
-  ENTROPY_CONTEXT ta[2], tl[2];
-  int n;
-
-  for (n = 0; n < bw; n++, a += 8)
-    ta[n] = (a[0] + a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7]) != 0;
-  for (n = 0; n < bh; n++, l += 8)
-    tl[n] = (l[0] + l[1] + l[2] + l[3] + l[4] + l[5] + l[6] + l[7]) != 0;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-
-    optimize_b(cm, x, n * 64, PLANE_TYPE_Y_WITH_DC, x->e_mbd.plane[0].dequant,
-               ta + x_idx, tl + y_idx, TX_32X32, 64 * bw * bh);
-  }
+struct optimize_block_args {  // optimize_block's void *arg payload
+  VP9_COMMON *cm;
+  MACROBLOCK *x;
+  struct optimize_ctx *ctx;  // forwarded unchanged to vp9_optimize_b
+};
+
+void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                    int ss_txfrm_size, VP9_COMMON *cm, MACROBLOCK *mb,
+                    struct optimize_ctx *ctx) {
+  MACROBLOCKD* const xd = &mb->e_mbd;
+  int x, y;
+  // Optimize one transform block: find its raster (x, y) so the cached
+  // above/left entropy contexts for that position are handed to optimize_b.
+  txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y);
+
+  optimize_b(cm, mb, plane, block, bsize,
+             &ctx->ta[plane][x], &ctx->tl[plane][y],  // updated in place
+             ss_txfrm_size / 2);  // ss_txfrm_size / 2 is the TX_SIZE value
 }
 
-void vp9_optimize_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT *a = xd->plane[0].above_context;
-  ENTROPY_CONTEXT *l = xd->plane[0].left_context;
-  const int bwl = b_width_log2(bsize) - 2, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 2);
-  ENTROPY_CONTEXT ta[4], tl[4];
-  int n;
-
-  for (n = 0; n < bw; n++, a += 4)
-    ta[n] = (a[0] + a[1] + a[2] + a[3]) != 0;
-  for (n = 0; n < bh; n++, l += 4)
-    tl[n] = (l[0] + l[1] + l[2] + l[3]) != 0;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-
-    optimize_b(cm, x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.plane[0].dequant,
-               ta + x_idx, tl + y_idx, TX_16X16, 16 * bw * bh);
-  }
+static void optimize_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                           int ss_txfrm_size, void *arg) {  // block callback
+  const struct optimize_block_args* const args = arg;  // unpack the payload
+  vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, args->x,
+                 args->ctx);
 }
 
-void vp9_optimize_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                          BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT *a = xd->plane[0].above_context;
-  ENTROPY_CONTEXT *l = xd->plane[0].left_context;
-  const int bwl = b_width_log2(bsize) - 1, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 1);
-  ENTROPY_CONTEXT ta[8], tl[8];
-  int n;
-
-  for (n = 0; n < bw; n++, a += 2)
-    ta[n] = (a[0] + a[1]) != 0;
-  for (n = 0; n < bh; n++, l += 2)
-    tl[n] = (l[0] + l[1]) != 0;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-
-    optimize_b(cm, x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.plane[0].dequant,
-               ta + x_idx, tl + y_idx, TX_8X8, 4 * bw * bh);
+void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+                       struct optimize_ctx *ctx) {
+  int p;
+  // Fill ctx->ta / ctx->tl for every plane from the above/left contexts.
+  for (p = 0; p < MAX_MB_PLANE; p++) {
+    const struct macroblockd_plane* const plane = &xd->plane[p];
+    const int bwl = b_width_log2(bsize) - plane->subsampling_x;
+    const int bhl = b_height_log2(bsize) - plane->subsampling_y;
+    const TX_SIZE tx_size = tx_size_for_plane(xd, bsize, p);
+    int i, j;
+    // One entry per transform: OR the contexts it spans until nonzero.
+    for (i = 0; i < 1 << bwl; i += 1 << tx_size) {
+      int c = 0;
+      ctx->ta[p][i] = 0;
+      for (j = 0; j < 1 << tx_size && !c; j++) {
+        c = ctx->ta[p][i] |= plane->above_context[i + j];
+      }
+    }
+    for (i = 0; i < 1 << bhl; i += 1 << tx_size) {  // same for the left edge
+      int c = 0;
+      ctx->tl[p][i] = 0;
+      for (j = 0; j < 1 << tx_size && !c; j++) {
+        c = ctx->tl[p][i] |= plane->left_context[i + j];
+      }
+    }
   }
 }
 
-void vp9_optimize_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                          BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int bwl = b_width_log2(bsize), bw = 1 << bwl;
-  int bh = 1 << b_height_log2(bsize);
-  ENTROPY_CONTEXT ta[16], tl[16];
-  int n;
+void vp9_optimize_sby(VP9_COMMON *const cm, MACROBLOCK *x,
+                      BLOCK_SIZE_TYPE bsize) {  // optimize plane 0 blocks only
+  struct optimize_ctx ctx;  // seeded by vp9_optimize_init below
+  struct optimize_block_args arg = {cm, x, &ctx};
+  vp9_optimize_init(&x->e_mbd, bsize, &ctx);
+  foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
+                                     optimize_block, &arg);
+}
 
-  vpx_memcpy(ta, xd->plane[0].above_context, sizeof(ENTROPY_CONTEXT) * bw);
-  vpx_memcpy(tl, xd->plane[0].left_context, sizeof(ENTROPY_CONTEXT) * bh);
+void vp9_optimize_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
+                       BLOCK_SIZE_TYPE bsize) {  // UV-plane variant
+  struct optimize_ctx ctx;  // seeded by vp9_optimize_init below
+  struct optimize_block_args arg = {cm, x, &ctx};
+  vp9_optimize_init(&x->e_mbd, bsize, &ctx);
+  foreach_transformed_block_uv(&x->e_mbd, bsize, optimize_block, &arg);
+}
 
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
+struct encode_b_args {  // void *arg payload for xform_quant / encode_block
+  VP9_COMMON *cm;
+  MACROBLOCK *x;
+  struct optimize_ctx *ctx;  // NULL from the vp9_xform_quant_* callers
+};
 
-    optimize_b(cm, x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.plane[0].dequant,
-               ta + x_idx, tl + y_idx, TX_4X4, bh * bw);
+static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                         int ss_txfrm_size, void *arg) {
+  struct encode_b_args* const args = arg;
+  MACROBLOCK* const x = args->x;
+  MACROBLOCKD* const xd = &x->e_mbd;
+  const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
+  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
+                                                       block, ss_txfrm_size);
+  int16_t* const src_diff = raster_block_offset_int16(xd, bsize, plane,
+                                                      raster_block,
+                                                      x->plane[plane].src_diff);
+  TX_TYPE tx_type = DCT_DCT;  // kept for TX_32X32 and for plane != 0
+  // Forward-transform the block's src_diff into coeff, then quantize it.
+  switch (ss_txfrm_size / 2) {
+    case TX_32X32:
+      vp9_short_fdct32x32(src_diff,
+                          BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                          bw * 2);
+      break;
+    case TX_16X16:
+      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+      if (tx_type != DCT_DCT) {
+        vp9_short_fht16x16(src_diff,
+                           BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                           bw, tx_type);  // NOTE(review): bw vs bw * 2 below
+      } else {
+        x->fwd_txm16x16(src_diff,
+                        BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                        bw * 2);
+      }
+      break;
+    case TX_8X8:
+      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+      if (tx_type != DCT_DCT) {
+        vp9_short_fht8x8(src_diff,
+                           BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                           bw, tx_type);
+      } else {
+        x->fwd_txm8x8(src_diff,
+                      BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                      bw * 2);
+      }
+      break;
+    case TX_4X4:
+      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+      if (tx_type != DCT_DCT) {
+        vp9_short_fht4x4(src_diff,
+                           BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                           bw, tx_type);
+      } else {
+        x->fwd_txm4x4(src_diff,
+                      BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                      bw * 2);
+      }
+      break;
+    default:
+      assert(0);  // unexpected transform size
   }
-}
 
-void vp9_optimize_sbuv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                             BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int b;
-
-  assert(bsize == BLOCK_SIZE_SB64X64);
-  for (b = 256; b < 384; b += 64) {
-    const int plane = 1 + (b >= 320);
-    ENTROPY_CONTEXT *a = xd->plane[plane].above_context;
-    ENTROPY_CONTEXT *l = xd->plane[plane].left_context;
-    ENTROPY_CONTEXT a_ec, l_ec;
-
-    a_ec = (a[0] + a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7]) != 0;
-    l_ec = (l[0] + l[1] + l[2] + l[3] + l[4] + l[5] + l[6] + l[7]) != 0;
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.plane[plane].dequant,
-               &a_ec, &l_ec, TX_32X32, 256);
-  }
+  vp9_quantize(x, plane, block, 16 << ss_txfrm_size, tx_type);
 }
 
-void vp9_optimize_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                             BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int bwl = b_width_log2(bsize) - 2, bhl = b_height_log2(bsize) - 2;
-  const int bw = 1 << (bwl - 1);
-  const int bh = 1 << (bhl - 1);
-  int uvoff = 16 << (bwl + bhl);
-  int plane, n;
-
-  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
-    ENTROPY_CONTEXT ta[2], *a = xd->plane[plane].above_context;
-    ENTROPY_CONTEXT tl[2], *l = xd->plane[plane].left_context;
-
-    for (n = 0; n < bw; n++, a += 4)
-      ta[n] = (a[0] + a[1] + a[2] + a[3]) != 0;
-    for (n = 0; n < bh; n++, l += 4)
-      tl[n] = (l[0] + l[1] + l[2] + l[3]) != 0;
-
-    for (n = 0; n < bw * bh; n++) {
-      const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-      optimize_b(cm, x, uvoff + n * 16, PLANE_TYPE_UV,
-                 x->e_mbd.plane[plane].dequant,
-                 &ta[x_idx], &tl[y_idx],
-                 TX_16X16, bh * bw * 64);
-    }
-    uvoff = (uvoff * 5) >> 2;  // switch u -> v
+static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                         int ss_txfrm_size, void *arg) {
+  struct encode_b_args* const args = arg;
+  MACROBLOCK* const x = args->x;
+  MACROBLOCKD* const xd = &x->e_mbd;
+  const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
+  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
+                                                       block, ss_txfrm_size);
+  int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
+                                                  raster_block,
+                                                  xd->plane[plane].diff);
+  TX_TYPE tx_type = DCT_DCT;
+  // Transform + quantize, optionally optimize, then inverse-transform.
+  xform_quant(plane, block, bsize, ss_txfrm_size, arg);
+
+  if (x->optimize)
+    vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, x, args->ctx);
+  // Inverse transform of dqcoeff back into the diff buffer, by size.
+  switch (ss_txfrm_size / 2) {
+    case TX_32X32:
+      vp9_short_idct32x32(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                          diff, bw * 2);
+      break;
+    case TX_16X16:
+      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT) {
+        vp9_short_idct16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                            diff, bw * 2);
+      } else {
+        vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                           diff, bw, tx_type);
+      }
+      break;
+    case TX_8X8:
+      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT) {
+        vp9_short_idct8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                          diff, bw * 2);
+      } else {
+        vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                         diff, bw, tx_type);
+      }
+      break;
+    case TX_4X4:
+      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT) {
+        // this is like vp9_short_idct4x4 but has a special case around eob<=1
+        // which is significant (not just an optimization) for the lossless
+        // case.
+        vp9_inverse_transform_b_4x4(xd, xd->plane[plane].eobs[block],
+            BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), diff, bw * 2);
+      } else {
+        vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                         diff, bw, tx_type);
+      }
+      break;
   }
 }
 
-void vp9_optimize_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                           BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int bwl = b_width_log2(bsize) - 1, bhl = b_height_log2(bsize) - 1;
-  const int bw = 1 << (bwl - 1);
-  const int bh = 1 << (bhl - 1);
-  int uvoff = 4 << (bwl + bhl);
-  int plane, n;
-
-  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
-    ENTROPY_CONTEXT ta[4], *a = xd->plane[plane].above_context;
-    ENTROPY_CONTEXT tl[4], *l = xd->plane[plane].left_context;
-
-    for (n = 0; n < bw; n++, a += 2)
-      ta[n] = (a[0] + a[1]) != 0;
-    for (n = 0; n < bh; n++, l += 2)
-      tl[n] = (l[0] + l[1]) != 0;
-
-    for (n = 0; n < bw * bh; n++) {
-      const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-      optimize_b(cm, x, uvoff + n * 4, PLANE_TYPE_UV,
-                 x->e_mbd.plane[plane].dequant,
-                 &ta[x_idx], &tl[y_idx],
-                 TX_8X8, bh * bw * 16);
-    }
-    uvoff = (uvoff * 5) >> 2;  // switch u -> v
-  }
+void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
+                         BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct encode_b_args arg = {cm, x, NULL};  // xform_quant never reads ctx
+  // Transform and quantize every plane 0 block; no optimize or recon here.
+  foreach_transformed_block_in_plane(xd, bsize, 0,
+                                     xform_quant, &arg);
 }
 
-void vp9_optimize_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                           BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-  const int bw = 1 << (bwl - 1);
-  const int bh = 1 << (bhl - 1);
-  int uvoff = 1 << (bwl + bhl);
-  int plane, n;
-
-  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
-    ENTROPY_CONTEXT ta[8], tl[8];
-
-    vpx_memcpy(ta, xd->plane[plane].above_context,
-               sizeof(ENTROPY_CONTEXT) * bw);
-    vpx_memcpy(tl, xd->plane[plane].left_context,
-               sizeof(ENTROPY_CONTEXT) * bh);
-
-    for (n = 0; n < bw * bh; n++) {
-      const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-      optimize_b(cm, x, uvoff + n, PLANE_TYPE_UV,
-                 x->e_mbd.plane[plane].dequant,
-                 &ta[x_idx], &tl[y_idx],
-                 TX_4X4, bh * bw * 4);
-    }
-    uvoff = (uvoff * 5) >> 2;  // switch u -> v
-  }
+void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
+                         BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct encode_b_args arg = {cm, x, NULL};  // xform_quant never reads ctx
+  // Transform and quantize via the _uv iterator; no optimize or recon here.
+  foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);
 }
 
-void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-
-  if (tx_size == TX_16X16) {
-    vp9_transform_sby_16x16(x, BLOCK_SIZE_MB16X16);
-    vp9_transform_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-    vp9_quantize_sby_16x16(x, BLOCK_SIZE_MB16X16);
-    vp9_quantize_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-    if (x->optimize) {
-      vp9_optimize_sby_16x16(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_optimize_sbuv_8x8(cm, x, BLOCK_SIZE_MB16X16);
-    }
-    vp9_inverse_transform_sby_16x16(xd, BLOCK_SIZE_MB16X16);
-    vp9_inverse_transform_sbuv_8x8(xd, BLOCK_SIZE_MB16X16);
-  } else if (tx_size == TX_8X8) {
-    vp9_transform_sby_8x8(x, BLOCK_SIZE_MB16X16);
-    vp9_quantize_sby_8x8(x, BLOCK_SIZE_MB16X16);
-    if (x->optimize)
-      vp9_optimize_sby_8x8(cm, x, BLOCK_SIZE_MB16X16);
-    vp9_inverse_transform_sby_8x8(xd, BLOCK_SIZE_MB16X16);
-    if (xd->mode_info_context->mbmi.mode == SPLITMV) {
-      assert(xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4);
-      vp9_transform_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sbuv_4x4(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sbuv_4x4(xd, BLOCK_SIZE_MB16X16);
-    } else {
-      vp9_transform_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sbuv_8x8(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sbuv_8x8(xd, BLOCK_SIZE_MB16X16);
-    }
-  } else {
-    vp9_transform_sby_4x4(x, BLOCK_SIZE_MB16X16);
-    vp9_transform_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-    vp9_quantize_sby_4x4(x, BLOCK_SIZE_MB16X16);
-    vp9_quantize_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-    if (x->optimize) {
-      vp9_optimize_sby_4x4(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_optimize_sbuv_4x4(cm, x, BLOCK_SIZE_MB16X16);
-    }
-    vp9_inverse_transform_sby_4x4(xd, BLOCK_SIZE_MB16X16);
-    vp9_inverse_transform_sbuv_4x4(xd, BLOCK_SIZE_MB16X16);
-  }
+void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x,
+                    BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct optimize_ctx ctx;  // only initialized and read when x->optimize
+  struct encode_b_args arg = {cm, x, &ctx};
+  // Plane 0 pipeline: subtract, encode_block per block, then reconstruct.
+  vp9_subtract_sby(x, bsize);
+  if (x->optimize)
+    vp9_optimize_init(xd, bsize, &ctx);
+
+  foreach_transformed_block_in_plane(xd, bsize, 0,
+                                     encode_block, &arg);
+
+  vp9_recon_sby(xd, bsize);
 }
 
-void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                           int mi_row, int mi_col) {
-  MACROBLOCKD *const xd = &x->e_mbd;
+void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
+                     BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct optimize_ctx ctx;  // only initialized and read when x->optimize
+  struct encode_b_args arg = {cm, x, &ctx};
+  // UV pipeline: subtract, encode_block per block, then reconstruct.
+  vp9_subtract_sbuv(x, bsize);
+  if (x->optimize)
+    vp9_optimize_init(xd, bsize, &ctx);
+
+  foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
 
-  vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_SIZE_MB16X16);
-  vp9_subtract_sb(x, BLOCK_SIZE_MB16X16);
-  vp9_fidct_mb(cm, x);
-  vp9_recon_sb(xd, BLOCK_SIZE_MB16X16);
+  vp9_recon_sbuv(xd, bsize);
 }
 
-/* this function is used by first pass only */
-void vp9_encode_inter16x16y(MACROBLOCK *x, int mi_row, int mi_col) {
-  MACROBLOCKD *xd = &x->e_mbd;
+void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
+                   BLOCK_SIZE_TYPE bsize) {  // subtract, encode blocks, recon
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct optimize_ctx ctx;  // only initialized and read when x->optimize
+  struct encode_b_args arg = {cm, x, &ctx};
 
-  vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_SIZE_MB16X16);
-  vp9_subtract_sby(x, BLOCK_SIZE_MB16X16);
+  vp9_subtract_sb(x, bsize);
+  if (x->optimize)
+    vp9_optimize_init(xd, bsize, &ctx);
 
-  vp9_transform_sby_4x4(x, BLOCK_SIZE_MB16X16);
-  vp9_quantize_sby_4x4(x, BLOCK_SIZE_MB16X16);
-  vp9_inverse_transform_sby_4x4(xd, BLOCK_SIZE_MB16X16);
+  foreach_transformed_block(xd, bsize, encode_block, &arg);
 
-  vp9_recon_sby(xd, BLOCK_SIZE_MB16X16);
+  vp9_recon_sb(xd, bsize);
 }
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index da134a86b..afbe4466b 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -22,39 +22,29 @@ typedef struct {
   MV_REFERENCE_FRAME second_ref_frame;
 } MODE_DEFINITION;
 
-
-struct VP9_ENCODER_RTCD;
-void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                           int mb_row, int mb_col);
-
-void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col);
-
-void vp9_transform_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_optimize_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_optimize_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_optimize_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
+struct optimize_ctx {  // entropy-context cache used by vp9_optimize_b
+  ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];  // from above_context, per plane
+  ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];  // from left_context, per plane
+};
+void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+                       struct optimize_ctx *ctx);  // fills *ctx
+void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                    int ss_txfrm_size, VP9_COMMON *cm, MACROBLOCK *x,
+                    struct optimize_ctx *ctx);
+void vp9_optimize_sby(VP9_COMMON *const cm, MACROBLOCK *x,
+                      BLOCK_SIZE_TYPE bsize);
+void vp9_optimize_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
+                       BLOCK_SIZE_TYPE bsize);
+// Subtract, encode each transform block, then reconstruct.
+void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
+                     BLOCK_SIZE_TYPE bsize);
+// Forward transform and quantize only (no optimize, no reconstruction).
+void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
+                        BLOCK_SIZE_TYPE bsize);
+void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                           BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_optimize_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                          BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_optimize_sbuv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                             BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_optimize_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                             BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_optimize_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                           BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_optimize_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                           BLOCK_SIZE_TYPE bsize);
-
-void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x);
 
 void vp9_subtract_block(int rows, int cols,
                         int16_t *diff_ptr, int diff_stride,
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index e4d68630d..a1898af48 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -14,7 +14,6 @@
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_variance.h"
 #include "vp9/encoder/vp9_encodeintra.h"
-#include "vp9/common/vp9_setupintrarecon.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vpx_scale/vpx_scale.h"
@@ -247,8 +246,8 @@ static void avg_stats(FIRSTPASS_STATS *section) {
 
 // Calculate a modified Error used in distributing bits between easier and harder frames
 static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  double av_err = (cpi->twopass.total_stats->ssim_weighted_pred_err /
-                   cpi->twopass.total_stats->count);
+  double av_err = (cpi->twopass.total_stats.ssim_weighted_pred_err /
+                   cpi->twopass.total_stats.count);
   double this_err = this_frame->ssim_weighted_pred_err;
   double modified_err;
 
@@ -328,7 +327,7 @@ static int frame_max_bits(VP9_COMP *cpi) {
   // For VBR base this on the bits and frames left plus the
   // two_pass_vbrmax_section rate passed in by the user.
   max_bits = (int) (((double) cpi->twopass.bits_left
-      / (cpi->twopass.total_stats->count - (double) cpi->common
+      / (cpi->twopass.total_stats.count - (double) cpi->common
              .current_video_frame))
                     * ((double) cpi->oxcf.two_pass_vbrmax_section / 100.0));
 
@@ -340,11 +339,11 @@ static int frame_max_bits(VP9_COMP *cpi) {
 }
 
 void vp9_init_first_pass(VP9_COMP *cpi) {
-  zero_stats(cpi->twopass.total_stats);
+  zero_stats(&cpi->twopass.total_stats);
 }
 
 void vp9_end_first_pass(VP9_COMP *cpi) {
-  output_stats(cpi, cpi->output_pkt_list, cpi->twopass.total_stats);
+  output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
 }
 
 static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset) {
@@ -486,8 +485,6 @@ void vp9_first_pass(VP9_COMP *cpi) {
 
   vp9_setup_block_dptrs(&x->e_mbd);
 
-  // set up frame new frame for intra coded blocks
-  vp9_setup_intra_recon(new_yv12);
   vp9_frame_init_quantizer(cpi);
 
   // Initialise the MV cost table to the defaults
@@ -521,9 +518,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
       int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
 
       set_mi_row_col(cm, xd,
-                     mb_row << CONFIG_SB8X8,
+                     mb_row << 1,
                      1 << mi_height_log2(BLOCK_SIZE_MB16X16),
-                     mb_col << CONFIG_SB8X8,
+                     mb_col << 1,
                      1 << mi_height_log2(BLOCK_SIZE_MB16X16));
 
       xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
@@ -626,7 +623,10 @@ void vp9_first_pass(VP9_COMP *cpi) {
           this_error = motion_error;
           vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
           xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-          vp9_encode_inter16x16y(x, mb_row, mb_col);
+          // Only the Y predictor is built here, so encode just the Y plane.
+          vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1,
+                                         BLOCK_SIZE_MB16X16);
+          vp9_encode_sby(cm, x, BLOCK_SIZE_MB16X16);
           sum_mvr += mv.as_mv.row;
           sum_mvr_abs += abs(mv.as_mv.row);
           sum_mvc += mv.as_mv.col;
@@ -751,20 +751,20 @@ void vp9_first_pass(VP9_COMP *cpi) {
                             - cpi->source->ts_start);
 
     // don't want to do output stats with a stack variable!
-    memcpy(cpi->twopass.this_frame_stats,
+    memcpy(&cpi->twopass.this_frame_stats,
            &fps,
            sizeof(FIRSTPASS_STATS));
-    output_stats(cpi, cpi->output_pkt_list, cpi->twopass.this_frame_stats);
-    accumulate_stats(cpi->twopass.total_stats, &fps);
+    output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats);
+    accumulate_stats(&cpi->twopass.total_stats, &fps);
   }
 
   // Copy the previous Last Frame back into gf and and arf buffers if
   // the prediction is good enough... but also dont allow it to lag too far
   if ((cpi->twopass.sr_update_lag > 3) ||
       ((cm->current_video_frame > 0) &&
-       (cpi->twopass.this_frame_stats->pcnt_inter > 0.20) &&
-       ((cpi->twopass.this_frame_stats->intra_error /
-         DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats->coded_error)) >
+       (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) &&
+       ((cpi->twopass.this_frame_stats.intra_error /
+         DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats.coded_error)) >
         2.0))) {
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
     cpi->twopass.sr_update_lag = 1;
@@ -995,7 +995,7 @@ static int estimate_max_q(VP9_COMP *cpi,
   // Give average a chance to settle though.
   // PGW TODO.. This code is broken for the extended Q range
   if ((cpi->ni_frames >
-       ((int)cpi->twopass.total_stats->count >> 8)) &&
+       ((int)cpi->twopass.total_stats.count >> 8)) &&
       (cpi->ni_frames > 25)) {
     adjust_maxq_qrange(cpi);
   }
@@ -1052,8 +1052,8 @@ static int estimate_cq(VP9_COMP *cpi,
   }
 
   // II ratio correction factor for clip as a whole
-  clip_iiratio = cpi->twopass.total_stats->intra_error /
-                 DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats->coded_error);
+  clip_iiratio = cpi->twopass.total_stats.intra_error /
+                 DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error);
   clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
   if (clip_iifactor < 0.80)
     clip_iifactor = 0.80;
@@ -1098,14 +1098,14 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
   if (two_pass_min_rate < lower_bounds_min_rate)
     two_pass_min_rate = lower_bounds_min_rate;
 
-  zero_stats(cpi->twopass.total_stats);
-  zero_stats(cpi->twopass.total_left_stats);
+  zero_stats(&cpi->twopass.total_stats);
+  zero_stats(&cpi->twopass.total_left_stats);
 
   if (!cpi->twopass.stats_in_end)
     return;
 
-  *cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
-  *cpi->twopass.total_left_stats = *cpi->twopass.total_stats;
+  cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
+  cpi->twopass.total_left_stats = cpi->twopass.total_stats;
 
   // each frame can have a different duration, as the frame rate in the source
   // isn't guaranteed to be constant.   The frame rate prior to the first frame
@@ -1113,13 +1113,13 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
   // Its calculated based on the actual durations of all frames from the first
   // pass.
   vp9_new_frame_rate(cpi,
-                     10000000.0 * cpi->twopass.total_stats->count /
-                     cpi->twopass.total_stats->duration);
+                     10000000.0 * cpi->twopass.total_stats.count /
+                     cpi->twopass.total_stats.duration);
 
   cpi->output_frame_rate = cpi->oxcf.frame_rate;
-  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration *
+  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration *
                                      cpi->oxcf.target_bandwidth / 10000000.0);
-  cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration *
+  cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration *
                                       two_pass_min_rate / 10000000.0);
 
   // Calculate a minimum intra value to be used in determining the IIratio
@@ -1145,7 +1145,8 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
       sum_iiratio += IIRatio;
     }
 
-    cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats->count);
+    cpi->twopass.avg_iiratio = sum_iiratio /
+        DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count);
 
     // Reset file position
     reset_fpf_position(cpi, start_pos);
@@ -1828,7 +1829,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
   // This is also important for short clips where there may only be one
   // key frame.
-  if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats->count -
+  if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats.count -
                                           cpi->common.current_video_frame)) {
     cpi->twopass.kf_group_bits =
       (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
@@ -2096,7 +2097,7 @@ static int adjust_active_maxq(int old_maxqi, int new_maxqi) {
 
 void vp9_second_pass(VP9_COMP *cpi) {
   int tmp_q;
-  int frames_left = (int)(cpi->twopass.total_stats->count -
+  int frames_left = (int)(cpi->twopass.total_stats.count -
                           cpi->common.current_video_frame);
 
   FIRSTPASS_STATS this_frame;
@@ -2121,7 +2122,7 @@ void vp9_second_pass(VP9_COMP *cpi) {
 
       est_cq =
         estimate_cq(cpi,
-                    cpi->twopass.total_left_stats,
+                    &cpi->twopass.total_left_stats,
                     (int)(cpi->twopass.bits_left / frames_left));
 
       cpi->cq_target_quality = cpi->oxcf.cq_level;
@@ -2135,7 +2136,7 @@ void vp9_second_pass(VP9_COMP *cpi) {
 
     tmp_q = estimate_max_q(
               cpi,
-              cpi->twopass.total_left_stats,
+              &cpi->twopass.total_left_stats,
               (int)(cpi->twopass.bits_left / frames_left));
 
     cpi->active_worst_quality         = tmp_q;
@@ -2158,15 +2159,15 @@ void vp9_second_pass(VP9_COMP *cpi) {
   // radical adjustments to the allowed quantizer range just to use up a
   // few surplus bits or get beneath the target rate.
   else if ((cpi->common.current_video_frame <
-            (((unsigned int)cpi->twopass.total_stats->count * 255) >> 8)) &&
+            (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) &&
            ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
-            (unsigned int)cpi->twopass.total_stats->count)) {
+            (unsigned int)cpi->twopass.total_stats.count)) {
     if (frames_left < 1)
       frames_left = 1;
 
     tmp_q = estimate_max_q(
               cpi,
-              cpi->twopass.total_left_stats,
+              &cpi->twopass.total_left_stats,
               (int)(cpi->twopass.bits_left / frames_left));
 
     // Make a damped adjustment to active max Q
@@ -2245,7 +2246,7 @@ void vp9_second_pass(VP9_COMP *cpi) {
   cpi->twopass.frames_to_key--;
 
   // Update the total stats remaining structure
-  subtract_stats(cpi->twopass.total_left_stats, &this_frame);
+  subtract_stats(&cpi->twopass.total_left_stats, &this_frame);
 }
 
 static int test_candidate_kf(VP9_COMP *cpi,
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index fe5d114ba..018c86cb9 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -11,7 +11,6 @@
 #include <limits.h>
 #include <vp9/encoder/vp9_encodeintra.h>
 #include <vp9/encoder/vp9_rdopt.h>
-#include <vp9/common/vp9_setupintrarecon.h>
 #include <vp9/common/vp9_blockd.h>
 #include <vp9/common/vp9_reconinter.h>
 #include <vp9/common/vp9_systemdependent.h>
@@ -386,7 +385,6 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
       // goes in segment 0
       if (arf_not_zz[offset + mb_col]) {
         ncnt[0]++;
-#if CONFIG_SB8X8
         cpi->segmentation_map[offset * 4 + 2 * mb_col] = 0;
         cpi->segmentation_map[offset * 4 + 2 * mb_col + 1] = 0;
         cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols] = 0;
@@ -396,11 +394,6 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
         cpi->segmentation_map[offset * 4 + 2 * mb_col + 1] = 1;
         cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols] = 1;
         cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols + 1] = 1;
-#else
-        cpi->segmentation_map[offset + mb_col] = 0;
-      } else {
-        cpi->segmentation_map[offset + mb_col] = 1;
-#endif
         ncnt[1]++;
       }
     }
@@ -419,10 +412,10 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
       cpi->static_mb_pct = 0;
 
     cpi->seg0_cnt = ncnt[0];
-    vp9_enable_segmentation((VP9_PTR) cpi);
+    vp9_enable_segmentation((VP9_PTR)cpi);
   } else {
     cpi->static_mb_pct = 0;
-    vp9_disable_segmentation((VP9_PTR) cpi);
+    vp9_disable_segmentation((VP9_PTR)cpi);
   }
 
   // Free localy allocated storage
diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c
index 7d9462f94..e26daf0c9 100644
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -41,8 +41,6 @@ void vp9_init_mode_costs(VP9_COMP *c) {
                   x->fc.uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
   vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
                   x->kf_uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
-  vp9_cost_tokens(c->mb.i8x8_mode_costs,
-                  x->fc.i8x8_mode_prob, vp9_i8x8_mode_tree);
 
   for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i)
     vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 0af232eed..ffee34eb7 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -280,8 +280,7 @@ static void setup_features(VP9_COMP *cpi) {
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
 
   // Set up default state for MB feature flags
-
-  xd->segmentation_enabled = 0;   // Default segmentation disabled
+  xd->segmentation_enabled = 0;
 
   xd->update_mb_segmentation_map = 0;
   xd->update_mb_segmentation_data = 0;
@@ -333,15 +332,6 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
 
   vpx_free(cpi->mb.pip);
   cpi->mb.pip = 0;
-
-  vpx_free(cpi->twopass.total_stats);
-  cpi->twopass.total_stats = 0;
-
-  vpx_free(cpi->twopass.total_left_stats);
-  cpi->twopass.total_left_stats = 0;
-
-  vpx_free(cpi->twopass.this_frame_stats);
-  cpi->twopass.this_frame_stats = 0;
 }
 
 // Computes a q delta (in "q index" terms) to get from a starting q value
@@ -383,7 +373,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
     xd->update_mb_segmentation_map = 0;
     xd->update_mb_segmentation_data = 0;
 #if CONFIG_IMPLICIT_SEGMENTATION
-  xd->allow_implicit_segment_update = 0;
+    xd->allow_implicit_segment_update = 0;
 #endif
     cpi->static_mb_pct = 0;
 
@@ -399,7 +389,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
     xd->update_mb_segmentation_map = 0;
     xd->update_mb_segmentation_data = 0;
 #if CONFIG_IMPLICIT_SEGMENTATION
-  xd->allow_implicit_segment_update = 0;
+    xd->allow_implicit_segment_update = 0;
 #endif
     cpi->static_mb_pct = 0;
 
@@ -428,9 +418,9 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
       xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
 
     }
-  }
-  // All other frames if segmentation has been enabled
-  else if (xd->segmentation_enabled) {
+  } else if (xd->segmentation_enabled) {
+    // All other frames if segmentation has been enabled
+
     // First normal frame in a valid gf or alt ref group
     if (cpi->common.frames_since_golden == 0) {
       // Set up segment features for normal frames in an arf group
@@ -454,10 +444,10 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
           vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
           vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP);
         }
-      }
-      // Disable segmentation and clear down features if alt ref
-      // is not active for this group
-      else {
+      } else {
+        // Disable segmentation and clear down features if alt ref
+        // is not active for this group
+
         vp9_disable_segmentation((VP9_PTR)cpi);
 
         vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
@@ -467,12 +457,11 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
 
         vp9_clearall_segfeatures(xd);
       }
-    }
+    } else if (cpi->is_src_frame_alt_ref) {
+      // Special case where we are coding over the top of a previous
+      // alt ref frame.
+      // Segment coding disabled for compred testing
 
-    // Special case where we are coding over the top of a previous
-    // alt ref frame.
-    // Segment coding disabled for compred testing
-    else if (cpi->is_src_frame_alt_ref) {
       // Enable ref frame features for segment 0 as well
       vp9_enable_segfeature(xd, 0, SEG_LVL_REF_FRAME);
       vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
@@ -490,9 +479,9 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
       }
       // Enable data udpate
       xd->update_mb_segmentation_data = 1;
-    }
-    // All other frames.
-    else {
+    } else {
+      // All other frames.
+
       // No updates.. leave things as they are.
       xd->update_mb_segmentation_map = 0;
       xd->update_mb_segmentation_data = 0;
@@ -628,7 +617,6 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) {
   sf->thresh_mult[THR_D63_PRED ] += speed_multiplier * 1500;
 
   sf->thresh_mult[THR_B_PRED   ] += speed_multiplier * 2500;
-  sf->thresh_mult[THR_I8X8_PRED] += speed_multiplier * 2500;
 
   sf->thresh_mult[THR_NEWMV    ] += speed_multiplier * 1000;
   sf->thresh_mult[THR_NEWG     ] += speed_multiplier * 1000;
@@ -867,9 +855,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   }
 
   cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;
-  cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair;
-  cpi->mb.quantize_b_8x8      = vp9_regular_quantize_b_8x8;
-  cpi->mb.quantize_b_16x16    = vp9_regular_quantize_b_16x16;
 
   vp9_init_quantizer(cpi);
 
@@ -959,23 +944,6 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
   CHECK_MEM_ERROR(cpi->mb_norm_activity_map,
                   vpx_calloc(sizeof(unsigned int),
                              cm->mb_rows * cm->mb_cols));
-
-  vpx_free(cpi->twopass.total_stats);
-
-  cpi->twopass.total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  vpx_free(cpi->twopass.total_left_stats);
-  cpi->twopass.total_left_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  vpx_free(cpi->twopass.this_frame_stats);
-
-  cpi->twopass.this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  if (!cpi->twopass.total_stats ||
-      !cpi->twopass.total_left_stats ||
-      !cpi->twopass.this_frame_stats)
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate firstpass stats");
 }
 
 
@@ -1647,6 +1615,12 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
   BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
       NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
 
+  BFP(BLOCK_4X8, NULL, vp9_variance4x8, NULL,
+      NULL, NULL, NULL, NULL, NULL, NULL)
+
+  BFP(BLOCK_8X4, NULL, vp9_variance8x4, NULL,
+      NULL, NULL, NULL, NULL, NULL, NULL)
+
   BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
       NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
 
@@ -3326,9 +3300,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
     vp9_copy(cpi->common.fc.uv_mode_counts, cpi->y_uv_mode_count);
     vp9_copy(cpi->common.fc.bmode_counts, cpi->bmode_count);
-    vp9_copy(cpi->common.fc.i8x8_mode_counts, cpi->i8x8_mode_count);
     vp9_copy(cpi->common.fc.sub_mv_ref_counts, cpi->sub_mv_ref_count);
-    vp9_copy(cpi->common.fc.mbsplit_counts, cpi->mbsplit_count);
     vp9_copy(cpi->common.fc.partition_counts, cpi->partition_count);
 #if CONFIG_COMP_INTERINTRA_PRED
     vp9_copy(cpi->common.fc.interintra_counts, cpi->interintra_count);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index aeaf1bda3..cc91ba5d2 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -48,9 +48,9 @@
 #define KEY_FRAME_CONTEXT 5
 
 #if CONFIG_COMP_INTERINTRA_PRED
-#define MAX_MODES 54
+#define MAX_MODES 53
 #else
-#define MAX_MODES 42
+#define MAX_MODES 41
 #endif
 
 #define MIN_THRESHMULT  32
@@ -72,7 +72,6 @@ typedef struct {
   // Stats
   int y_modes[VP9_YMODES];
   int uv_modes[VP9_UV_MODES];
-  int i8x8_modes[VP9_I8X8_MODES];
   int b_modes[B_MODE_COUNT];
   int inter_y_modes[MB_MODE_COUNT];
   int inter_uv_modes[VP9_UV_MODES];
@@ -100,9 +99,7 @@ typedef struct {
   vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
   vp9_prob uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
   vp9_prob bmode_prob[VP9_NKF_BINTRAMODES - 1];
-  vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
   vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-  vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
   vp9_prob partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
 
   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
@@ -207,7 +204,6 @@ typedef enum {
   THR_SPLITA,
 
   THR_B_PRED,
-  THR_I8X8_PRED,
 
   THR_COMP_ZEROLG,
   THR_COMP_NEARESTLG,
@@ -273,10 +269,12 @@ typedef struct {
 } SPEED_FEATURES;
 
 enum BlockSize {
-  BLOCK_16X8 = PARTITIONING_16X8,
-  BLOCK_8X16 = PARTITIONING_8X16,
-  BLOCK_8X8 = PARTITIONING_8X8,
-  BLOCK_4X4 = PARTITIONING_4X4,
+  BLOCK_4X4,
+  BLOCK_4X8,
+  BLOCK_8X4,
+  BLOCK_8X8,
+  BLOCK_8X16,
+  BLOCK_16X8,
   BLOCK_16X16,
   BLOCK_MAX_SEGMENTS,
   BLOCK_32X32 = BLOCK_MAX_SEGMENTS,
@@ -451,9 +449,7 @@ typedef struct VP9_COMP {
   int sb_ymode_count [VP9_I32X32_MODES];
   int ymode_count[VP9_YMODES];        /* intra MB type cts this frame */
   int bmode_count[VP9_NKF_BINTRAMODES];
-  int i8x8_mode_count[VP9_I8X8_MODES];
   int sub_mv_ref_count[SUBMVREF_COUNT][VP9_SUBMVREFS];
-  int mbsplit_count[VP9_NUMMBSPLITS];
   int y_uv_mode_count[VP9_YMODES][VP9_UV_MODES];
   unsigned int partition_count[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 #if CONFIG_COMP_INTERINTRA_PRED
@@ -557,10 +553,10 @@ typedef struct VP9_COMP {
     unsigned int section_intra_rating;
     unsigned int next_iiratio;
     unsigned int this_iiratio;
-    FIRSTPASS_STATS *total_stats;
-    FIRSTPASS_STATS *this_frame_stats;
+    FIRSTPASS_STATS total_stats;
+    FIRSTPASS_STATS this_frame_stats;
     FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
-    FIRSTPASS_STATS *total_left_stats;
+    FIRSTPASS_STATS total_left_stats;
     int first_pass_done;
     int64_t bits_left;
     int64_t clip_bits_total;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 77e19721c..4ed8f6326 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -75,57 +75,52 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
   *eob_ptr = eob + 1;
 }
 
-void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
-                                int y_blocks) {
+void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
+                  TX_TYPE tx_type) {
   MACROBLOCKD *const xd = &mb->e_mbd;
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
-  const int *pt_scan = get_scan_4x4(tx_type);
+  const int mul = n_coeffs == 1024 ? 2 : 1;
+  const int *scan;
+
+  // These contexts may be available in the caller
+  switch (n_coeffs) {
+    case 4 * 4:
+      scan = get_scan_4x4(tx_type);
+      break;
+    case 8 * 8:
+      scan = get_scan_8x8(tx_type);
+      break;
+    case 16 * 16:
+      scan = get_scan_16x16(tx_type);
+      break;
+    default:
+      scan = vp9_default_zig_zag1d_32x32;
+      break;
+  }
 
-  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
-           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
-           16, mb->skip_block,
-           mb->plane[pb_idx.plane].zbin,
-           mb->plane[pb_idx.plane].round,
-           mb->plane[pb_idx.plane].quant,
-           mb->plane[pb_idx.plane].quant_shift,
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
-           xd->plane[pb_idx.plane].dequant,
-           mb->plane[pb_idx.plane].zbin_extra,
-           &xd->plane[pb_idx.plane].eobs[pb_idx.block],
-           pt_scan, 1);
+  quantize(mb->plane[plane].zrun_zbin_boost,
+           BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+           n_coeffs, mb->skip_block,
+           mb->plane[plane].zbin,
+           mb->plane[plane].round,
+           mb->plane[plane].quant,
+           mb->plane[plane].quant_shift,
+           BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+           BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+           xd->plane[plane].dequant,
+           mb->plane[plane].zbin_extra,
+           &xd->plane[plane].eobs[block],
+           scan, mul);
 }
 
-void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
+void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
                                 int y_blocks) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
-  const int *pt_scan = get_scan_8x8(tx_type);
-
-  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
-           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
-           64, mb->skip_block,
-           mb->plane[pb_idx.plane].zbin,
-           mb->plane[pb_idx.plane].round,
-           mb->plane[pb_idx.plane].quant,
-           mb->plane[pb_idx.plane].quant_shift,
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
-           xd->plane[pb_idx.plane].dequant,
-           mb->plane[pb_idx.plane].zbin_extra,
-           &xd->plane[pb_idx.plane].eobs[pb_idx.block],
-           pt_scan, 1);
-}
-
-void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
-                                  int y_blocks) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
-  const int *pt_scan = get_scan_16x16(tx_type);
+  const int *pt_scan = get_scan_4x4(tx_type);
 
   quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
            BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
-           256, mb->skip_block,
+           16, mb->skip_block,
            mb->plane[pb_idx.plane].zbin,
            mb->plane[pb_idx.plane].round,
            mb->plane[pb_idx.plane].quant,
@@ -138,120 +133,6 @@ void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
            pt_scan, 1);
 }
 
-void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx, int y_blocks) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
-
-  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
-           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
-           1024, mb->skip_block,
-           mb->plane[pb_idx.plane].zbin,
-           mb->plane[pb_idx.plane].round,
-           mb->plane[pb_idx.plane].quant,
-           mb->plane[pb_idx.plane].quant_shift,
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
-           xd->plane[pb_idx.plane].dequant,
-           mb->plane[pb_idx.plane].zbin_extra,
-           &xd->plane[pb_idx.plane].eobs[pb_idx.block],
-           vp9_default_zig_zag1d_32x32, 2);
-}
-
-void vp9_quantize_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bw = 1 << (b_width_log2(bsize) - 3);
-  const int bh = 1 << (b_height_log2(bsize) - 3);
-  int n;
-
-  for (n = 0; n < bw * bh; n++)
-    vp9_regular_quantize_b_32x32(x, n * 64, bw * bh * 64);
-}
-
-void vp9_quantize_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 2);
-  const int bstride = 16 << bwl;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    TX_TYPE tx_type = get_tx_type_16x16(&x->e_mbd,
-                                        4 * x_idx + y_idx * bstride);
-    x->quantize_b_16x16(x, n * 16, tx_type, 16 * bw * bh);
-  }
-}
-
-void vp9_quantize_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 1);
-  const int bstride = 4 << bwl;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd,
-                                      2 * x_idx + y_idx * bstride);
-    x->quantize_b_8x8(x, n * 4, tx_type, 4 * bw * bh);
-  }
-}
-
-void vp9_quantize_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bw = 1 << bwl;
-  const int bh = 1 << b_height_log2(bsize);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, n);
-    x->quantize_b_4x4(x, n, tx_type, bw * bh);
-  }
-}
-
-void vp9_quantize_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  assert(bsize == BLOCK_SIZE_SB64X64);
-  vp9_regular_quantize_b_32x32(x, 256, 256);
-  vp9_regular_quantize_b_32x32(x, 320, 256);
-}
-
-void vp9_quantize_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2;
-  const int bhl = b_height_log2(bsize) - 2;
-  const int uoff = 16 << (bhl + bwl);
-  int i;
-
-  for (i = uoff; i < ((uoff * 3) >> 1); i += 16)
-    x->quantize_b_16x16(x, i, DCT_DCT, uoff);
-}
-
-void vp9_quantize_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2;
-  const int bhl = b_height_log2(bsize) - 2;
-  const int uoff = 16 << (bhl + bwl);
-  int i;
-
-  for (i = uoff; i < ((uoff * 3) >> 1); i += 4)
-    x->quantize_b_8x8(x, i, DCT_DCT, uoff);
-}
-
-void vp9_quantize_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2;
-  const int bhl = b_height_log2(bsize) - 2;
-  const int uoff = 16 << (bhl + bwl);
-  int i;
-
-  for (i = uoff; i < ((uoff * 3) >> 1); i++)
-    x->quantize_b_4x4(x, i, DCT_DCT, uoff);
-}
-
-/* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
- * these two C functions if corresponding optimized routine is not available.
- * NEON optimized version implements currently the fast quantization for pair
- * of blocks. */
-void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *x, int b_idx1, int b_idx2,
-                                     int y_blocks) {
-  vp9_regular_quantize_b_4x4(x, b_idx1, DCT_DCT, y_blocks);
-  vp9_regular_quantize_b_4x4(x, b_idx2, DCT_DCT, y_blocks);
-}
-
 static void invert_quant(int16_t *quant, uint8_t *shift, int d) {
   unsigned t;
   int l;
@@ -266,6 +147,7 @@ static void invert_quant(int16_t *quant, uint8_t *shift, int d) {
 void vp9_init_quantizer(VP9_COMP *cpi) {
   int i;
   int quant_val;
+  int quant_uv_val;
   int q;
 
   static const int zbin_boost[16] = { 0,  0,  0,  8,  8,  8, 10, 12,
@@ -293,52 +175,36 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
     cpi->common.uv_dequant[q][0] = quant_val;
     cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7;
 
+    quant_val = vp9_ac_quant(q, 0);
+    cpi->common.y_dequant[q][1] = quant_val;
+    quant_uv_val = vp9_ac_quant(q, cpi->common.uv_ac_delta_q);
+    cpi->common.uv_dequant[q][1] = quant_uv_val;
     // all the 4x4 ac values =;
     for (i = 1; i < 16; i++) {
       int rc = vp9_default_zig_zag1d_4x4[i];
 
-      quant_val = vp9_ac_quant(q, 0);
       invert_quant(cpi->Y1quant[q] + rc, cpi->Y1quant_shift[q] + rc, quant_val);
       cpi->Y1zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
       cpi->Y1round[q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.y_dequant[q][rc] = quant_val;
       cpi->zrun_zbin_boost_y1[q][i] =
           ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
 
-      quant_val = vp9_ac_quant(q, cpi->common.uv_ac_delta_q);
-      invert_quant(cpi->UVquant[q] + rc, cpi->UVquant_shift[q] + rc, quant_val);
-      cpi->UVzbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
-      cpi->UVround[q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.uv_dequant[q][rc] = quant_val;
+      invert_quant(cpi->UVquant[q] + rc, cpi->UVquant_shift[q] + rc,
+                   quant_uv_val);
+      cpi->UVzbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7);
+      cpi->UVround[q][rc] = (qrounding_factor * quant_uv_val) >> 7;
       cpi->zrun_zbin_boost_uv[q][i] =
-          ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
+          ROUND_POWER_OF_TWO(quant_uv_val * zbin_boost[i], 7);
     }
   }
 }
 
 void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
   int i;
-  int qindex;
   MACROBLOCKD *xd = &x->e_mbd;
   int zbin_extra;
   int segment_id = xd->mode_info_context->mbmi.segment_id;
-
-  // Select the baseline MB Q index allowing for any segment level change.
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
-    if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
-      // Abs Value
-      qindex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-    } else {
-      // Delta Value
-      qindex = cpi->common.base_qindex +
-                 vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-
-      // Clamp to valid range
-      qindex = clamp(qindex, 0, MAXQ);
-    }
-  } else {
-    qindex = cpi->common.base_qindex;
-  }
+  const int qindex = vp9_get_qindex(xd, segment_id, cpi->common.base_qindex);
 
   // Y
   zbin_extra = (cpi->common.y_dequant[qindex][1] *
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index fd7a4bb4f..2b1eeabbe 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -22,9 +22,8 @@
 #define prototype_quantize_mb(sym) \
   void (sym)(MACROBLOCK *x)
 
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/vp9_quantize_x86.h"
-#endif
+void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
+                  TX_TYPE tx_type);
 
 void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2,
                                      int y_blocks);
@@ -32,20 +31,6 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
                                 int y_blocks);
 void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
                                 int y_blocks);
-void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
-                                  int y_blocks);
-void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx,
-                                  int y_blocks);
-
-void vp9_quantize_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-
 struct VP9_COMP;
 
 extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q);
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 47252253d..0f84b1a37 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -138,9 +138,7 @@ void vp9_save_coding_context(VP9_COMP *cpi) {
   vp9_copy(cc->sb_ymode_prob, cm->fc.sb_ymode_prob);
   vp9_copy(cc->bmode_prob, cm->fc.bmode_prob);
   vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
-  vp9_copy(cc->i8x8_mode_prob, cm->fc.i8x8_mode_prob);
   vp9_copy(cc->sub_mv_ref_prob, cm->fc.sub_mv_ref_prob);
-  vp9_copy(cc->mbsplit_prob, cm->fc.mbsplit_prob);
   vp9_copy(cc->partition_prob, cm->fc.partition_prob);
 
   // Stats
@@ -198,10 +196,8 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
   vp9_copy(cm->fc.ymode_prob, cc->ymode_prob);
   vp9_copy(cm->fc.sb_ymode_prob, cc->sb_ymode_prob);
   vp9_copy(cm->fc.bmode_prob, cc->bmode_prob);
-  vp9_copy(cm->fc.i8x8_mode_prob, cc->i8x8_mode_prob);
   vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
   vp9_copy(cm->fc.sub_mv_ref_prob, cc->sub_mv_ref_prob);
-  vp9_copy(cm->fc.mbsplit_prob, cc->mbsplit_prob);
   vp9_copy(cm->fc.partition_prob, cc->partition_prob);
 
   // Stats
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 0e85a0c71..90d56b2d2 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -13,8 +13,8 @@
 #include <math.h>
 #include <limits.h>
 #include <assert.h>
-#include "vp9/common/vp9_pragmas.h"
 
+#include "vp9/common/vp9_pragmas.h"
 #include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/encoder/vp9_treewriter.h"
 #include "vp9/encoder/vp9_onyx_int.h"
@@ -34,7 +34,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_encodemv.h"
-
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_entropy.h"
@@ -42,8 +41,6 @@
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_common.h"
 
-#define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
-
 #define INVALID_MV 0x80008000
 
 /* Factor to weigh the rate for switchable interp filters */
@@ -105,7 +102,6 @@ const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   {SPLITMV,   ALTREF_FRAME, NONE},
 
   {I4X4_PRED,    INTRA_FRAME,  NONE},
-  {I8X8_PRED, INTRA_FRAME,  NONE},
 
   /* compound prediction modes */
   {ZEROMV,    LAST_FRAME,   GOLDEN_FRAME},
@@ -155,11 +151,9 @@ static void fill_token_costs(vp9_coeff_count *c,
   for (i = 0; i < BLOCK_TYPES; i++)
     for (j = 0; j < REF_TYPES; j++)
       for (k = 0; k < COEF_BANDS; k++)
-        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
-          vp9_cost_tokens_skip((int *)(c[i][j][k][l]),
-                               p[i][j][k][l],
+        for (l = 0; l < PREV_COEF_CONTEXTS; l++)
+          vp9_cost_tokens_skip((int *)c[i][j][k][l], p[i][j][k][l],
                                vp9_coef_tree);
-        }
 }
 
 static int rd_iifactor[32] =  { 4, 4, 3, 2, 1, 0, 0, 0,
@@ -182,7 +176,7 @@ void vp9_init_me_luts() {
   for (i = 0; i < QINDEX_RANGE; i++) {
     sad_per_bit16lut[i] =
       (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
-    sad_per_bit4lut[i] = (int)((0.063 * vp9_convert_qindex_to_q(i)) + 2.742);
+    sad_per_bit4lut[i] = (int)(0.063 * vp9_convert_qindex_to_q(i) + 2.742);
   }
 }
 
@@ -206,7 +200,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
   // for key frames, golden frames and arf frames.
   // if (cpi->common.refresh_golden_frame ||
   //     cpi->common.refresh_alt_ref_frame)
-  qindex = (qindex < 0) ? 0 : ((qindex > MAXQ) ? MAXQ : qindex);
+  qindex = clamp(qindex, 0, MAXQ);
 
   cpi->RDMULT = compute_rd_mult(qindex);
   if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
@@ -291,7 +285,7 @@ int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) {
 }
 
 static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
-                              int ib, PLANE_TYPE type,
+                              int plane, int block, PLANE_TYPE type,
                               ENTROPY_CONTEXT *A,
                               ENTROPY_CONTEXT *L,
                               TX_SIZE tx_size,
@@ -302,10 +296,9 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
   int c = 0;
   int cost = 0, pad;
   const int *scan, *nb;
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, ib);
-  const int eob = xd->plane[pb_idx.plane].eobs[pb_idx.block];
-  const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff,
-                                           pb_idx.block, 16);
+  const int eob = xd->plane[plane].eobs[block];
+  const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff,
+                                           block, 16);
   const int ref = mbmi->ref_frame != INTRA_FRAME;
   unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
       mb->token_costs[tx_size][type][ref];
@@ -332,7 +325,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
 #endif
 
   // Check for consistency of tx_size with mode info
-  assert((!type && !pb_idx.plane) || (type && pb_idx.plane));
+  assert((!type && !plane) || (type && plane));
   if (type == PLANE_TYPE_Y_WITH_DC) {
     assert(xd->mode_info_context->mbmi.txfm_size == tx_size);
   } else {
@@ -343,7 +336,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
   switch (tx_size) {
     case TX_4X4: {
       tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-          get_tx_type_4x4(xd, ib) : DCT_DCT;
+          get_tx_type_4x4(xd, block) : DCT_DCT;
       above_ec = A[0] != 0;
       left_ec = L[0] != 0;
       coef_probs = cm->fc.coef_probs_4x4;
@@ -357,7 +350,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
     case TX_8X8: {
       const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
       const int sz = 1 + b_width_log2(sb_type);
-      const int x = ib & ((1 << sz) - 1), y = ib - x;
+      const int x = block & ((1 << sz) - 1), y = block - x;
       TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
           get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
       above_ec = (A[0] + A[1]) != 0;
@@ -373,7 +366,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
     case TX_16X16: {
       const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
       const int sz = 2 + b_width_log2(sb_type);
-      const int x = ib & ((1 << sz) - 1), y = ib - x;
+      const int x = block & ((1 << sz) - 1), y = block - x;
       TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
           get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
       scan = get_scan_16x16(tx_type);
@@ -563,17 +556,19 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
         rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
         rd[TX_32X32][1] < rd[TX_4X4][1]))) {
     mbmi->txfm_size = TX_32X32;
-  } else if ( cm->txfm_mode == ALLOW_16X16 ||
-             (max_txfm_size == TX_16X16 && cm->txfm_mode == ALLOW_32X32) ||
-             (cm->txfm_mode == TX_MODE_SELECT &&
-              rd[TX_16X16][1] < rd[TX_8X8][1] &&
-              rd[TX_16X16][1] < rd[TX_4X4][1])) {
+  } else if (max_txfm_size >= TX_16X16 &&
+             (cm->txfm_mode == ALLOW_16X16 ||
+              cm->txfm_mode == ALLOW_32X32 ||
+              (cm->txfm_mode == TX_MODE_SELECT &&
+               rd[TX_16X16][1] < rd[TX_8X8][1] &&
+               rd[TX_16X16][1] < rd[TX_4X4][1]))) {
     mbmi->txfm_size = TX_16X16;
   } else if (cm->txfm_mode == ALLOW_8X8 ||
+             cm->txfm_mode == ALLOW_16X16 ||
+             cm->txfm_mode == ALLOW_32X32 ||
            (cm->txfm_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
     mbmi->txfm_size = TX_8X8;
   } else {
-    assert(cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT);
     mbmi->txfm_size = TX_4X4;
   }
 
@@ -583,13 +578,14 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
 
   txfm_cache[ONLY_4X4] = rd[TX_4X4][0];
   txfm_cache[ALLOW_8X8] = rd[TX_8X8][0];
-  txfm_cache[ALLOW_16X16] = rd[TX_16X16][0];
-  txfm_cache[ALLOW_32X32] = rd[max_txfm_size][0];
+  txfm_cache[ALLOW_16X16] = rd[MIN(max_txfm_size, TX_16X16)][0];
+  txfm_cache[ALLOW_32X32] = rd[MIN(max_txfm_size, TX_32X32)][0];
   if (max_txfm_size == TX_32X32 &&
       rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
       rd[TX_32X32][1] < rd[TX_4X4][1])
     txfm_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
-  else if (rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
+  else if (max_txfm_size >= TX_16X16 &&
+           rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
     txfm_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
   else
     txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
@@ -603,16 +599,17 @@ static int block_error(int16_t *coeff, int16_t *dqcoeff,
 
   for (i = 0; i < block_size; i++) {
     int this_diff = coeff[i] - dqcoeff[i];
-    error += this_diff * this_diff;
+    error += (unsigned)this_diff * this_diff;
   }
   error >>= shift;
 
   return error > INT_MAX ? INT_MAX : (int)error;
 }
 
-static int block_error_sby(MACROBLOCK *x, int block_size, int shift) {
+static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
+  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
   return block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
-                     block_size, shift);
+                     16 << (bwl + bhl), shift);
 }
 
 static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
@@ -630,155 +627,54 @@ static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
   return sum > INT_MAX ? INT_MAX : (int)sum;
 }
 
-static int rdcost_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                          BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bw = 1 << bwl;
-  const int bh = 1 << b_height_log2(bsize);
-  int cost = 0, b;
+static int rdcost_plane(VP9_COMMON *const cm, MACROBLOCK *x,
+                        int plane, BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
+  const int bw = 1 << bwl, bh = 1 << bhl;
   ENTROPY_CONTEXT t_above[16], t_left[16];
+  int block, cost;
 
-  vpx_memcpy(&t_above, xd->plane[0].above_context,
+  vpx_memcpy(&t_above, xd->plane[plane].above_context,
              sizeof(ENTROPY_CONTEXT) * bw);
-  vpx_memcpy(&t_left,  xd->plane[0].left_context,
+  vpx_memcpy(&t_left,  xd->plane[plane].left_context,
              sizeof(ENTROPY_CONTEXT) * bh);
 
-  for (b = 0; b < bw * bh; b++) {
-    const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        t_above + x_idx, t_left + y_idx,
-                        TX_4X4, bw * bh);
-  }
-
-  return cost;
-}
-
-static void super_block_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int *rate, int *distortion, int *skippable,
-                                BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  vp9_transform_sby_4x4(x, bsize);
-  vp9_quantize_sby_4x4(x, bsize);
-
-  *distortion = block_error_sby(x, 16 << (bwl + bhl), 2);
-  *rate       = rdcost_sby_4x4(cm, x, bsize);
-  *skippable  = vp9_sby_is_skippable(xd, bsize);
-}
-
-static int rdcost_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                          BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 1);
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT t_above[16], t_left[16];
-
-  vpx_memcpy(&t_above, xd->plane[0].above_context,
-             sizeof(ENTROPY_CONTEXT) * 2 * bw);
-  vpx_memcpy(&t_left,  xd->plane[0].left_context,
-             sizeof(ENTROPY_CONTEXT) * 2 * bh);
-
-  for (b = 0; b < bw * bh; b++) {
-    const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-    cost += cost_coeffs(cm, x, b * 4, PLANE_TYPE_Y_WITH_DC,
-                        t_above + x_idx * 2, t_left + y_idx * 2,
-                        TX_8X8, 4 * bw * bh);
-  }
-
-  return cost;
-}
-
-static void super_block_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int *rate, int *distortion, int *skippable,
-                                BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-  vp9_transform_sby_8x8(x, bsize);
-  vp9_quantize_sby_8x8(x, bsize);
+  cost = 0;
+  for (block = 0; block < bw * bh; block += 1 << (tx_size * 2)) {
+    int x_idx, y_idx;
 
-  *distortion = block_error_sby(x, 16 << (bhl + bwl), 2);
-  *rate       = rdcost_sby_8x8(cm, x, bsize);
-  *skippable  = vp9_sby_is_skippable(xd, bsize);
-}
-
-static int rdcost_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 2);
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT t_above[16], t_left[16];
+    txfrm_block_to_raster_xy(xd, bsize, plane, block, tx_size * 2,
+                             &x_idx, &y_idx);
 
-  vpx_memcpy(&t_above, xd->plane[0].above_context,
-             sizeof(ENTROPY_CONTEXT) * 4 * bw);
-  vpx_memcpy(&t_left,  xd->plane[0].left_context,
-             sizeof(ENTROPY_CONTEXT) * 4 * bh);
-
-  for (b = 0; b < bw * bh; b++) {
-    const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-    cost += cost_coeffs(cm, x, b * 16, PLANE_TYPE_Y_WITH_DC,
-                        t_above + x_idx * 4, t_left + y_idx * 4,
-                        TX_16X16, bw * bh * 16);
+    cost += cost_coeffs(cm, x, plane, block, xd->plane[plane].plane_type,
+                        t_above + x_idx, t_left + y_idx,
+                        tx_size, bw * bh);
   }
 
   return cost;
 }
 
-static void super_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable,
-                                  BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-  MACROBLOCKD *const xd = &x->e_mbd;
+static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
+                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+  int cost = 0, plane;
 
-  xd->mode_info_context->mbmi.txfm_size = TX_16X16;
-  vp9_transform_sby_16x16(x, bsize);
-  vp9_quantize_sby_16x16(x, bsize);
-
-  *distortion = block_error_sby(x, 16 << (bwl + bhl), 2);
-  *rate       = rdcost_sby_16x16(cm, x, bsize);
-  *skippable  = vp9_sby_is_skippable(xd, bsize);
-}
-
-static int rdcost_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 3, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 3);
-  int cost = 0, b;
-  MACROBLOCKD * const xd = &x->e_mbd;
-  ENTROPY_CONTEXT t_above[16], t_left[16];
-
-  vpx_memcpy(&t_above, xd->plane[0].above_context,
-             sizeof(ENTROPY_CONTEXT) * 8 * bw);
-  vpx_memcpy(&t_left,  xd->plane[0].left_context,
-             sizeof(ENTROPY_CONTEXT) * 8 * bh);
-
-  for (b = 0; b < bw * bh; b++) {
-    const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-    cost += cost_coeffs(cm, x, b * 64, PLANE_TYPE_Y_WITH_DC,
-                        t_above + x_idx * 8, t_left + y_idx * 8,
-                        TX_32X32, bw * bh * 64);
+  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
+    cost += rdcost_plane(cm, x, plane, bsize, tx_size);
   }
-
   return cost;
 }
 
-static void super_block_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable,
-                                  BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
+                                     int *rate, int *distortion, int *skippable,
+                                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  xd->mode_info_context->mbmi.txfm_size = tx_size;
+  vp9_xform_quant_sby(cm, x, bsize);
 
-  xd->mode_info_context->mbmi.txfm_size = TX_32X32;
-  vp9_transform_sby_32x32(x, bsize);
-  vp9_quantize_sby_32x32(x, bsize);
-
-  *distortion = block_error_sby(x, 16 << (bwl + bhl), 0);
-  *rate       = rdcost_sby_32x32(cm, x, bsize);
+  *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2);
+  *rate       = rdcost_plane(cm, x, 0, bsize, tx_size);
   *skippable  = vp9_sby_is_skippable(xd, bsize);
 }
 
@@ -792,14 +688,19 @@ static void super_block_yrd(VP9_COMP *cpi,
   vp9_subtract_sby(x, bs);
 
   if (bs >= BLOCK_SIZE_SB32X32)
-    super_block_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
-                          bs);
-  super_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], bs);
-  super_block_yrd_8x8(cm, x,   &r[TX_8X8][0],   &d[TX_8X8],   &s[TX_8X8],   bs);
-  super_block_yrd_4x4(cm, x,   &r[TX_4X4][0],   &d[TX_4X4],   &s[TX_4X4],   bs);
+    super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
+                             bs, TX_32X32);
+  if (bs >= BLOCK_SIZE_MB16X16)
+    super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
+                             bs, TX_16X16);
+  super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], bs,
+                           TX_8X8);
+  super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], bs,
+                           TX_4X4);
 
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
-                           TX_32X32 - (bs < BLOCK_SIZE_SB32X32));
+                           TX_32X32 - (bs < BLOCK_SIZE_SB32X32)
+                           - (bs < BLOCK_SIZE_MB16X16));
 }
 
 static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
@@ -816,17 +717,25 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   VP9_COMMON *const cm = &cpi->common;
   const int src_stride = x->plane[0].src.stride;
   uint8_t* const src =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
+      raster_block_offset_uint8(xd,
+                                BLOCK_SIZE_SB8X8,
+                                0, ib,
                                 x->plane[0].src.buf, src_stride);
   int16_t* const src_diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
+      raster_block_offset_int16(xd,
+                                BLOCK_SIZE_SB8X8,
+                                0, ib,
                                 x->plane[0].src_diff);
   int16_t* const diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
+      raster_block_offset_int16(xd,
+                                BLOCK_SIZE_SB8X8,
+                                0, ib,
                                 xd->plane[0].diff);
   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
   uint8_t* const dst =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
+      raster_block_offset_uint8(xd,
+                                BLOCK_SIZE_SB8X8,
+                                0, ib,
                                 xd->plane[0].dst.buf, xd->plane[0].dst.stride);
   ENTROPY_CONTEXT ta = *a, tempa = *a;
   ENTROPY_CONTEXT tl = *l, templ = *l;
@@ -839,7 +748,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
    * */
   DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16);
 
-  assert(ib < 16);
+  assert(ib < 4);
 #if CONFIG_NEWBINTRAMODES
   xd->mode_info_context->bmi[ib].as_mode.context =
     vp9_find_bpred_context(xd, ib, dst, xd->plane[0].dst.stride);
@@ -867,25 +776,27 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
     rate = bmode_costs[mode];
 #endif
 
-    vp9_intra4x4_predict(xd, ib, mode, dst, xd->plane[0].dst.stride);
-    vp9_subtract_block(4, 4, src_diff, 16,
+    vp9_intra4x4_predict(xd, ib,
+                         BLOCK_SIZE_SB8X8,
+                         mode, dst, xd->plane[0].dst.stride);
+    vp9_subtract_block(4, 4, src_diff, 8,
                        src, src_stride,
                        dst, xd->plane[0].dst.stride);
 
     xd->mode_info_context->bmi[ib].as_mode.first = mode;
     tx_type = get_tx_type_4x4(xd, ib);
     if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(src_diff, coeff, 16, tx_type);
+      vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
       x->quantize_b_4x4(x, ib, tx_type, 16);
     } else {
-      x->fwd_txm4x4(src_diff, coeff, 32);
+      x->fwd_txm4x4(src_diff, coeff, 16);
       x->quantize_b_4x4(x, ib, tx_type, 16);
     }
 
     tempa = ta;
     templ = tl;
 
-    ratey = cost_coeffs(cm, x, ib,
+    ratey = cost_coeffs(cm, x, 0, ib,
                         PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4, 16);
     rate += ratey;
     distortion = vp9_block_error(coeff,
@@ -911,13 +822,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
 
   // inverse transform
   if (best_tx_type != DCT_DCT)
-    vp9_short_iht4x4(best_dqcoeff, diff, 16, best_tx_type);
+    vp9_short_iht4x4(best_dqcoeff, diff, 8, best_tx_type);
   else
-    xd->inv_txm4x4(best_dqcoeff, diff, 32);
+    xd->inv_txm4x4(best_dqcoeff, diff, 16);
 
-  vp9_intra4x4_predict(xd, ib, *best_mode,
+  vp9_intra4x4_predict(xd, ib,
+                       BLOCK_SIZE_SB8X8,
+                       *best_mode,
                        dst, xd->plane[0].dst.stride);
-  vp9_recon_b(dst, diff,
+  vp9_recon_b(dst, diff, 8,
               dst, xd->plane[0].dst.stride);
 
   return best_rd;
@@ -932,7 +845,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
   int distortion = 0;
   int tot_rate_y = 0;
   int64_t total_rd = 0;
-  ENTROPY_CONTEXT t_above[4], t_left[4];
+  ENTROPY_CONTEXT t_above[2], t_left[2];
   int *bmode_costs;
 
   vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
@@ -941,15 +854,17 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
   xd->mode_info_context->mbmi.mode = I4X4_PRED;
   bmode_costs = mb->inter_bmode_costs;
 
-  for (i = 0; i < 16; i++) {
-    const int x_idx = i & 3, y_idx = i >> 2;
+  for (i = 0; i < 4; i++) {
+    const int x_idx = i & 1, y_idx = i >> 1;
     MODE_INFO *const mic = xd->mode_info_context;
     const int mis = xd->mode_info_stride;
     B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
     int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
 #if CONFIG_NEWBINTRAMODES
     uint8_t* const dst =
-        raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, i,
+        raster_block_offset_uint8(xd,
+                                  BLOCK_SIZE_SB8X8,
+                                  0, i,
                                   xd->plane[0].dst.buf,
                                   xd->plane[0].dst.stride);
 #endif
@@ -1046,403 +961,16 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
   return best_rd;
 }
 
-static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
-                                     B_PREDICTION_MODE *best_mode,
-                                     int *mode_costs,
-                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                                     int *bestrate, int *bestratey,
-                                     int *bestdistortion) {
-  VP9_COMMON *const cm = &cpi->common;
-  MB_PREDICTION_MODE mode;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int64_t best_rd = INT64_MAX;
-  int distortion = 0, rate = 0;
-  ENTROPY_CONTEXT ta[2], tl[2], ta_temp[2], tl_temp[2];
-  // perform transformation of dimension 8x8
-  // note the input and output index mapping
-  int idx = (ib & 0x02) ? (ib + 2) : ib;
-  const int src_stride = x->plane[0].src.stride;
-  uint8_t* const src =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                x->plane[0].src.buf, src_stride);
-  int16_t* const src_diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                x->plane[0].src_diff);
-  int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, idx, 16);
-  uint8_t* const dst =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                xd->plane[0].dst.buf, xd->plane[0].dst.stride);
-
-  assert(ib < 16);
-  vpx_memcpy(ta, a, sizeof(ta));
-  vpx_memcpy(tl, l, sizeof(tl));
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int64_t this_rd;
-    int rate_t = 0;
-
-    // FIXME rate for compound mode and second intrapred mode
-    rate = mode_costs[mode];
-    xd->mode_info_context->bmi[ib].as_mode.first = mode;
-
-    vp9_intra8x8_predict(xd, ib, mode, dst, xd->plane[0].dst.stride);
-
-    vp9_subtract_block(8, 8, src_diff, 16,
-                       src, src_stride,
-                       dst, xd->plane[0].dst.stride);
-
-    vpx_memcpy(ta_temp, ta, sizeof(ta));
-    vpx_memcpy(tl_temp, tl, sizeof(tl));
-
-    if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-      TX_TYPE tx_type = get_tx_type_8x8(xd, ib);
-      if (tx_type != DCT_DCT)
-        vp9_short_fht8x8(src_diff, coeff, 16, tx_type);
-      else
-        x->fwd_txm8x8(src_diff, coeff, 32);
-      x->quantize_b_8x8(x, idx, tx_type, 16);
-
-      // compute quantization mse of 8x8 block
-      distortion = vp9_block_error_c(coeff,
-          BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
-
-      rate_t = cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
-                           ta_temp, tl_temp, TX_8X8, 16);
-
-      rate += rate_t;
-    } else {
-      static const int iblock[4] = {0, 1, 4, 5};
-      TX_TYPE tx_type;
-      int i;
-
-      distortion = 0;
-      rate_t = 0;
-      for (i = 0; i < 4; ++i) {
-        int16_t* const src_diff =
-            raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16,
-                                      0, ib + iblock[i],
-                                      x->plane[0].src_diff);
-        int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff,
-                                            ib + iblock[i], 16);
-        int do_two = 0;
-        tx_type = get_tx_type_4x4(xd, ib + iblock[i]);
-        if (tx_type != DCT_DCT) {
-          vp9_short_fht4x4(src_diff, coeff, 16, tx_type);
-          x->quantize_b_4x4(x, ib + iblock[i], tx_type, 16);
-        } else if (!(i & 1) &&
-                   get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {
-          x->fwd_txm8x4(src_diff, coeff, 32);
-          x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1, 16);
-          do_two = 1;
-        } else {
-          x->fwd_txm4x4(src_diff, coeff, 32);
-          x->quantize_b_4x4(x, ib + iblock[i], tx_type, 16);
-        }
-        distortion += vp9_block_error_c(coeff,
-            BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[i], 16),
-            16 << do_two);
-        rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
-                              &ta_temp[i & 1], &tl_temp[i >> 1],
-                              TX_4X4, 16);
-        if (do_two) {
-          i++;
-          rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
-                                &ta_temp[i & 1], &tl_temp[i >> 1],
-                                TX_4X4, 16);
-        }
-      }
-      rate += rate_t;
-    }
-
-    distortion >>= 2;
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-    if (this_rd < best_rd) {
-      *bestrate = rate;
-      *bestratey = rate_t;
-      *bestdistortion = distortion;
-      vpx_memcpy(a, ta_temp, sizeof(ta_temp));
-      vpx_memcpy(l, tl_temp, sizeof(tl_temp));
-      best_rd = this_rd;
-      *best_mode = mode;
-    }
-  }
-  xd->mode_info_context->bmi[ib].as_mode.first = (*best_mode);
-  vp9_encode_intra8x8(x, ib);
-
-  return best_rd;
-}
-
-static int64_t rd_pick_intra8x8mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
-                                         int *Rate, int *rate_y,
-                                         int *Distortion, int64_t best_rd) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  int i, ib;
-  int cost = mb->mbmode_cost [xd->frame_type] [I8X8_PRED];
-  int distortion = 0;
-  int tot_rate_y = 0;
-  int64_t total_rd = 0;
-  ENTROPY_CONTEXT t_above[4], t_left[4];
-  int *i8x8mode_costs;
-
-  vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
-  vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
-
-  xd->mode_info_context->mbmi.mode = I8X8_PRED;
-  i8x8mode_costs  = mb->i8x8_mode_costs;
-
-  for (i = 0; i < 4; i++) {
-    const int x_idx = i & 1, y_idx = i >> 1;
-    MODE_INFO *const mic = xd->mode_info_context;
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
-
-    ib = vp9_i8x8_block[i];
-    total_rd += rd_pick_intra8x8block(cpi, mb, ib, &best_mode, i8x8mode_costs,
-                                      t_above + x_idx * 2, t_left + y_idx * 2,
-                                      &r, &ry, &d);
-    cost += r;
-    distortion += d;
-    tot_rate_y += ry;
-    mic->bmi[ib].as_mode.first = best_mode;
-  }
-
-  *Rate = cost;
-  *rate_y = tot_rate_y;
-  *Distortion = distortion;
-  return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
-}
-
-static int64_t rd_pick_intra8x8mby_modes_and_txsz(VP9_COMP *cpi, MACROBLOCK *x,
-                                                  int *rate, int *rate_y,
-                                                  int *distortion,
-                                                  int *mode8x8,
-                                                  int64_t best_yrd,
-                                                  int64_t *txfm_cache) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  int cost0 = vp9_cost_bit(cm->prob_tx[0], 0);
-  int cost1 = vp9_cost_bit(cm->prob_tx[0], 1);
-  int64_t tmp_rd_4x4s, tmp_rd_8x8s;
-  int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd;
-  int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8;
-
-  mbmi->txfm_size = TX_4X4;
-  tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4,
-                                         &d4x4, best_yrd);
-  mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-  mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-  mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-  mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-  mbmi->txfm_size = TX_8X8;
-  tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8,
-                                         &d8x8, best_yrd);
-  txfm_cache[ONLY_4X4]  = tmp_rd_4x4;
-  txfm_cache[ALLOW_8X8] = tmp_rd_8x8;
-  txfm_cache[ALLOW_16X16] = tmp_rd_8x8;
-  tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0);
-  tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0);
-  txfm_cache[TX_MODE_SELECT] = tmp_rd_4x4s < tmp_rd_8x8s ?
-                               tmp_rd_4x4s : tmp_rd_8x8s;
-  if (cm->txfm_mode == TX_MODE_SELECT) {
-    if (tmp_rd_4x4s < tmp_rd_8x8s) {
-      *rate = r4x4 + cost0;
-      *rate_y = tok4x4 + cost0;
-      *distortion = d4x4;
-      mbmi->txfm_size = TX_4X4;
-      tmp_rd = tmp_rd_4x4s;
-    } else {
-      *rate = r8x8 + cost1;
-      *rate_y = tok8x8 + cost1;
-      *distortion = d8x8;
-      mbmi->txfm_size = TX_8X8;
-      tmp_rd = tmp_rd_8x8s;
-
-      mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-      mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-      mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-      mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-    }
-  } else if (cm->txfm_mode == ONLY_4X4) {
-    *rate = r4x4;
-    *rate_y = tok4x4;
-    *distortion = d4x4;
-    mbmi->txfm_size = TX_4X4;
-    tmp_rd = tmp_rd_4x4;
-  } else {
-    *rate = r8x8;
-    *rate_y = tok8x8;
-    *distortion = d8x8;
-    mbmi->txfm_size = TX_8X8;
-    tmp_rd = tmp_rd_8x8;
-
-    mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-    mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-    mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-    mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-  }
-
-  return tmp_rd;
-}
-
-static int rd_cost_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 1);
-  int yoff = 4 * bw * bh;
-  int p, b, cost = 0;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  for (p = 1; p < MAX_MB_PLANE; p++) {
-    ENTROPY_CONTEXT t_above[8], t_left[8];
-
-    vpx_memcpy(t_above, xd->plane[p].above_context,
-               sizeof(ENTROPY_CONTEXT) * 2 * bw >> xd->plane[p].subsampling_x);
-    vpx_memcpy(t_left, xd->plane[p].left_context,
-               sizeof(ENTROPY_CONTEXT) * 2 * bh >> xd->plane[p].subsampling_y);
-    for (b = 0; b < bw * bh; b++) {
-      const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-      cost += cost_coeffs(cm, x, yoff + b, PLANE_TYPE_UV,
-                          t_above + x_idx, t_left + y_idx,
-                          TX_4X4, bw * bh * 4);
-    }
-    yoff = (yoff * 5) >> 2;  // u -> v
-  }
-
-  return cost;
-}
-
-static void super_block_uvrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                                 int *rate, int *distortion, int *skip,
-                                 BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  vp9_transform_sbuv_4x4(x, bsize);
-  vp9_quantize_sbuv_4x4(x, bsize);
-
-  *rate       = rd_cost_sbuv_4x4(cm, x, bsize);
-  *distortion = block_error_sbuv(x, bsize, 2);
-  *skip       = vp9_sbuv_is_skippable(xd, bsize);
-}
-
-static int rd_cost_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 2);
-  int yoff = 16 * bw * bh;
-  int p, b, cost = 0;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  for (p = 1; p < MAX_MB_PLANE; p++) {
-    ENTROPY_CONTEXT t_above[8], t_left[8];
-
-    vpx_memcpy(t_above, xd->plane[p].above_context,
-               sizeof(ENTROPY_CONTEXT) * 4 * bw >> xd->plane[p].subsampling_x);
-    vpx_memcpy(t_left, xd->plane[p].left_context,
-               sizeof(ENTROPY_CONTEXT) * 4 * bh >> xd->plane[p].subsampling_y);
-    for (b = 0; b < bw * bh; b++) {
-      const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-      cost += cost_coeffs(cm, x, yoff + b * 4, PLANE_TYPE_UV,
-                          t_above + x_idx * 2, t_left + y_idx * 2,
-                          TX_8X8, bw * bh * 16);
-    }
-    yoff = (yoff * 5) >> 2;  // u -> v
-  }
-
-  return cost;
-}
-
-static void super_block_uvrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                                 int *rate, int *distortion, int *skip,
-                                 BLOCK_SIZE_TYPE bsize) {
+static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
+                                      int *rate, int *distortion,
+                                      int *skippable, BLOCK_SIZE_TYPE bsize,
+                                      TX_SIZE uv_tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  vp9_xform_quant_sbuv(cm, x, bsize);
 
-  vp9_transform_sbuv_8x8(x, bsize);
-  vp9_quantize_sbuv_8x8(x, bsize);
-
-  *rate       = rd_cost_sbuv_8x8(cm, x, bsize);
-  *distortion = block_error_sbuv(x, bsize, 2);
-  *skip       = vp9_sbuv_is_skippable(xd, bsize);
-}
-
-static int rd_cost_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                              BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 3, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 3);
-  int yoff = 64 * bw * bh;
-  int p, b, cost = 0;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  for (p = 1; p < MAX_MB_PLANE; p++) {
-    ENTROPY_CONTEXT t_above[8], t_left[8];
-
-    vpx_memcpy(t_above, xd->plane[p].above_context,
-               sizeof(ENTROPY_CONTEXT) * 8 * bw >> xd->plane[p].subsampling_x);
-    vpx_memcpy(t_left, xd->plane[p].left_context,
-               sizeof(ENTROPY_CONTEXT) * 8 * bh >> xd->plane[p].subsampling_y);
-    for (b = 0; b < bw * bh; b++) {
-      const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-      cost += cost_coeffs(cm, x, yoff + b * 16, PLANE_TYPE_UV,
-                          t_above + x_idx * 4, t_left + y_idx * 4,
-                          TX_16X16, bw * bh * 64);
-    }
-    yoff = (yoff * 5) >> 2;  // u -> v
-  }
-
-  return cost;
-}
-
-static void super_block_uvrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                                   int *rate, int *distortion, int *skip,
-                                   BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  vp9_transform_sbuv_16x16(x, bsize);
-  vp9_quantize_sbuv_16x16(x, bsize);
-
-  *rate       = rd_cost_sbuv_16x16(cm, x, bsize);
-  *distortion = block_error_sbuv(x, bsize, 2);
-  *skip       = vp9_sbuv_is_skippable(xd, bsize);
-}
-
-static int rd_cost_sbuv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                              BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 4, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 4);
-  int yoff = 256 * bh * bw;
-  int p, b, cost = 0;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  for (p = 1; p < MAX_MB_PLANE; p++) {
-    ENTROPY_CONTEXT t_above[8], t_left[8];
-
-    vpx_memcpy(t_above, xd->plane[p].above_context,
-               sizeof(ENTROPY_CONTEXT) * 16 * bw >> xd->plane[p].subsampling_x);
-    vpx_memcpy(t_left, xd->plane[p].left_context,
-               sizeof(ENTROPY_CONTEXT) * 16 * bh >> xd->plane[p].subsampling_y);
-    for (b = 0; b < bw * bh; b++) {
-      const int x_idx = b * (bw - 1), y_idx = b >> bwl;
-      cost += cost_coeffs(cm, x, yoff + b * 64, PLANE_TYPE_UV,
-                          t_above + x_idx * 8, t_left + y_idx * 8,
-                          TX_32X32, 256 * bh * bw);
-    }
-    yoff = (yoff * 5) >> 2;  // u -> v
-  }
-
-  return cost;
-}
-#undef UVCTX
-
-static void super_block_uvrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                                   int *rate, int *distortion, int *skip,
-                                   BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  vp9_transform_sbuv_32x32(x, bsize);
-  vp9_quantize_sbuv_32x32(x, bsize);
-
-  *rate       = rd_cost_sbuv_32x32(cm, x, bsize);
-  *distortion = block_error_sbuv(x, bsize, 0);
-  *skip       = vp9_sbuv_is_skippable(xd, bsize);
+  *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2);
+  *rate       = rdcost_uv(cm, x, bsize, uv_tx_size);
+  *skippable  = vp9_sbuv_is_skippable(xd, bsize);
 }
 
 static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
@@ -1454,14 +982,17 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
   vp9_subtract_sbuv(x, bsize);
 
   if (mbmi->txfm_size >= TX_32X32 && bsize >= BLOCK_SIZE_SB64X64) {
-    super_block_uvrd_32x32(cm, x, rate, distortion, skippable, bsize);
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_32X32);
   } else if (mbmi->txfm_size >= TX_16X16 && bsize >= BLOCK_SIZE_SB32X32) {
-    super_block_uvrd_16x16(cm, x, rate, distortion, skippable, bsize);
-  } else if (mbmi->txfm_size >= TX_8X8) {
-    super_block_uvrd_8x8(cm, x, rate, distortion, skippable, bsize);
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_16X16);
+  } else if (mbmi->txfm_size >= TX_8X8 && bsize >= BLOCK_SIZE_MB16X16) {
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_8X8);
   } else {
-    assert(mbmi->txfm_size == TX_4X4);
-    super_block_uvrd_4x4(cm, x, rate, distortion, skippable, bsize);
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_4X4);
   }
 }
 
@@ -1524,28 +1055,25 @@ void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
   x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int;
 }
 
-static int labels2mode(
-  MACROBLOCK *x,
-  int const *labelings, int which_label,
-  B_PREDICTION_MODE this_mode,
-  int_mv *this_mv, int_mv *this_second_mv,
-  int_mv seg_mvs[MAX_REF_FRAMES - 1],
-  int_mv *best_ref_mv,
-  int_mv *second_best_ref_mv,
-  int *mvjcost, int *mvcost[2], VP9_COMP *cpi) {
+static int labels2mode(MACROBLOCK *x,
+                       int const *labelings, int which_label,
+                       B_PREDICTION_MODE this_mode,
+                       int_mv *this_mv, int_mv *this_second_mv,
+                       int_mv seg_mvs[MAX_REF_FRAMES - 1],
+                       int_mv *best_ref_mv,
+                       int_mv *second_best_ref_mv,
+                       int *mvjcost, int *mvcost[2], VP9_COMP *cpi) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mic = xd->mode_info_context;
   MB_MODE_INFO * mbmi = &mic->mbmi;
   const int mis = xd->mode_info_stride;
-
   int i, cost = 0, thismvcost = 0;
 
   /* We have to be careful retrieving previously-encoded motion vectors.
-     Ones from this macroblock have to be pulled from the BLOCKD array
-     as they have not yet made it to the bmi array in our MB_MODE_INFO. */
-  for (i = 0; i < 16; ++i) {
-    const int row = i >> 2,  col = i & 3;
-
+   Ones from this macroblock have to be pulled from the BLOCKD array
+   as they have not yet made it to the bmi array in our MB_MODE_INFO. */
+  for (i = 0; i < 4; ++i) {
+    const int row = i >> 1, col = i & 1;
     B_PREDICTION_MODE m;
 
     if (labelings[i] != which_label)
@@ -1553,7 +1081,7 @@ static int labels2mode(
 
     if (col  &&  labelings[i] == labelings[i - 1])
       m = LEFT4X4;
-    else if (row  &&  labelings[i] == labelings[i - 4])
+    else if (row  &&  labelings[i] == labelings[i - 2])
       m = ABOVE4X4;
     else {
       // the only time we should do costing for new motion vector or mode
@@ -1563,7 +1091,7 @@ static int labels2mode(
           if (mbmi->second_ref_frame > 0) {
             this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
             this_second_mv->as_int =
-              seg_mvs[mbmi->second_ref_frame - 1].as_int;
+            seg_mvs[mbmi->second_ref_frame - 1].as_int;
           }
 
           thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
@@ -1576,17 +1104,17 @@ static int labels2mode(
           break;
         case LEFT4X4:
           this_mv->as_int = col ? mic->bmi[i - 1].as_mv[0].as_int :
-                                  left_block_mv(xd, mic, i);
+          left_block_mv(xd, mic, i);
           if (mbmi->second_ref_frame > 0)
             this_second_mv->as_int = col ? mic->bmi[i - 1].as_mv[1].as_int :
-                                           left_block_second_mv(xd, mic, i);
+            left_block_second_mv(xd, mic, i);
           break;
         case ABOVE4X4:
-          this_mv->as_int = row ? mic->bmi[i - 4].as_mv[0].as_int :
-                                  above_block_mv(mic, i, mis);
+          this_mv->as_int = row ? mic->bmi[i - 2].as_mv[0].as_int :
+          above_block_mv(mic, i, mis);
           if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = row ? mic->bmi[i - 4].as_mv[1].as_int :
-                                           above_block_second_mv(mic, i, mis);
+            this_second_mv->as_int = row ? mic->bmi[i - 2].as_mv[1].as_int :
+            above_block_second_mv(mic, i, mis);
           break;
         case ZERO4X4:
           this_mv->as_int = 0;
@@ -1597,15 +1125,15 @@ static int labels2mode(
           break;
       }
 
-      if (m == ABOVE4X4) { // replace above with left if same
+      if (m == ABOVE4X4) {  // replace above with left if same
         int_mv left_mv, left_second_mv;
 
         left_second_mv.as_int = 0;
         left_mv.as_int = col ? mic->bmi[i - 1].as_mv[0].as_int :
-                         left_block_mv(xd, mic, i);
+        left_block_mv(xd, mic, i);
         if (mbmi->second_ref_frame > 0)
           left_second_mv.as_int = col ? mic->bmi[i - 1].as_mv[1].as_int :
-                                  left_block_second_mv(xd, mic, i);
+          left_block_second_mv(xd, mic, i);
 
         if (left_mv.as_int == this_mv->as_int &&
             (mbmi->second_ref_frame <= 0 ||
@@ -1614,8 +1142,8 @@ static int labels2mode(
       }
 
 #if CONFIG_NEWBINTRAMODES
-      cost = x->inter_bmode_costs[
-          m == B_CONTEXT_PRED ? m - CONTEXT_PRED_REPLACEMENTS : m];
+      cost = x->inter_bmode_costs[m == B_CONTEXT_PRED ?
+                                  m - CONTEXT_PRED_REPLACEMENTS : m];
 #else
       cost = x->inter_bmode_costs[m];
 #endif
@@ -1648,24 +1176,24 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
 
   *labelyrate = 0;
   *distortion = 0;
-  for (i = 0; i < 16; i++) {
+  for (i = 0; i < 4; i++) {
     if (labels[i] == which_label) {
       const int src_stride = x->plane[0].src.stride;
       uint8_t* const src =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, i,
-                                    x->plane[0].src.buf, src_stride);
+      raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                x->plane[0].src.buf, src_stride);
       int16_t* const src_diff =
-          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, i,
-                                    x->plane[0].src_diff);
+      raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                x->plane[0].src_diff);
       int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
       uint8_t* const pre =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, i,
-                                    xd->plane[0].pre[0].buf,
-                                    xd->plane[0].pre[0].stride);
+      raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                xd->plane[0].pre[0].buf,
+                                xd->plane[0].pre[0].stride);
       uint8_t* const dst =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, i,
-                                    xd->plane[0].dst.buf,
-                                    xd->plane[0].dst.stride);
+      raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                xd->plane[0].dst.buf,
+                                xd->plane[0].dst.stride);
       int thisdistortion;
 
       vp9_build_inter_predictor(pre,
@@ -1681,210 +1209,48 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
       // weighting for splitmv modes is turned on.
       if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
         uint8_t* const second_pre =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, i,
-                                    xd->plane[0].pre[1].buf,
-                                    xd->plane[0].pre[1].stride);
-        vp9_build_inter_predictor(
-            second_pre, xd->plane[0].pre[1].stride,
-            dst, xd->plane[0].dst.stride,
-            &xd->mode_info_context->bmi[i].as_mv[1],
-            &xd->scale_factor[1], 4, 4, 1,
-            &xd->subpix);
+        raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                  xd->plane[0].pre[1].buf,
+                                  xd->plane[0].pre[1].stride);
+        vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride,
+                                  dst, xd->plane[0].dst.stride,
+                                  &xd->mode_info_context->bmi[i].as_mv[1],
+                                  &xd->scale_factor[1], 4, 4, 1,
+                                  &xd->subpix);
       }
 
-      vp9_subtract_block(4, 4, src_diff, 16,
+      vp9_subtract_block(4, 4, src_diff, 8,
                          src, src_stride,
                          dst, xd->plane[0].dst.stride);
-      x->fwd_txm4x4(src_diff, coeff, 32);
+      x->fwd_txm4x4(src_diff, coeff, 16);
       x->quantize_b_4x4(x, i, DCT_DCT, 16);
       thisdistortion = vp9_block_error(coeff,
-          BLOCK_OFFSET(xd->plane[0].dqcoeff, i, 16), 16);
+                                       BLOCK_OFFSET(xd->plane[0].dqcoeff,
+                                                    i, 16), 16);
       *distortion += thisdistortion;
-      *labelyrate += cost_coeffs(cm, x, i, PLANE_TYPE_Y_WITH_DC,
-                                 ta + (i & 3),
-                                 tl + (i >> 2), TX_4X4, 16);
+      *labelyrate += cost_coeffs(cm, x, 0, i, PLANE_TYPE_Y_WITH_DC,
+                                 ta + (i & 1),
+                                 tl + (i >> 1), TX_4X4, 16);
     }
   }
   *distortion >>= 2;
   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
 }
 
-static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
-                                           MACROBLOCK *x,
-                                           int const *labels,
-                                           int which_label,
-                                           int *labelyrate,
-                                           int *distortion,
-                                           int64_t *otherrd,
-                                           ENTROPY_CONTEXT *ta,
-                                           ENTROPY_CONTEXT *tl) {
-  int i, j;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const int iblock[4] = { 0, 1, 4, 5 };
-  int othercost = 0, otherdist = 0;
-  ENTROPY_CONTEXT tac[4], tlc[4];
-
-  if (otherrd) {
-    memcpy(&tac, ta, sizeof(tac));
-    memcpy(&tlc, tl, sizeof(tlc));
-  }
-
-  *distortion = 0;
-  *labelyrate = 0;
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-
-    if (labels[ib] == which_label) {
-      const int use_second_ref =
-          xd->mode_info_context->mbmi.second_ref_frame > 0;
-      int which_mv;
-      const int idx = (ib & 8) + ((ib & 2) << 1);
-      const int src_stride = x->plane[0].src.stride;
-      uint8_t* const src =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                    x->plane[0].src.buf, src_stride);
-      int16_t* const src_diff =
-          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                    x->plane[0].src_diff);
-      int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, idx, 16);
-      int thisdistortion;
-      uint8_t* const dst =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                    xd->plane[0].dst.buf,
-                                    xd->plane[0].dst.stride);
-
-      assert(idx < 16);
-      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-        uint8_t* const pre =
-            raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                      xd->plane[0].pre[which_mv].buf,
-                                      xd->plane[0].pre[which_mv].stride);
-
-        // TODO(debargha): Make this work properly with the
-        // implicit-compoundinter-weight experiment when implicit
-        // weighting for splitmv modes is turned on.
-        vp9_build_inter_predictor(
-            pre, xd->plane[0].pre[which_mv].stride,
-            dst, xd->plane[0].dst.stride,
-            &xd->mode_info_context->bmi[ib].as_mv[which_mv],
-            &xd->scale_factor[which_mv], 8, 8,
-            which_mv, &xd->subpix);
-      }
-
-      vp9_subtract_block(8, 8, src_diff, 16,
-                         src, src_stride,
-                         dst, xd->plane[0].dst.stride);
-
-      if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
-        if (otherrd) {
-          x->fwd_txm8x8(src_diff, coeff, 32);
-          x->quantize_b_8x8(x, idx, DCT_DCT, 16);
-          thisdistortion = vp9_block_error_c(coeff,
-              BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
-          otherdist += thisdistortion;
-          xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-          othercost += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
-                                   tac + (i & 1) * 2,
-                                   tlc + (i & 2),
-                                   TX_8X8, 16);
-          xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-        }
-        for (j = 0; j < 4; j += 2) {
-          int16_t* const src_diff =
-              raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16,
-                                        0, ib + iblock[j],
-                                        x->plane[0].src_diff);
-          int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff,
-                                              ib + iblock[j], 16);
-          x->fwd_txm8x4(src_diff, coeff, 32);
-          x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1, 16);
-          thisdistortion = vp9_block_error_c(coeff,
-              BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[j], 16), 32);
-          *distortion += thisdistortion;
-          *labelyrate +=
-              cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
-                          ta + (i & 1) * 2,
-                          tl + (i & 2) + ((j & 2) >> 1),
-                          TX_4X4, 16);
-          *labelyrate +=
-              cost_coeffs(cm, x, ib + iblock[j] + 1,
-                          PLANE_TYPE_Y_WITH_DC,
-                          ta + (i & 1) * 2 + 1,
-                          tl + (i & 2) + ((j & 2) >> 1),
-                          TX_4X4, 16);
-        }
-      } else /* 8x8 */ {
-        if (otherrd) {
-          for (j = 0; j < 4; j += 2) {
-            int16_t* const src_diff =
-                raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16,
-                                          0, ib + iblock[j],
-                                          x->plane[0].src_diff);
-            int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff,
-                                                ib + iblock[j], 16);
-            x->fwd_txm8x4(src_diff, coeff, 32);
-            x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1, 16);
-            thisdistortion = vp9_block_error_c(coeff,
-                BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[j], 16), 32);
-            otherdist += thisdistortion;
-            xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-            othercost +=
-                cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
-                            tac + (i & 1) * 2,
-                            tlc + (i & 2) + ((j & 2) >> 1),
-                            TX_4X4, 16);
-            othercost +=
-                cost_coeffs(cm, x, ib + iblock[j] + 1,
-                            PLANE_TYPE_Y_WITH_DC,
-                            tac + (i & 1) * 2 + 1,
-                            tlc + (i & 2) + ((j & 2) >> 1),
-                            TX_4X4, 16);
-            xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-          }
-        }
-        x->fwd_txm8x8(src_diff, coeff, 32);
-        x->quantize_b_8x8(x, idx, DCT_DCT, 16);
-        thisdistortion = vp9_block_error_c(coeff,
-            BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
-        *distortion += thisdistortion;
-        *labelyrate += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
-                                   ta + (i & 1) * 2,
-                                   tl + (i & 2),
-                                   TX_8X8, 16);
-      }
-    }
-  }
-  *distortion >>= 2;
-  if (otherrd) {
-    otherdist >>= 2;
-    *otherrd = RDCOST(x->rdmult, x->rddiv, othercost, otherdist);
-  }
-  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
-}
-
-static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
-
-
 typedef struct {
   int_mv *ref_mv, *second_ref_mv;
   int_mv mvp;
 
   int64_t segment_rd;
-  SPLITMV_PARTITIONING_TYPE segment_num;
-  TX_SIZE txfm_size;
   int r;
   int d;
   int segment_yrate;
-  B_PREDICTION_MODE modes[16];
-  int_mv mvs[16], second_mvs[16];
-  int eobs[16];
+  B_PREDICTION_MODE modes[4];  // one per 4x4 block; preset to ZERO4X4
+  int_mv mvs[4], second_mvs[4];  // second_mvs stored only if 2nd ref used
+  int eobs[4];  // per-block eobs saved with the best result
 
   int mvthresh;
   int *mdcounts;
-
-  int_mv sv_mvp[4];     // save 4 mvp from 8x8
-  int sv_istep[2];  // save 2 initial step_param for 16x8/8x16
-
 } BEST_SEG_INFO;
 
 static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
@@ -1898,37 +1264,29 @@ static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
 
 static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                     BEST_SEG_INFO *bsi,
-                                    SPLITMV_PARTITIONING_TYPE segmentation,
-                                    TX_SIZE tx_size, int64_t *otherrds,
-                                    int64_t *rds, int *completed,
-                                    /* 16 = n_blocks */
-                                    int_mv seg_mvs[16 /* n_blocks */]
-                                                  [MAX_REF_FRAMES - 1]) {
+                                    int_mv seg_mvs[4][MAX_REF_FRAMES - 1]) {
   int i, j;
-  int const *labels;
+  static const int labels[4] = { 0, 1, 2, 3 };
   int br = 0, bd = 0;
   B_PREDICTION_MODE this_mode;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-
-  int label_count;
+  const int label_count = 4;
   int64_t this_segment_rd = 0, other_segment_rd;
   int label_mv_thresh;
   int rate = 0;
   int sbr = 0, sbd = 0;
   int segmentyrate = 0;
-  int best_eobs[16] = { 0 };
+  int best_eobs[4] = { 0 };
 
   vp9_variance_fn_ptr_t *v_fn_ptr;
 
-  ENTROPY_CONTEXT t_above[4], t_left[4];
-  ENTROPY_CONTEXT t_above_b[4], t_left_b[4];
+  ENTROPY_CONTEXT t_above[2], t_left[2];
+  ENTROPY_CONTEXT t_above_b[2], t_left_b[2];
 
   vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
   vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
 
-  v_fn_ptr = &cpi->fn_ptr[segmentation];
-  labels = vp9_mbsplits[segmentation];
-  label_count = vp9_mbsplit_count[segmentation];
+  v_fn_ptr = &cpi->fn_ptr[BLOCK_4X4];
 
   // 64 makes this threshold really big effectively
   // making it so that we very rarely check mvs on
@@ -1937,15 +1295,12 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   label_mv_thresh = 1 * bsi->mvthresh / label_count;
 
   // Segmentation method overheads
-  rate = cost_token(vp9_mbsplit_tree, vp9_mbsplit_probs,
-                    vp9_mbsplit_encodings + segmentation);
   rate += vp9_cost_mv_ref(cpi, SPLITMV,
                           mbmi->mb_mode_context[mbmi->ref_frame]);
   this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
   br += rate;
   other_segment_rd = this_segment_rd;
 
-  mbmi->txfm_size = tx_size;
   for (i = 0; i < label_count && this_segment_rd < bsi->segment_rd; i++) {
     int_mv mode_mv[B_MODE_COUNT], second_mode_mv[B_MODE_COUNT];
     int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
@@ -1954,10 +1309,10 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
 
     // search for the best motion vector on this segment
     for (this_mode = LEFT4X4; this_mode <= NEW4X4; this_mode ++) {
-      int64_t this_rd, other_rd;
+      int64_t this_rd;
       int distortion;
       int labelyrate;
-      ENTROPY_CONTEXT t_above_s[4], t_left_s[4];
+      ENTROPY_CONTEXT t_above_s[2], t_left_s[2];
 
       vpx_memcpy(t_above_s, t_above, sizeof(t_above_s));
       vpx_memcpy(t_left_s, t_left, sizeof(t_left_s));
@@ -1977,22 +1332,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
           break;
 
         if (cpi->compressor_speed) {
-          if (segmentation == PARTITIONING_8X16 ||
-              segmentation == PARTITIONING_16X8) {
-            bsi->mvp.as_int = bsi->sv_mvp[i].as_int;
-            if (i == 1 && segmentation == PARTITIONING_16X8)
-              bsi->mvp.as_int = bsi->sv_mvp[2].as_int;
-
-            step_param = bsi->sv_istep[i];
-          }
-
           // use previous block's result as next block's MV predictor.
-          if (segmentation == PARTITIONING_4X4 && i > 0) {
+          if (i > 0) {
             bsi->mvp.as_int =
-              x->e_mbd.mode_info_context->bmi[i - 1].as_mv[0].as_int;
-            if (i == 4 || i == 8 || i == 12)
+            x->e_mbd.mode_info_context->bmi[i - 1].as_mv[0].as_int;
+            if (i == 2)
               bsi->mvp.as_int =
-                x->e_mbd.mode_info_context->bmi[i - 4].as_mv[0].as_int;
+              x->e_mbd.mode_info_context->bmi[i - 2].as_mv[0].as_int;
             step_param = 2;
           }
         }
@@ -2007,24 +1353,24 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
           mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
 
           // find first label
-          n = vp9_mbsplit_offset[segmentation][i];
+          n = i;
 
           // adjust src pointer for this segment
           x->plane[0].src.buf =
-              raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_MB16X16, 0, n,
-                                        x->plane[0].src.buf,
-                                        x->plane[0].src.stride);
-          assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0xf) == 0);
+          raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
+                                    x->plane[0].src.buf,
+                                    x->plane[0].src.stride);
+          assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0x7) == 0);
           x->e_mbd.plane[0].pre[0].buf =
-              raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_MB16X16, 0, n,
-                                        x->e_mbd.plane[0].pre[0].buf,
-                                        x->e_mbd.plane[0].pre[0].stride);
+          raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
+                                    x->e_mbd.plane[0].pre[0].buf,
+                                    x->e_mbd.plane[0].pre[0].stride);
 
           bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
                                            sadpb, further_steps, 0, v_fn_ptr,
                                            bsi->ref_mv, &mode_mv[NEW4X4]);
 
-          sseshift = segmentation_to_sseshift[segmentation];
+          sseshift = 0;
 
           // Should we do a full search (best quality only)
           if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
@@ -2041,12 +1387,12 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
             if (thissme < bestsme) {
               bestsme = thissme;
               mode_mv[NEW4X4].as_int =
-                x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int;
+              x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int;
             } else {
               /* The full search result is actually worse so re-instate the
                * previous best vector */
               x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int =
-                mode_mv[NEW4X4].as_int;
+              mode_mv[NEW4X4].as_int;
             }
           }
         }
@@ -2092,17 +1438,9 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
           mv_check_bounds(x, &second_mode_mv[this_mode]))
         continue;
 
-      if (segmentation == PARTITIONING_4X4) {
-        this_rd = encode_inter_mb_segment(&cpi->common,
-                                          x, labels, i, &labelyrate,
-                                          &distortion, t_above_s, t_left_s);
-        other_rd = this_rd;
-      } else {
-        this_rd = encode_inter_mb_segment_8x8(&cpi->common,
-                                              x, labels, i, &labelyrate,
-                                              &distortion, &other_rd,
-                                              t_above_s, t_left_s);
-      }
+      this_rd = encode_inter_mb_segment(&cpi->common,
+                                        x, labels, i, &labelyrate,
+                                        &distortion, t_above_s, t_left_s);
       this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
       rate += labelyrate;
 
@@ -2112,24 +1450,12 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
         bestlabelyrate = labelyrate;
         mode_selected = this_mode;
         best_label_rd = this_rd;
-        if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_4X4) {
-          for (j = 0; j < 16; j++)
-            if (labels[j] == i)
-              best_eobs[j] = x->e_mbd.plane[0].eobs[j];
-        } else {
-          for (j = 0; j < 4; j++) {
-            int ib = vp9_i8x8_block[j], idx = j * 4;
-
-            if (labels[ib] == i)
-              best_eobs[idx] = x->e_mbd.plane[0].eobs[idx];
-          }
-        }
-        if (other_rd < best_other_rd)
-          best_other_rd = other_rd;
+        for (j = 0; j < 4; j++)
+          if (labels[j] == i)
+            best_eobs[j] = x->e_mbd.plane[0].eobs[j];
 
         vpx_memcpy(t_above_b, t_above_s, sizeof(t_above_s));
         vpx_memcpy(t_left_b, t_left_s, sizeof(t_left_s));
-
       }
     } /*for each 4x4 mode*/
 
@@ -2146,10 +1472,6 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
     segmentyrate += bestlabelyrate;
     this_segment_rd += best_label_rd;
     other_segment_rd += best_other_rd;
-    if (rds)
-      rds[i] = this_segment_rd;
-    if (otherrds)
-      otherrds[i] = other_segment_rd;
   } /* for each label */
 
   if (this_segment_rd < bsi->segment_rd) {
@@ -2157,11 +1479,9 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
     bsi->d = bd;
     bsi->segment_yrate = segmentyrate;
     bsi->segment_rd = this_segment_rd;
-    bsi->segment_num = segmentation;
-    bsi->txfm_size = mbmi->txfm_size;
 
     // store everything needed to come back to this!!
-    for (i = 0; i < 16; i++) {
+    for (i = 0; i < 4; i++) {
       bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
       if (mbmi->second_ref_frame > 0)
         bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
@@ -2169,118 +1489,12 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
       bsi->eobs[i] = best_eobs[i];
     }
   }
-
-  if (completed) {
-    *completed = i;
-  }
 }
 
 static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x,
                              BEST_SEG_INFO *bsi,
-                             unsigned int segmentation,
-                             /* 16 = n_blocks */
-                             int_mv seg_mvs[16][MAX_REF_FRAMES - 1],
-                             int64_t txfm_cache[NB_TXFM_MODES]) {
-  int i, n, c = vp9_mbsplit_count[segmentation];
-
-  if (segmentation == PARTITIONING_4X4) {
-    int64_t rd[16];
-
-    rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, NULL,
-                            rd, &n, seg_mvs);
-    if (n == c) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        if (rd[c - 1] < txfm_cache[i])
-          txfm_cache[i] = rd[c - 1];
-      }
-    }
-  } else {
-    int64_t diff, base_rd;
-    int cost4x4 = vp9_cost_bit(cpi->common.prob_tx[0], 0);
-    int cost8x8 = vp9_cost_bit(cpi->common.prob_tx[0], 1);
-
-    if (cpi->common.txfm_mode == TX_MODE_SELECT) {
-      int64_t rd4x4[4], rd8x8[4];
-      int n4x4, n8x8, nmin;
-      BEST_SEG_INFO bsi4x4, bsi8x8;
-
-      /* factor in cost of cost4x4/8x8 in decision */
-      vpx_memcpy(&bsi4x4, bsi, sizeof(*bsi));
-      vpx_memcpy(&bsi8x8, bsi, sizeof(*bsi));
-      rd_check_segment_txsize(cpi, x, &bsi4x4, segmentation,
-                              TX_4X4, NULL, rd4x4, &n4x4, seg_mvs);
-      rd_check_segment_txsize(cpi, x, &bsi8x8, segmentation,
-                              TX_8X8, NULL, rd8x8, &n8x8, seg_mvs);
-      if (bsi4x4.segment_num == segmentation) {
-        bsi4x4.segment_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
-        if (bsi4x4.segment_rd < bsi->segment_rd)
-          vpx_memcpy(bsi, &bsi4x4, sizeof(*bsi));
-      }
-      if (bsi8x8.segment_num == segmentation) {
-        bsi8x8.segment_rd += RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
-        if (bsi8x8.segment_rd < bsi->segment_rd)
-          vpx_memcpy(bsi, &bsi8x8, sizeof(*bsi));
-      }
-      n = n4x4 > n8x8 ? n4x4 : n8x8;
-      if (n == c) {
-        nmin = n4x4 < n8x8 ? n4x4 : n8x8;
-        diff = rd8x8[nmin - 1] - rd4x4[nmin - 1];
-        if (n == n4x4) {
-          base_rd = rd4x4[c - 1];
-        } else {
-          base_rd = rd8x8[c - 1] - diff;
-        }
-      }
-    } else {
-      int64_t rd[4], otherrd[4];
-
-      if (cpi->common.txfm_mode == ONLY_4X4) {
-        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, otherrd,
-                                rd, &n, seg_mvs);
-        if (n == c) {
-          base_rd = rd[c - 1];
-          diff = otherrd[c - 1] - rd[c - 1];
-        }
-      } else /* use 8x8 transform */ {
-        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_8X8, otherrd,
-                                rd, &n, seg_mvs);
-        if (n == c) {
-          diff = rd[c - 1] - otherrd[c - 1];
-          base_rd = otherrd[c - 1];
-        }
-      }
-    }
-
-    if (n == c) {
-      if (base_rd < txfm_cache[ONLY_4X4]) {
-        txfm_cache[ONLY_4X4] = base_rd;
-      }
-      if (base_rd + diff < txfm_cache[ALLOW_8X8]) {
-        txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] =
-            txfm_cache[ALLOW_32X32] = base_rd + diff;
-      }
-      if (diff < 0) {
-        base_rd += diff + RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
-      } else {
-        base_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
-      }
-      if (base_rd < txfm_cache[TX_MODE_SELECT]) {
-        txfm_cache[TX_MODE_SELECT] = base_rd;
-      }
-    }
-  }
-}
-
-static INLINE void cal_step_param(int sr, int *sp) {
-  int step = 0;
-
-  if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP;
-  else if (sr < 1) sr = 1;
-
-  while (sr >>= 1)
-    step++;
-
-  *sp = MAX_MVSEARCH_STEPS - 1 - step;
+                             int_mv seg_mvs[4][MAX_REF_FRAMES - 1]) {
+  rd_check_segment_txsize(cpi, x, bsi, seg_mvs);  // plain pass-through call
 }
 
 static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
@@ -2292,17 +1506,12 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
                                        int *returnyrate,
                                        int *returndistortion,
                                        int *skippable, int mvthresh,
-                                       int_mv seg_mvs[NB_PARTITIONINGS]
-                                                     [16 /* n_blocks */]
-                                                     [MAX_REF_FRAMES - 1],
-                                       int64_t txfm_cache[NB_TXFM_MODES]) {
+                                       int_mv seg_mvs[4][MAX_REF_FRAMES - 1]) {
   int i;
   BEST_SEG_INFO bsi;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
 
   vpx_memset(&bsi, 0, sizeof(bsi));
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    txfm_cache[i] = INT64_MAX;
 
   bsi.segment_rd = best_rd;
   bsi.ref_mv = best_ref_mv;
@@ -2310,121 +1519,41 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
   bsi.mvp.as_int = best_ref_mv->as_int;
   bsi.mvthresh = mvthresh;
   bsi.mdcounts = mdcounts;
-  bsi.txfm_size = TX_4X4;
 
-  for (i = 0; i < 16; i++)
+  for (i = 0; i < 4; i++)
     bsi.modes[i] = ZERO4X4;
 
-  if (cpi->compressor_speed == 0) {
-    /* for now, we will keep the original segmentation order
-       when in best quality mode */
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
-                     seg_mvs[PARTITIONING_16X8], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
-                     seg_mvs[PARTITIONING_8X16], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
-                     seg_mvs[PARTITIONING_8X8], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
-                     seg_mvs[PARTITIONING_4X4], txfm_cache);
-  } else {
-    int sr;
-
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
-                     seg_mvs[PARTITIONING_8X8], txfm_cache);
-
-    if (bsi.segment_rd < best_rd) {
-      int tmp_col_min = x->mv_col_min;
-      int tmp_col_max = x->mv_col_max;
-      int tmp_row_min = x->mv_row_min;
-      int tmp_row_max = x->mv_row_max;
-
-      vp9_clamp_mv_min_max(x, best_ref_mv);
-
-      /* Get 8x8 result */
-      bsi.sv_mvp[0].as_int = bsi.mvs[0].as_int;
-      bsi.sv_mvp[1].as_int = bsi.mvs[2].as_int;
-      bsi.sv_mvp[2].as_int = bsi.mvs[8].as_int;
-      bsi.sv_mvp[3].as_int = bsi.mvs[10].as_int;
-
-      /* Use 8x8 result as 16x8/8x16's predictor MV. Adjust search range
-       * according to the closeness of 2 MV. */
-      /* block 8X16 */
-      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[0]);
-
-      sr = MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[1]);
-
-      rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
-                       seg_mvs[PARTITIONING_8X16], txfm_cache);
-
-      /* block 16X8 */
-      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[0]);
-
-      sr = MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[1]);
-
-      rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
-                       seg_mvs[PARTITIONING_16X8], txfm_cache);
-
-      /* If 8x8 is better than 16x8/8x16, then do 4x4 search */
-      /* Not skip 4x4 if speed=0 (good quality) */
-      if (cpi->sf.no_skip_block4x4_search ||
-          bsi.segment_num == PARTITIONING_8X8) {
-        /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
-        bsi.mvp.as_int = bsi.sv_mvp[0].as_int;
-        rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
-                         seg_mvs[PARTITIONING_4X4], txfm_cache);
-      }
-
-      /* restore UMV window */
-      x->mv_col_min = tmp_col_min;
-      x->mv_col_max = tmp_col_max;
-      x->mv_row_min = tmp_row_min;
-      x->mv_row_max = tmp_row_max;
-    }
-  }
+  rd_check_segment(cpi, x, &bsi, seg_mvs);
 
   /* set it to the best */
-  for (i = 0; i < 16; i++) {
+  for (i = 0; i < 4; i++) {
     x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int = bsi.mvs[i].as_int;
     if (mbmi->second_ref_frame > 0)
       x->e_mbd.mode_info_context->bmi[i].as_mv[1].as_int =
-        bsi.second_mvs[i].as_int;
+      bsi.second_mvs[i].as_int;
     x->e_mbd.plane[0].eobs[i] = bsi.eobs[i];
   }
 
   /* save partitions */
-  mbmi->txfm_size = bsi.txfm_size;
-  mbmi->partitioning = bsi.segment_num;
-  x->partition_info->count = vp9_mbsplit_count[bsi.segment_num];
+  x->partition_info->count = 4;
 
   for (i = 0; i < x->partition_info->count; i++) {
-    int j;
-
-    j = vp9_mbsplit_offset[bsi.segment_num][i];
-
-    x->partition_info->bmi[i].mode = bsi.modes[j];
-    x->partition_info->bmi[i].mv.as_mv = bsi.mvs[j].as_mv;
+    x->partition_info->bmi[i].mode = bsi.modes[i];
+    x->partition_info->bmi[i].mv.as_mv = bsi.mvs[i].as_mv;
     if (mbmi->second_ref_frame > 0)
-      x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[j].as_mv;
+      x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[i].as_mv;
   }
   /*
    * used to set mbmi->mv.as_int
    */
-  x->partition_info->bmi[15].mv.as_int = bsi.mvs[15].as_int;
+  x->partition_info->bmi[3].mv.as_int = bsi.mvs[3].as_int;
   if (mbmi->second_ref_frame > 0)
-    x->partition_info->bmi[15].second_mv.as_int = bsi.second_mvs[15].as_int;
+    x->partition_info->bmi[3].second_mv.as_int = bsi.second_mvs[3].as_int;
 
   *returntotrate = bsi.r;
   *returndistortion = bsi.d;
   *returnyrate = bsi.segment_yrate;
-  *skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_MB16X16);
+  *skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_SB8X8);
 
   return (int)(bsi.segment_rd);
 }
@@ -2474,22 +1603,10 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
   x->mv_best_ref_index[ref_frame] = best_index;
 }
 
-static void set_i8x8_block_modes(MACROBLOCK *x, int modes[4]) {
-  int i;
-  MACROBLOCKD *xd = &x->e_mbd;
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-    xd->mode_info_context->bmi[ib + 0].as_mode.first = modes[i];
-    xd->mode_info_context->bmi[ib + 1].as_mode.first = modes[i];
-    xd->mode_info_context->bmi[ib + 4].as_mode.first = modes[i];
-    xd->mode_info_context->bmi[ib + 5].as_mode.first = modes[i];
-    // printf("%d,%d,%d,%d\n",
-    //       modes[0], modes[1], modes[2], modes[3]);
-  }
-}
-
 extern void vp9_calc_ref_probs(int *count, vp9_prob *probs);
-static void estimate_curframe_refprobs(VP9_COMP *cpi, vp9_prob mod_refprobs[3], int pred_ref) {
+static void estimate_curframe_refprobs(VP9_COMP *cpi,
+                                       vp9_prob mod_refprobs[3],
+                                       int pred_ref) {
   int norm_cnt[MAX_REF_FRAMES];
   const int *const rfct = cpi->count_mb_ref_frame_usage;
   int intra_count = rfct[INTRA_FRAME];
@@ -2539,7 +1656,8 @@ static INLINE unsigned weighted_cost(vp9_prob *tab0, vp9_prob *tab1,
   return (0x8000 + weight * cost1 + (0x10000 - weight) * cost0) >> 16;
 }
 
-static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, unsigned int *ref_costs) {
+static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
+                                     unsigned int *ref_costs) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
   vp9_prob *mod_refprobs;
@@ -2588,10 +1706,10 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, unsigned int
       // Get the prediction for the current mb
       cost = weighted_cost(&pred_prob, &new_pred_prob, 0,
                            pred_flag, cpi->seg0_progress);
-      if (cost > 1024) cost = 768; // i.e. account for 4 bits max.
+      if (cost > 1024) cost = 768;  // i.e. account for 4 bits max.
 
       // for incorrectly predicted cases
-      if (! pred_flag) {
+      if (!pred_flag) {
         vp9_prob curframe_mod_refprobs[3];
 
         if (cpi->seg0_progress) {
@@ -2699,6 +1817,51 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
             frame_type, block_size);
 }
 
+// Map a (bw, bh) width/height pair to the matching BlockSize enum value.
+static enum BlockSize get_block_size(int bw, int bh) {
+  if (bw == 4 && bh == 4)
+    return BLOCK_4X4;
+
+  if (bw == 4 && bh == 8)
+    return BLOCK_4X8;
+
+  if (bw == 8 && bh == 4)
+    return BLOCK_8X4;
+
+  if (bw == 8 && bh == 8)
+    return BLOCK_8X8;
+
+  if (bw == 8 && bh == 16)
+    return BLOCK_8X16;
+
+  if (bw == 16 && bh == 8)
+    return BLOCK_16X8;
+
+  if (bw == 16 && bh == 16)
+    return BLOCK_16X16;
+
+  if (bw == 32 && bh == 32)
+    return BLOCK_32X32;
+
+  if (bw == 32 && bh == 16)
+    return BLOCK_32X16;
+
+  if (bw == 16 && bh == 32)
+    return BLOCK_16X32;
+
+  if (bw == 64 && bh == 32)
+    return BLOCK_64X32;
+
+  if (bw == 32 && bh == 64)
+    return BLOCK_32X64;
+
+  if (bw == 64 && bh == 64)
+    return BLOCK_64X64;
+  // No listed pair matched: assert fires unless NDEBUG, then -1 is returned.
+  assert(0);
+  return -1;
+}
+
 static void model_rd_from_var_lapndz(int var, int n, int qstep,
                                      int *rate, int *dist) {
   // This function models the rate and distortion for a Laplacian
@@ -2742,6 +1905,36 @@ static void model_rd_from_var_lapndz(int var, int n, int qstep,
   vp9_clear_system_state();
 }
 
+static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+                            MACROBLOCK *x, MACROBLOCKD *xd,
+                            int *out_rate_sum, int *out_dist_sum) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  unsigned int sse, var;
+  int i, rate_sum = 0, dist_sum = 0;
+  // Sum the modelled rate and distortion over all MAX_MB_PLANE planes.
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    struct macroblock_plane *const p = &x->plane[i];
+    struct macroblockd_plane *const pd = &xd->plane[i];
+    // Per-plane log2 dims: bsize's values minus the plane's subsampling.
+    const int bwl = b_width_log2(bsize) - pd->subsampling_x;
+    const int bhl = b_height_log2(bsize) - pd->subsampling_y;
+    const enum BlockSize bs = get_block_size(4 << bwl, 4 << bhl);
+    int rate, dist;
+    var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+                             pd->dst.buf, pd->dst.stride, &sse);
+    model_rd_from_var_lapndz(var, 16 << (bwl + bhl),
+                             pd->dequant[1] >> 3, &rate, &dist);
+    // Note: n = 16 << (bwl + bhl) is the (4 << bwl) x (4 << bhl) block area.
+    rate_sum += rate;
+    dist_sum += dist;
+  }
+  // Report the totals through the caller-supplied output pointers.
+  *out_rate_sum = rate_sum;
+  *out_dist_sum = dist_sum;
+}
+
 static enum BlockSize y_to_uv_block_size(enum BlockSize bs) {
   switch (bs) {
     case BLOCK_64X64: return BLOCK_32X32;
@@ -2751,6 +1944,9 @@ static enum BlockSize y_to_uv_block_size(enum BlockSize bs) {
     case BLOCK_32X16: return BLOCK_16X8;
     case BLOCK_16X32: return BLOCK_8X16;
     case BLOCK_16X16: return BLOCK_8X8;
+    case BLOCK_16X8:  return BLOCK_8X4;
+    case BLOCK_8X16:  return BLOCK_4X8;
+    case BLOCK_8X8:   return BLOCK_4X4;
     default:
       assert(0);
       return -1;
@@ -2766,6 +1962,9 @@ static enum BlockSize y_bsizet_to_block_size(BLOCK_SIZE_TYPE bs) {
     case BLOCK_SIZE_SB32X16: return BLOCK_32X16;
     case BLOCK_SIZE_SB16X32: return BLOCK_16X32;
     case BLOCK_SIZE_MB16X16: return BLOCK_16X16;
+    case BLOCK_SIZE_SB16X8:  return BLOCK_16X8;
+    case BLOCK_SIZE_SB8X16:  return BLOCK_8X16;
+    case BLOCK_SIZE_SB8X8:   return BLOCK_8X8;
     default:
       assert(0);
       return -1;
@@ -2966,76 +2165,41 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
   if (1) {
-    int switchable_filter_index, newbest;
-    int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;
-    int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;
-    for (switchable_filter_index = 0;
-         switchable_filter_index < VP9_SWITCHABLE_FILTERS;
-         ++switchable_filter_index) {
+    int i, newbest;
+    int tmp_rate_sum = 0, tmp_dist_sum = 0;
+    for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
       int rs = 0;
-      mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index];
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+      const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];
+      const int is_intpel_interp = intpel_mv &&
+                                   vp9_is_interpolating_filter[filter];
+      mbmi->interp_filter = filter;
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
 
-      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+      if (cm->mcomp_filter_type == SWITCHABLE) {
         const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
         const int m = vp9_switchable_interp_map[mbmi->interp_filter];
         rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
       }
-      if (interpolating_intpel_seen && intpel_mv &&
-          vp9_is_interpolating_filter[mbmi->interp_filter]) {
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i,
-                    tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i);
+
+      if (interpolating_intpel_seen && is_intpel_interp) {
+        rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_sum, tmp_dist_sum);
       } else {
-        unsigned int sse, var;
-        int tmp_rate_y, tmp_rate_u, tmp_rate_v;
-        int tmp_dist_y, tmp_dist_u, tmp_dist_v;
+        int rate_sum = 0, dist_sum = 0;
         vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-        var = cpi->fn_ptr[block_size].vf(x->plane[0].src.buf,
-                                         x->plane[0].src.stride,
-                                         xd->plane[0].dst.buf,
-                                         xd->plane[0].dst.stride,
-                                         &sse);
-        // Note our transform coeffs are 8 times an orthogonal transform.
-        // Hence quantizer step is also 8 times. To get effective quantizer
-        // we need to divide by 8 before sending to modeling function.
-        model_rd_from_var_lapndz(var, MI_SIZE * bw * MI_SIZE * bh,
-                                 xd->plane[0].dequant[1] >> 3,
-                                 &tmp_rate_y, &tmp_dist_y);
-        var = cpi->fn_ptr[uv_block_size].vf(x->plane[1].src.buf,
-                                            x->plane[1].src.stride,
-                                            xd->plane[1].dst.buf,
-                                            xd->plane[1].dst.stride,
-                                            &sse);
-        model_rd_from_var_lapndz(var, MI_UV_SIZE * bw * MI_UV_SIZE * bh,
-                                 xd->plane[1].dequant[1] >> 3,
-                                 &tmp_rate_u, &tmp_dist_u);
-        var = cpi->fn_ptr[uv_block_size].vf(x->plane[2].src.buf,
-                                            x->plane[1].src.stride,
-                                            xd->plane[2].dst.buf,
-                                            xd->plane[1].dst.stride,
-                                            &sse);
-        model_rd_from_var_lapndz(var, MI_UV_SIZE * bw * MI_UV_SIZE * bh,
-                                 xd->plane[2].dequant[1] >> 3,
-                                 &tmp_rate_v, &tmp_dist_v);
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,
-                    tmp_dist_y + tmp_dist_u + tmp_dist_v);
-        if (!interpolating_intpel_seen && intpel_mv &&
-            vp9_is_interpolating_filter[mbmi->interp_filter]) {
-          tmp_rate_y_i = tmp_rate_y;
-          tmp_rate_u_i = tmp_rate_u;
-          tmp_rate_v_i = tmp_rate_v;
-          tmp_dist_y_i = tmp_dist_y;
-          tmp_dist_u_i = tmp_dist_u;
-          tmp_dist_v_i = tmp_dist_v;
+        model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
+        rd = RDCOST(x->rdmult, x->rddiv, rs + rate_sum, dist_sum);
+        if (!interpolating_intpel_seen && is_intpel_interp) {
+          tmp_rate_sum = rate_sum;
+          tmp_dist_sum = dist_sum;
         }
       }
-      newbest = (switchable_filter_index == 0 || rd < best_rd);
+      newbest = i == 0 || rd < best_rd;
+
       if (newbest) {
         best_rd = rd;
         *best_filter = mbmi->interp_filter;
       }
+
       if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
           (cm->mcomp_filter_type != SWITCHABLE &&
            cm->mcomp_filter_type == mbmi->interp_filter)) {
@@ -3050,21 +2214,19 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                      sizeof(unsigned char) * MI_UV_SIZE * bw);
         for (i = 0; i < MI_UV_SIZE * bh; ++i)
           vpx_memcpy(tmp_vbuf + i * MI_UV_SIZE * bw,
-                     xd->plane[2].dst.buf + i * xd->plane[1].dst.stride,
+                     xd->plane[2].dst.buf + i * xd->plane[2].dst.stride,
                      sizeof(unsigned char) * MI_UV_SIZE * bw);
         pred_exists = 1;
       }
-      interpolating_intpel_seen |=
-        intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];
+      interpolating_intpel_seen |= is_intpel_interp;
     }
   }
 
   // Set the appripriate filter
-  if (cm->mcomp_filter_type != SWITCHABLE)
-    mbmi->interp_filter = cm->mcomp_filter_type;
-  else
-    mbmi->interp_filter = *best_filter;
-  vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+  mbmi->interp_filter = cm->mcomp_filter_type != SWITCHABLE ?
+                             cm->mcomp_filter_type : *best_filter;
+  vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+
 
   if (pred_exists) {
     // FIXME(rbultje): mb code still predicts into xd->predictor
@@ -3077,7 +2239,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                  tmp_ubuf + i * bw * MI_UV_SIZE,
                  sizeof(unsigned char) * bw * MI_UV_SIZE);
     for (i = 0; i < bh * MI_UV_SIZE; ++i)
-      vpx_memcpy(xd->plane[2].dst.buf + i * xd->plane[1].dst.stride,
+      vpx_memcpy(xd->plane[2].dst.buf + i * xd->plane[2].dst.stride,
                  tmp_vbuf + i * bw * MI_UV_SIZE,
                  sizeof(unsigned char) * bw * MI_UV_SIZE);
   } else {
@@ -3193,867 +2355,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   return this_rd;  // if 0, this will be re-calculated by caller
 }
 
-static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                               int mi_row, int mi_col,
-                               int *returnrate, int *returndistortion,
-                               int64_t *returnintra) {
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-    VP9_ALT_FLAG };
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  union b_mode_info best_bmodes[16];
-  MB_MODE_INFO best_mbmode;
-  PARTITION_INFO best_partition;
-  int_mv best_ref_mv, second_best_ref_mv;
-  MB_PREDICTION_MODE this_mode;
-  MB_PREDICTION_MODE best_mode = DC_PRED;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-  int i, best_mode_index = 0;
-  int mode8x8[4];
-  unsigned char segment_id = mbmi->segment_id;
-
-  int mode_index;
-  int mdcounts[4];
-  int rate, distortion;
-  int rate2, distortion2;
-  int64_t best_txfm_rd[NB_TXFM_MODES];
-  int64_t best_txfm_diff[NB_TXFM_MODES];
-  int64_t best_pred_diff[NB_PREDICTION_TYPES];
-  int64_t best_pred_rd[NB_PREDICTION_TYPES];
-  int64_t best_rd = INT64_MAX, best_intra_rd = INT64_MAX;
-#if CONFIG_COMP_INTERINTRA_PRED
-  int is_best_interintra = 0;
-  int64_t best_intra16_rd = INT64_MAX;
-  int best_intra16_mode = DC_PRED;
-#if SEPARATE_INTERINTRA_UV
-  int best_intra16_uv_mode = DC_PRED;
-#endif
-#endif
-  int64_t best_overall_rd = INT64_MAX;
-  INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
-  INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
-  int uv_intra_rate[2], uv_intra_distortion[2], uv_intra_rate_tokenonly[2];
-  int uv_intra_skippable[2];
-  MB_PREDICTION_MODE uv_intra_mode[2];
-  int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
-  int distortion_uv = INT_MAX;
-  int64_t best_yrd = INT64_MAX;
-
-  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  int frame_mdcounts[4][4];
-  YV12_BUFFER_CONFIG yv12_mb[4];
-
-  unsigned int ref_costs[MAX_REF_FRAMES];
-  int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1];
-
-  int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
-                                             cpi->common.y_dc_delta_q);
-  int64_t mode_distortions[MB_MODE_COUNT] = {-1};
-  int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
-  int ref_frame;
-
-  struct scale_factors scale_factor[4];
-
-  vpx_memset(mode8x8, 0, sizeof(mode8x8));
-  vpx_memset(&frame_mv, 0, sizeof(frame_mv));
-  vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
-  vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));
-  vpx_memset(&x->mb_context[xd->sb_index][xd->mb_index], 0,
-             sizeof(PICK_MODE_CONTEXT));
-
-  x->mb_context[xd->sb_index][xd->mb_index].frames_with_high_error = 0;
-  x->mb_context[xd->sb_index][xd->mb_index].modes_with_high_error = 0;
-
-  for (i = 0; i < MAX_REF_FRAMES; i++)
-    frame_mv[NEWMV][i].as_int = INVALID_MV;
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
-    best_pred_rd[i] = INT64_MAX;
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    best_txfm_rd[i] = INT64_MAX;
-
-  for (i = 0; i < NB_PARTITIONINGS; i++) {
-    int j, k;
-
-    for (j = 0; j < 16; j++)
-      for (k = 0; k < MAX_REF_FRAMES - 1; k++)
-        seg_mvs[i][j][k].as_int = INVALID_MV;
-  }
-
-  if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->lst_fb_idx,
-                       LAST_FRAME, BLOCK_16X16, mi_row, mi_col,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb, scale_factor);
-  }
-
-  if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->gld_fb_idx,
-                       GOLDEN_FRAME, BLOCK_16X16, mi_row, mi_col,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb, scale_factor);
-  }
-
-  if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->alt_fb_idx,
-                       ALTREF_FRAME, BLOCK_16X16, mi_row, mi_col,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb, scale_factor);
-  }
-
-  *returnintra = INT64_MAX;
-
-  mbmi->ref_frame = INTRA_FRAME;
-
-  /* Initialize zbin mode boost for uv costing */
-  cpi->zbin_mode_boost = 0;
-  vp9_update_zbin_extra(cpi, x);
-
-  xd->mode_info_context->mbmi.mode = DC_PRED;
-
-  for (i = 0; i <= TX_8X8; i++) {
-    mbmi->txfm_size = i;
-    rd_pick_intra_sbuv_mode(cpi, x, &uv_intra_rate[i],
-                            &uv_intra_rate_tokenonly[i],
-                            &uv_intra_distortion[i],
-                            &uv_intra_skippable[i],
-                            BLOCK_SIZE_MB16X16);
-    uv_intra_mode[i] = mbmi->uv_mode;
-  }
-
-  // Get estimates of reference frame costs for each reference frame
-  // that depend on the current prediction etc.
-  estimate_ref_frame_costs(cpi, segment_id, ref_costs);
-
-  for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
-    int64_t this_rd = INT64_MAX;
-    int disable_skip = 0, skippable = 0;
-    int other_cost = 0;
-    int compmode_cost = 0;
-#if CONFIG_COMP_INTERINTRA_PRED
-    int compmode_interintra_cost = 0;
-#endif
-    int mode_excluded = 0;
-    int64_t txfm_cache[NB_TXFM_MODES] = { 0 };
-    YV12_BUFFER_CONFIG *scaled_ref_frame;
-
-    // These variables hold are rolling total cost and distortion for this mode
-    rate2 = 0;
-    distortion2 = 0;
-    rate_y = 0;
-    rate_uv = 0;
-
-    x->skip = 0;
-
-    this_mode = vp9_mode_order[mode_index].mode;
-    mbmi->mode = this_mode;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->ref_frame = vp9_mode_order[mode_index].ref_frame;
-    mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
-
-    mbmi->interp_filter = cm->mcomp_filter_type;
-
-    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
-                      scale_factor);
-
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-
-    // Test best rd so far against threshold for trying this mode.
-    if (best_rd <= cpi->rd_threshes[mode_index])
-      continue;
-
-    // Ensure that the references used by this mode are available.
-    if (mbmi->ref_frame &&
-        !(cpi->ref_frame_flags & flag_list[mbmi->ref_frame]))
-      continue;
-
-    if (mbmi->second_ref_frame > 0 &&
-        !(cpi->ref_frame_flags & flag_list[mbmi->second_ref_frame]))
-      continue;
-
-    // only scale on zeromv.
-    if (mbmi->ref_frame > 0 &&
-          (yv12_mb[mbmi->ref_frame].y_width != cm->mb_cols * 16 ||
-           yv12_mb[mbmi->ref_frame].y_height != cm->mb_rows * 16) &&
-        this_mode != ZEROMV)
-      continue;
-
-    if (mbmi->second_ref_frame > 0 &&
-          (yv12_mb[mbmi->second_ref_frame].y_width != cm->mb_cols * 16 ||
-           yv12_mb[mbmi->second_ref_frame].y_height != cm->mb_rows * 16) &&
-        this_mode != ZEROMV)
-      continue;
-
-    // current coding mode under rate-distortion optimization test loop
-#if CONFIG_COMP_INTERINTRA_PRED
-    mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-    mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
-    // If the segment reference frame feature is enabled....
-    // then do nothing if the current ref frame is not allowed..
-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-        !vp9_check_segref(xd, segment_id, mbmi->ref_frame)) {
-      continue;
-    // If the segment skip feature is enabled....
-    // then do nothing if the current mode is not allowed..
-    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) &&
-               (this_mode != ZEROMV)) {
-      continue;
-    // Disable this drop out case if  the ref frame segment
-    // level feature is enabled for this segment. This is to
-    // prevent the possibility that the we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) {
-      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame overlay,
-      // unless ARNR filtering is enabled in which case we want
-      // an unfiltered alternative
-      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-        if (this_mode != ZEROMV ||
-            mbmi->ref_frame != ALTREF_FRAME) {
-          continue;
-        }
-      }
-    }
-
-    /* everything but intra */
-    scaled_ref_frame = NULL;
-    if (mbmi->ref_frame) {
-      int ref = mbmi->ref_frame;
-      int fb;
-
-      best_ref_mv = mbmi->ref_mvs[ref][0];
-      vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts));
-
-      if (mbmi->ref_frame == LAST_FRAME) {
-        fb = cpi->lst_fb_idx;
-      } else if (mbmi->ref_frame == GOLDEN_FRAME) {
-        fb = cpi->gld_fb_idx;
-      } else {
-        fb = cpi->alt_fb_idx;
-      }
-
-      if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
-        scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
-    }
-
-    if (mbmi->second_ref_frame > 0) {
-      int ref = mbmi->second_ref_frame;
-
-      second_best_ref_mv = mbmi->ref_mvs[ref][0];
-    }
-
-    // TODO(jkoleszar) scaling/translation handled during creation of yv12_mb
-    // currently.
-    setup_pre_planes(xd, &yv12_mb[mbmi->ref_frame],
-        mbmi->second_ref_frame > 0 ? &yv12_mb[mbmi->second_ref_frame] : NULL,
-        0, 0, NULL, NULL);
-
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    if (cpi->zbin_mode_boost_enabled) {
-      if (vp9_mode_order[mode_index].ref_frame == INTRA_FRAME)
-        cpi->zbin_mode_boost = 0;
-      else {
-        if (vp9_mode_order[mode_index].mode == ZEROMV) {
-          if (vp9_mode_order[mode_index].ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (vp9_mode_order[mode_index].mode == SPLITMV)
-          cpi->zbin_mode_boost = 0;
-        else
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-      }
-
-      vp9_update_zbin_extra(cpi, x);
-    }
-
-    // Intra
-    if (!mbmi->ref_frame) {
-      switch (this_mode) {
-        default:
-        case V_PRED:
-        case H_PRED:
-        case D45_PRED:
-        case D135_PRED:
-        case D117_PRED:
-        case D153_PRED:
-        case D27_PRED:
-        case D63_PRED:
-          rate2 += intra_cost_penalty;
-        case DC_PRED:
-        case TM_PRED:
-          mbmi->ref_frame = INTRA_FRAME;
-          // FIXME compound intra prediction
-          vp9_build_intra_predictors_sby_s(&x->e_mbd, BLOCK_SIZE_MB16X16);
-          // vp9_build_intra_predictors_mby(&x->e_mbd);
-          super_block_yrd(cpi, x, &rate_y, &distortion, &skippable,
-                          BLOCK_SIZE_MB16X16, txfm_cache);
-          rate2 += rate_y;
-          distortion2 += distortion;
-          rate2 += x->mbmode_cost[xd->frame_type][mbmi->mode];
-
-          rate2 += uv_intra_rate[mbmi->txfm_size != TX_4X4];
-          rate_uv = uv_intra_rate_tokenonly[mbmi->txfm_size != TX_4X4];
-          distortion2 += uv_intra_distortion[mbmi->txfm_size != TX_4X4];
-          distortion_uv = uv_intra_distortion[mbmi->txfm_size != TX_4X4];
-          skippable = skippable &&
-                      uv_intra_skippable[mbmi->txfm_size != TX_4X4];
-          break;
-        case I4X4_PRED: {
-          int64_t tmp_rd;
-
-          // Note the rate value returned here includes the cost of coding
-          // the I4X4_PRED mode : x->mbmode_cost[xd->frame_type][I4X4_PRED];
-          mbmi->txfm_size = TX_4X4;
-          tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
-                                             &distortion, best_yrd);
-          rate2 += rate;
-          rate2 += intra_cost_penalty;
-          distortion2 += distortion;
-
-          if (tmp_rd < best_yrd) {
-            rate2 += uv_intra_rate[TX_4X4];
-            rate_uv = uv_intra_rate_tokenonly[TX_4X4];
-            distortion2 += uv_intra_distortion[TX_4X4];
-            distortion_uv = uv_intra_distortion[TX_4X4];
-          } else {
-            this_rd = INT64_MAX;
-            disable_skip = 1;
-          }
-        }
-        break;
-        case I8X8_PRED: {
-          int64_t tmp_rd;
-
-          tmp_rd = rd_pick_intra8x8mby_modes_and_txsz(cpi, x, &rate, &rate_y,
-                                                      &distortion, mode8x8,
-                                                      best_yrd, txfm_cache);
-          rate2 += rate;
-          rate2 += intra_cost_penalty;
-          distortion2 += distortion;
-
-          /* TODO: uv rate maybe over-estimated here since there is UV intra
-                   mode coded in I8X8_PRED prediction */
-          if (tmp_rd < best_yrd) {
-            rate2 += uv_intra_rate[TX_4X4];
-            rate_uv = uv_intra_rate_tokenonly[TX_4X4];
-            distortion2 += uv_intra_distortion[TX_4X4];
-            distortion_uv = uv_intra_distortion[TX_4X4];
-          } else {
-            this_rd = INT64_MAX;
-            disable_skip = 1;
-          }
-        }
-        break;
-      }
-    }
-    // Split MV. The code is very different from the other inter modes so
-    // special case it.
-    else if (this_mode == SPLITMV) {
-      const int is_comp_pred = mbmi->second_ref_frame > 0;
-      int64_t this_rd_thresh;
-      int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
-      int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
-      int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0;
-      int switchable_filter_index;
-      int_mv *second_ref = is_comp_pred ? &second_best_ref_mv : NULL;
-      union b_mode_info tmp_best_bmodes[16];
-      MB_MODE_INFO tmp_best_mbmode;
-      PARTITION_INFO tmp_best_partition;
-      int pred_exists = 0;
-
-      this_rd_thresh =
-          (mbmi->ref_frame == LAST_FRAME) ?
-          cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA];
-      this_rd_thresh =
-          (mbmi->ref_frame == GOLDEN_FRAME) ?
-          cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
-      xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-
-      for (switchable_filter_index = 0;
-           switchable_filter_index < VP9_SWITCHABLE_FILTERS;
-           ++switchable_filter_index) {
-        int newbest;
-        mbmi->interp_filter =
-            vp9_switchable_interp[switchable_filter_index];
-        vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-
-        tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
-                                             second_ref, best_yrd, mdcounts,
-                                             &rate, &rate_y, &distortion,
-                                             &skippable,
-                                             (int)this_rd_thresh, seg_mvs,
-                                             txfm_cache);
-        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-                   [vp9_get_pred_context(&cpi->common, xd,
-                                         PRED_SWITCHABLE_INTERP)]
-                   [vp9_switchable_interp_map[mbmi->interp_filter]];
-          tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0);
-        }
-        newbest = (tmp_rd < tmp_best_rd);
-        if (newbest) {
-          tmp_best_filter = mbmi->interp_filter;
-          tmp_best_rd = tmp_rd;
-        }
-        if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
-            (mbmi->interp_filter == cm->mcomp_filter_type &&
-             cm->mcomp_filter_type != SWITCHABLE)) {
-          tmp_best_rdu = tmp_rd;
-          tmp_best_rate = rate;
-          tmp_best_ratey = rate_y;
-          tmp_best_distortion = distortion;
-          tmp_best_skippable = skippable;
-          vpx_memcpy(&tmp_best_mbmode, mbmi, sizeof(MB_MODE_INFO));
-          vpx_memcpy(&tmp_best_partition, x->partition_info,
-                     sizeof(PARTITION_INFO));
-          for (i = 0; i < 16; i++) {
-            tmp_best_bmodes[i] = xd->mode_info_context->bmi[i];
-          }
-          pred_exists = 1;
-        }
-      }  // switchable_filter_index loop
-
-      mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
-                             tmp_best_filter : cm->mcomp_filter_type);
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-      if (!pred_exists) {
-        // Handles the special case when a filter that is not in the
-        // switchable list (bilinear, 6-tap) is indicated at the frame level
-        tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
-                                             second_ref, best_yrd, mdcounts,
-                                             &rate, &rate_y, &distortion,
-                                             &skippable,
-                                             (int)this_rd_thresh, seg_mvs,
-                                             txfm_cache);
-      } else {
-        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-                   [vp9_get_pred_context(&cpi->common, xd,
-                                         PRED_SWITCHABLE_INTERP)]
-                   [vp9_switchable_interp_map[mbmi->interp_filter]];
-          tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
-        }
-        tmp_rd = tmp_best_rdu;
-        rate = tmp_best_rate;
-        rate_y = tmp_best_ratey;
-        distortion = tmp_best_distortion;
-        skippable = tmp_best_skippable;
-        vpx_memcpy(mbmi, &tmp_best_mbmode, sizeof(MB_MODE_INFO));
-        vpx_memcpy(x->partition_info, &tmp_best_partition,
-                   sizeof(PARTITION_INFO));
-        for (i = 0; i < 16; i++) {
-          xd->mode_info_context->bmi[i] = tmp_best_bmodes[i];
-        }
-      }
-
-      rate2 += rate;
-      distortion2 += distortion;
-
-      if (cpi->common.mcomp_filter_type == SWITCHABLE)
-        rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-            [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
-            [vp9_switchable_interp_map[mbmi->interp_filter]];
-
-      // If even the 'Y' rd value of split is higher than best so far
-      // then dont bother looking at UV
-      if (tmp_rd < best_yrd) {
-        int uv_skippable;
-
-        vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
-                                        BLOCK_SIZE_MB16X16);
-
-        vp9_subtract_sbuv(x, BLOCK_SIZE_MB16X16);
-
-        super_block_uvrd_4x4(cm, x, &rate_uv, &distortion_uv,
-                             &uv_skippable, BLOCK_SIZE_MB16X16);
-        rate2 += rate_uv;
-        distortion2 += distortion_uv;
-        skippable = skippable && uv_skippable;
-      } else {
-        this_rd = INT64_MAX;
-        disable_skip = 1;
-      }
-
-      if (!mode_excluded) {
-        if (is_comp_pred)
-          mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
-        else
-          mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
-      }
-
-      compmode_cost =
-        vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
-      mbmi->mode = this_mode;
-    }
-    else {
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (mbmi->second_ref_frame == INTRA_FRAME) {
-        if (best_intra16_mode == DC_PRED - 1) continue;
-        mbmi->interintra_mode = best_intra16_mode;
-#if SEPARATE_INTERINTRA_UV
-        mbmi->interintra_uv_mode = best_intra16_uv_mode;
-#else
-        mbmi->interintra_uv_mode = best_intra16_mode;
-#endif
-      }
-#endif
-      this_rd = handle_inter_mode(cpi, x, BLOCK_SIZE_MB16X16,
-                                  mdcounts, txfm_cache,
-                                  &rate2, &distortion2, &skippable,
-                                  &compmode_cost,
-#if CONFIG_COMP_INTERINTRA_PRED
-                                  &compmode_interintra_cost,
-#endif
-                                  &rate_y, &distortion,
-                                  &rate_uv, &distortion_uv,
-                                  &mode_excluded, &disable_skip,
-                                  mode_index, &tmp_best_filter, frame_mv,
-                                  scaled_ref_frame, mi_row, mi_col);
-      if (this_rd == INT64_MAX)
-        continue;
-    }
-
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (cpi->common.use_interintra)
-      rate2 += compmode_interintra_cost;
-#endif
-
-    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION)
-      rate2 += compmode_cost;
-
-    // Estimate the reference frame signaling cost and add it
-    // to the rolling cost variable.
-    rate2 += ref_costs[mbmi->ref_frame];
-
-    if (!disable_skip) {
-      // Test for the condition where skip block will be activated
-      // because there are no non zero coefficients and make any
-      // necessary adjustment for rate. Ignore if skip is coded at
-      // segment level as the cost wont have been added in.
-      int mb_skip_allowed;
-
-      // Is Mb level skip allowed (i.e. not coded at segment level).
-      mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
-
-      if (skippable) {
-        mbmi->mb_skip_coeff = 1;
-
-        // Back out the coefficient coding costs
-        rate2 -= (rate_y + rate_uv);
-        // for best_yrd calculation
-        rate_uv = 0;
-
-        if (mb_skip_allowed) {
-          int prob_skip_cost;
-
-          // Cost the skip mb case
-          vp9_prob skip_prob =
-            vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP);
-
-          if (skip_prob) {
-            prob_skip_cost = vp9_cost_bit(skip_prob, 1);
-            rate2 += prob_skip_cost;
-            other_cost += prob_skip_cost;
-          }
-        }
-      } else {
-        // Add in the cost of the no skip flag.
-        mbmi->mb_skip_coeff = 0;
-        if (mb_skip_allowed) {
-          int prob_skip_cost = vp9_cost_bit(
-                 vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP), 0);
-          rate2 += prob_skip_cost;
-          other_cost += prob_skip_cost;
-        }
-      }
-
-      // Calculate the final RD estimate for this mode.
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-    }
-
-    // Keep record of best intra distortion
-    if ((mbmi->ref_frame == INTRA_FRAME) &&
-        (this_rd < best_intra_rd)) {
-      best_intra_rd = this_rd;
-      *returnintra = distortion2;
-    }
-#if CONFIG_COMP_INTERINTRA_PRED
-    if ((mbmi->ref_frame == INTRA_FRAME) &&
-        (this_mode <= TM_PRED) &&
-        (this_rd < best_intra16_rd)) {
-      best_intra16_rd = this_rd;
-      best_intra16_mode = this_mode;
-#if SEPARATE_INTERINTRA_UV
-      best_intra16_uv_mode = uv_intra_mode[mbmi->txfm_size != TX_4X4];
-#endif
-    }
-#endif
-
-    if (!disable_skip && mbmi->ref_frame == INTRA_FRAME)
-      for (i = 0; i < NB_PREDICTION_TYPES; ++i)
-        best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
-
-    if (this_rd < best_overall_rd) {
-      best_overall_rd = this_rd;
-      best_filter = tmp_best_filter;
-      best_mode = this_mode;
-#if CONFIG_COMP_INTERINTRA_PRED
-      is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME);
-#endif
-    }
-
-    // Store the respective mode distortions for later use.
-    // Store the respective mode distortions for later use.
-    if (mode_distortions[this_mode] == -1
-        || distortion2 < mode_distortions[this_mode]) {
-      mode_distortions[this_mode] = distortion2;
-    }
-    if (frame_distortions[mbmi->ref_frame] == -1 ||
-        distortion2 < frame_distortions[mbmi->ref_frame]) {
-       frame_distortions[mbmi->ref_frame] = distortion2;
-    }
-
-    // Did this mode help.. i.e. is it the new best mode
-    if (this_rd < best_rd || x->skip) {
-      if (!mode_excluded) {
-        /*
-        if (mbmi->second_ref_frame == INTRA_FRAME) {
-          printf("rd %d best %d bestintra16 %d\n", this_rd, best_rd, best_intra16_rd);
-        }
-        */
-        // Note index of best mode so far
-        best_mode_index = mode_index;
-
-        if (this_mode <= I4X4_PRED) {
-          if (mbmi->txfm_size != TX_4X4
-              && this_mode != I4X4_PRED
-              && this_mode != I8X8_PRED)
-            mbmi->uv_mode = uv_intra_mode[TX_8X8];
-          else
-            mbmi->uv_mode = uv_intra_mode[TX_4X4];
-          /* required for left and above block mv */
-          mbmi->mv[0].as_int = 0;
-        }
-
-        other_cost += ref_costs[mbmi->ref_frame];
-
-        /* Calculate the final y RD estimate for this mode */
-        best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
-                          (distortion2 - distortion_uv));
-
-        *returnrate = rate2;
-        *returndistortion = distortion2;
-        best_rd = this_rd;
-        vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
-        vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
-
-        if ((this_mode == I4X4_PRED)
-            || (this_mode == I8X8_PRED)
-            || (this_mode == SPLITMV))
-          for (i = 0; i < 16; i++) {
-            best_bmodes[i] = xd->mode_info_context->bmi[i];
-          }
-      }
-
-      // Testing this mode gave rise to an improvement in best error score.
-      // Lower threshold a bit for next time
-      cpi->rd_thresh_mult[mode_index] =
-          (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
-          cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
-      cpi->rd_threshes[mode_index] =
-          (cpi->rd_baseline_thresh[mode_index] >> 7) *
-          cpi->rd_thresh_mult[mode_index];
-    } else {
-      // If the mode did not help improve the best error case then raise the
-      // threshold for testing that mode next time around.
-      cpi->rd_thresh_mult[mode_index] += 4;
-
-      if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-        cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
-
-      cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7)
-          * cpi->rd_thresh_mult[mode_index];
-    }
-
-    /* keep record of best compound/single-only prediction */
-    if (!disable_skip && mbmi->ref_frame != INTRA_FRAME) {
-      int64_t single_rd, hybrid_rd;
-      int single_rate, hybrid_rate;
-
-      if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-        single_rate = rate2 - compmode_cost;
-        hybrid_rate = rate2;
-      } else {
-        single_rate = rate2;
-        hybrid_rate = rate2 + compmode_cost;
-      }
-
-      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
-      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
-
-      if (mbmi->second_ref_frame <= INTRA_FRAME &&
-          single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
-        best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
-      } else if (mbmi->second_ref_frame > INTRA_FRAME &&
-                 single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
-        best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
-      }
-      if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
-        best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
-    }
-
-    /* keep record of best txfm size */
-    if (!mode_excluded && this_rd != INT64_MAX) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        int64_t adj_rd;
-        if (this_mode != I4X4_PRED) {
-          const int64_t txfm_mode_diff =
-              txfm_cache[i] - txfm_cache[cm->txfm_mode];
-          adj_rd = this_rd + txfm_mode_diff;
-        } else {
-          adj_rd = this_rd;
-        }
-        if (adj_rd < best_txfm_rd[i])
-          best_txfm_rd[i] = adj_rd;
-      }
-    }
-
-    if (x->skip && !mode_excluded)
-      break;
-  }
-
-  assert((cm->mcomp_filter_type == SWITCHABLE) ||
-         (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
-         (best_mbmode.mode <= I4X4_PRED));
-
-#if CONFIG_COMP_INTERINTRA_PRED
-  ++cpi->interintra_select_count[is_best_interintra];
-#endif
-
-  // Accumulate filter usage stats
-  // TODO(agrange): Use RD criteria to select interpolation filter mode.
-  if (is_inter_mode(best_mode))
-    ++cpi->best_switchable_interp_count[vp9_switchable_interp_map[best_filter]];
-
-  // Reduce the activation RD thresholds for the best choice mode
-  if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
-      (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
-    int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
-
-    cpi->rd_thresh_mult[best_mode_index] =
-        (cpi->rd_thresh_mult[best_mode_index] >=
-         (MIN_THRESHMULT + best_adjustment)) ?
-        cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
-    cpi->rd_threshes[best_mode_index] =
-        (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
-        cpi->rd_thresh_mult[best_mode_index];
-  }
-
-  // This code forces Altref,0,0 and skip for the frame that overlays a
-  // an alrtef unless Altref is filtered. However, this is unsafe if
-  // segment level coding of ref frame is enabled for this
-  // segment.
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-      cpi->is_src_frame_alt_ref &&
-      (cpi->oxcf.arnr_max_frames == 0) &&
-      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
-    mbmi->mode = ZEROMV;
-    if (cm->txfm_mode <= ALLOW_8X8)
-      mbmi->txfm_size = cm->txfm_mode;
-    else
-      mbmi->txfm_size = TX_16X16;
-    mbmi->ref_frame = ALTREF_FRAME;
-    mbmi->mv[0].as_int = 0;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->mb_skip_coeff = 1;
-    mbmi->partitioning = 0;
-    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
-                      scale_factor);
-
-    vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
-    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
-    goto end;
-  }
-
-  // macroblock modes
-  vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
-  if (best_mbmode.mode == I4X4_PRED) {
-    for (i = 0; i < 16; i++) {
-      xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
-    }
-  }
-
-  if (best_mbmode.mode == I8X8_PRED)
-    set_i8x8_block_modes(x, mode8x8);
-
-  if (best_mbmode.mode == SPLITMV) {
-    for (i = 0; i < 16; i++)
-      xd->mode_info_context->bmi[i].as_mv[0].as_int =
-          best_bmodes[i].as_mv[0].as_int;
-    if (mbmi->second_ref_frame > 0)
-      for (i = 0; i < 16; i++)
-        xd->mode_info_context->bmi[i].as_mv[1].as_int =
-            best_bmodes[i].as_mv[1].as_int;
-
-    vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
-
-    mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
-    mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
-  }
-
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
-    if (best_pred_rd[i] == INT64_MAX)
-      best_pred_diff[i] = INT_MIN;
-    else
-      best_pred_diff[i] = best_rd - best_pred_rd[i];
-  }
-
-  if (!x->skip) {
-    for (i = 0; i < NB_TXFM_MODES; i++) {
-      if (best_txfm_rd[i] == INT64_MAX)
-        best_txfm_diff[i] = 0;
-      else
-        best_txfm_diff[i] = best_rd - best_txfm_rd[i];
-    }
-  } else {
-    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
-  }
-
-end:
-
-  // Flag all modes that have a distortion thats > 2x the best we found at
-  // this level.
-  for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
-    if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV
-        || mode_index == SPLITMV)
-      continue;
-
-    if (mode_distortions[mode_index] > 2 * *returndistortion) {
-      x->mb_context[xd->sb_index][xd->mb_index].modes_with_high_error |= (1
-          << mode_index);
-    }
-  }
-
-  // Flag all ref frames that have a distortion thats > 2x the best we found at
-  // this level.
-  for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
-    if (frame_distortions[ref_frame] > 2 * *returndistortion) {
-      x->mb_context[xd->sb_index][xd->mb_index].frames_with_high_error |= (1
-          << ref_frame);
-    }
-  }
-
-  set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
-                    scale_factor);
-  store_coding_context(x, &x->mb_context[xd->sb_index][xd->mb_index],
-                       best_mode_index, &best_partition,
-                       &mbmi->ref_mvs[mbmi->ref_frame][0],
-                       &mbmi->ref_mvs[mbmi->second_ref_frame < 0 ? 0 :
-                                      mbmi->second_ref_frame][0],
-                       best_pred_diff, best_txfm_diff);
-}
-
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                int *returnrate, int *returndist,
                                BLOCK_SIZE_TYPE bsize,
@@ -4065,14 +2366,24 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int dist_y = 0, dist_uv;
   int y_skip = 0, uv_skip;
   int64_t txfm_cache[NB_TXFM_MODES], err;
+  MB_PREDICTION_MODE mode;
+  TX_SIZE txfm_size;
+  int rate4x4_y, rate4x4_y_tokenonly, dist4x4_y;
+  int64_t err4x4 = INT64_MAX;
   int i;
 
   ctx->skip = 0;
   xd->mode_info_context->mbmi.mode = DC_PRED;
   err = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
                                &dist_y, &y_skip, bsize, txfm_cache);
+  mode = xd->mode_info_context->mbmi.mode;
+  txfm_size = xd->mode_info_context->mbmi.txfm_size;
   rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
                           &dist_uv, &uv_skip, bsize);
+  if (bsize == BLOCK_SIZE_SB8X8)
+    err4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4_y,
+                                       &rate4x4_y_tokenonly,
+                                       &dist4x4_y, err);
 
   if (y_skip && uv_skip) {
     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
@@ -4080,145 +2391,30 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     *returndist = dist_y + (dist_uv >> 2);
     memset(ctx->txfm_rd_diff, 0,
            sizeof(x->sb32_context[xd->sb_index].txfm_rd_diff));
+    xd->mode_info_context->mbmi.mode = mode;
+    xd->mode_info_context->mbmi.txfm_size = txfm_size;
+  } else if (bsize == BLOCK_SIZE_SB8X8 && err4x4 < err) {
+    *returnrate = rate4x4_y + rate_uv +
+        vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+    *returndist = dist4x4_y + (dist_uv >> 2);
+    for (i = 0; i < NB_TXFM_MODES; i++) {
+      ctx->txfm_rd_diff[i] = MIN(err4x4, err - txfm_cache[i]);
+    }
+    xd->mode_info_context->mbmi.txfm_size = TX_4X4;
   } else {
     *returnrate = rate_y + rate_uv +
         vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
     *returndist = dist_y + (dist_uv >> 2);
     for (i = 0; i < NB_TXFM_MODES; i++) {
-      ctx->txfm_rd_diff[i] = err - txfm_cache[i];
+      ctx->txfm_rd_diff[i] = MIN(err4x4, err - txfm_cache[i]);
     }
+    xd->mode_info_context->mbmi.txfm_size = txfm_size;
+    xd->mode_info_context->mbmi.mode = mode;
   }
 
   vpx_memcpy(&ctx->mic, xd->mode_info_context, sizeof(MODE_INFO));
 }
 
-void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            int *returnrate, int *returndist) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int64_t error4x4, error16x16;
-  int rate4x4, rate16x16 = 0, rateuv[2];
-  int dist4x4 = 0, dist16x16 = 0, distuv[2];
-  int rate;
-  int rate4x4_tokenonly = 0;
-  int rate16x16_tokenonly = 0;
-  int rateuv_tokenonly[2];
-  int64_t error8x8;
-  int rate8x8_tokenonly=0;
-  int rate8x8, dist8x8;
-  int mode16x16;
-  int mode8x8[4];
-  int dist;
-  int modeuv[2], uv_intra_skippable[2];
-  int y_intra16x16_skippable = 0;
-  int64_t txfm_cache[2][NB_TXFM_MODES];
-  TX_SIZE txfm_size_16x16, txfm_size_8x8;
-  int i;
-
-  x->mb_context[xd->sb_index][xd->mb_index].skip = 0;
-  mbmi->ref_frame = INTRA_FRAME;
-  mbmi->mode = DC_PRED;
-  for (i = 0; i <= TX_8X8; i++) {
-    mbmi->txfm_size = i;
-    rd_pick_intra_sbuv_mode(cpi, x, &rateuv[i], &rateuv_tokenonly[i],
-                            &distuv[i], &uv_intra_skippable[i],
-                            BLOCK_SIZE_MB16X16);
-    modeuv[i] = mbmi->uv_mode;
-  }
-
-  // current macroblock under rate-distortion optimization test loop
-  error16x16 = rd_pick_intra_sby_mode(cpi, x, &rate16x16,
-                                      &rate16x16_tokenonly, &dist16x16,
-                                      &y_intra16x16_skippable,
-                                      BLOCK_SIZE_MB16X16, txfm_cache[1]);
-  mode16x16 = mbmi->mode;
-  txfm_size_16x16 = mbmi->txfm_size;
-  if (y_intra16x16_skippable &&
-      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable[TX_4X4]) ||
-       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable[TX_8X8]))) {
-    error16x16 -= RDCOST(x->rdmult, x->rddiv, rate16x16_tokenonly, 0);
-    rate16x16 -= rate16x16_tokenonly;
-  }
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    txfm_cache[0][i] = error16x16 - txfm_cache[1][cm->txfm_mode] +
-                       txfm_cache[1][i];
-  }
-
-  error8x8 = rd_pick_intra8x8mby_modes_and_txsz(cpi, x, &rate8x8,
-                                                &rate8x8_tokenonly,
-                                                &dist8x8, mode8x8,
-                                                error16x16, txfm_cache[1]);
-  txfm_size_8x8 = mbmi->txfm_size;
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    int64_t tmp_rd = error8x8 - txfm_cache[1][cm->txfm_mode] + txfm_cache[1][i];
-    if (tmp_rd < txfm_cache[0][i])
-      txfm_cache[0][i] = tmp_rd;
-  }
-
-  mbmi->txfm_size = TX_4X4;
-  error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
-                                       &rate4x4, &rate4x4_tokenonly,
-                                       &dist4x4, error16x16);
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    if (error4x4 < txfm_cache[0][i])
-      txfm_cache[0][i] = error4x4;
-  }
-
-  mbmi->mb_skip_coeff = 0;
-  if (y_intra16x16_skippable &&
-      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable[TX_4X4]) ||
-       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable[TX_8X8]))) {
-    mbmi->mb_skip_coeff = 1;
-    mbmi->mode = mode16x16;
-    mbmi->uv_mode = modeuv[cm->txfm_mode != ONLY_4X4];
-    rate = rate16x16 + vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
-    dist = dist16x16;
-    rate += rateuv[cm->txfm_mode != ONLY_4X4] -
-            rateuv_tokenonly[cm->txfm_mode != ONLY_4X4];
-    dist += (distuv[cm->txfm_mode != ONLY_4X4] >> 2);
-    mbmi->txfm_size = txfm_size_16x16;
-  } else if (error8x8 > error16x16) {
-    if (error4x4 < error16x16) {
-      rate = rateuv[TX_4X4] + rate4x4;
-      mbmi->mode = I4X4_PRED;
-      mbmi->txfm_size = TX_4X4;
-      dist = dist4x4 + (distuv[TX_4X4] >> 2);
-      mbmi->uv_mode = modeuv[TX_4X4];
-    } else {
-      mbmi->txfm_size = txfm_size_16x16;
-      mbmi->mode = mode16x16;
-      rate = rate16x16 + rateuv[mbmi->txfm_size != TX_4X4];
-      dist = dist16x16 + (distuv[mbmi->txfm_size != TX_4X4] >> 2);
-      mbmi->uv_mode = modeuv[mbmi->txfm_size != TX_4X4];
-    }
-    rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-  } else {
-    if (error4x4 < error8x8) {
-      rate = rateuv[TX_4X4] + rate4x4;
-      mbmi->mode = I4X4_PRED;
-      mbmi->txfm_size = TX_4X4;
-      dist = dist4x4 + (distuv[TX_4X4] >> 2);
-      mbmi->uv_mode = modeuv[TX_4X4];
-    } else {
-      mbmi->mode = I8X8_PRED;
-      mbmi->txfm_size = txfm_size_8x8;
-      set_i8x8_block_modes(x, mode8x8);
-      rate = rate8x8 + rateuv[TX_4X4];
-      dist = dist8x8 + (distuv[TX_4X4] >> 2);
-    }
-    rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-  }
-
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff[i] =
-        txfm_cache[0][cm->txfm_mode] - txfm_cache[0][i];
-  }
-
-  *returnrate = rate;
-  *returndist = dist;
-}
-
 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   int mi_row, int mi_col,
                                   int *returnrate,
@@ -4272,7 +2468,18 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   unsigned int mode_mask = 0;
   int64_t mode_distortions[MB_MODE_COUNT] = {-1};
   int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
+  int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
+                                             cpi->common.y_dc_delta_q);
+  int_mv seg_mvs[4][MAX_REF_FRAMES - 1];
+  union b_mode_info best_bmodes[4];
+  PARTITION_INFO best_partition;
+
+  for (i = 0; i < 4; i++) {
+    int j;
 
+    for (j = 0; j < MAX_REF_FRAMES - 1; j++)
+      seg_mvs[i][j].as_int = INVALID_MV;
+  }
   // Everywhere the flag is set the error is much higher than its neighbors.
   ctx->frames_with_high_error = 0;
   ctx->modes_with_high_error = 0;
@@ -4330,7 +2537,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   if (cpi->Speed == 0
       || (cpi->Speed > 0 && (ref_frame_mask & (1 << INTRA_FRAME)))) {
     mbmi->mode = DC_PRED;
-    for (i = 0; i <= ((bsize < BLOCK_SIZE_SB64X64) ? TX_16X16 : TX_32X32);
+    for (i = 0; i <= (bsize < BLOCK_SIZE_MB16X16 ? TX_4X4 :
+                      (bsize < BLOCK_SIZE_SB32X32 ? TX_8X8 :
+                       (bsize < BLOCK_SIZE_SB64X64 ? TX_16X16 : TX_32X32)));
          i++) {
       mbmi->txfm_size = i;
       rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[i], &rate_uv_tokenonly[i],
@@ -4362,6 +2571,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     x->skip = 0;
     this_mode = vp9_mode_order[mode_index].mode;
     ref_frame = vp9_mode_order[mode_index].ref_frame;
+
     if (!(ref_frame == INTRA_FRAME
         || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
       continue;
@@ -4382,6 +2592,21 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
     mbmi->ref_frame = ref_frame;
     mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
+
+    // TODO(jingning, jkoleszar): scaling reference frame not supported for
+    // SPLITMV.
+    if (mbmi->ref_frame > 0 &&
+          (yv12_mb[mbmi->ref_frame].y_width != cm->mb_cols * 16 ||
+           yv12_mb[mbmi->ref_frame].y_height != cm->mb_rows * 16) &&
+        this_mode == SPLITMV)
+      continue;
+
+    if (mbmi->second_ref_frame > 0 &&
+          (yv12_mb[mbmi->second_ref_frame].y_width != cm->mb_cols * 16 ||
+           yv12_mb[mbmi->second_ref_frame].y_height != cm->mb_rows * 16) &&
+        this_mode == SPLITMV)
+      continue;
+
     set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
                       scale_factor);
     comp_pred = mbmi->second_ref_frame > INTRA_FRAME;
@@ -4400,9 +2625,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     // if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
     //  continue;
 
-    if (this_mode == I8X8_PRED ||
-        this_mode == I4X4_PRED ||
-        this_mode == SPLITMV)
+    if (bsize != BLOCK_SIZE_SB8X8 &&
+        (this_mode == I4X4_PRED || this_mode == SPLITMV))
       continue;
     //  if (vp9_mode_order[mode_index].second_ref_frame == INTRA_FRAME)
     //  continue;
@@ -4465,13 +2689,33 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       }
     }
 
-    if (ref_frame == INTRA_FRAME) {
+    if (this_mode == I4X4_PRED) {
+      int rate;
+
+      // Note the rate value returned here includes the cost of coding
+      // the I4X4_PRED mode : x->mbmode_cost[xd->frame_type][I4X4_PRED];
+      assert(bsize == BLOCK_SIZE_SB8X8);
+      mbmi->txfm_size = TX_4X4;
+      rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
+                                &distortion_y, INT64_MAX);
+      rate2 += rate;
+      rate2 += intra_cost_penalty;
+      distortion2 += distortion_y;
+
+      rate2 += rate_uv_intra[TX_4X4];
+      rate_uv = rate_uv_intra[TX_4X4];
+      distortion2 += dist_uv[TX_4X4];
+      distortion_uv = dist_uv[TX_4X4];
+      mbmi->uv_mode = mode_uv[TX_4X4];
+    } else if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
       vp9_build_intra_predictors_sby_s(xd, bsize);
       super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
                       bsize, txfm_cache);
 
       uv_tx = mbmi->txfm_size;
+      if (bsize < BLOCK_SIZE_MB16X16 && uv_tx == TX_8X8)
+        uv_tx = TX_4X4;
       if (bsize < BLOCK_SIZE_SB32X32 && uv_tx == TX_16X16)
         uv_tx = TX_8X8;
       else if (bsize < BLOCK_SIZE_SB64X64 && uv_tx == TX_32X32)
@@ -4483,7 +2727,137 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       mbmi->uv_mode = mode_uv[uv_tx];
 
       rate2 = rate_y + x->mbmode_cost[cm->frame_type][mbmi->mode] + rate_uv;
+      if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
+        rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
+    } else if (this_mode == SPLITMV) {
+      const int is_comp_pred = mbmi->second_ref_frame > 0;
+      int rate, distortion;
+      int64_t this_rd_thresh;
+      int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
+      int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
+      int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0;
+      int switchable_filter_index;
+      int_mv *second_ref = is_comp_pred ?
+          &mbmi->ref_mvs[mbmi->second_ref_frame][0] : NULL;
+      union b_mode_info tmp_best_bmodes[16];
+      MB_MODE_INFO tmp_best_mbmode;
+      PARTITION_INFO tmp_best_partition;
+      int pred_exists = 0;
+      int uv_skippable;
+
+      this_rd_thresh = (mbmi->ref_frame == LAST_FRAME) ?
+          cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA];
+      this_rd_thresh = (mbmi->ref_frame == GOLDEN_FRAME) ?
+          cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
+      xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+
+      for (switchable_filter_index = 0;
+           switchable_filter_index < VP9_SWITCHABLE_FILTERS;
+           ++switchable_filter_index) {
+        int newbest;
+        mbmi->interp_filter =
+        vp9_switchable_interp[switchable_filter_index];
+        vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+
+        tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
+                                             &mbmi->ref_mvs[mbmi->ref_frame][0],
+                                             second_ref, INT64_MAX, mdcounts,
+                                             &rate, &rate_y, &distortion,
+                                             &skippable,
+                                             (int)this_rd_thresh, seg_mvs);
+        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
+          [vp9_get_pred_context(&cpi->common, xd,
+                                PRED_SWITCHABLE_INTERP)]
+          [vp9_switchable_interp_map[mbmi->interp_filter]];
+          tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0);
+        }
+        newbest = (tmp_rd < tmp_best_rd);
+        if (newbest) {
+          tmp_best_filter = mbmi->interp_filter;
+          tmp_best_rd = tmp_rd;
+        }
+        if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
+            (mbmi->interp_filter == cm->mcomp_filter_type &&
+             cm->mcomp_filter_type != SWITCHABLE)) {
+              tmp_best_rdu = tmp_rd;
+              tmp_best_rate = rate;
+              tmp_best_ratey = rate_y;
+              tmp_best_distortion = distortion;
+              tmp_best_skippable = skippable;
+              vpx_memcpy(&tmp_best_mbmode, mbmi, sizeof(MB_MODE_INFO));
+              vpx_memcpy(&tmp_best_partition, x->partition_info,
+                         sizeof(PARTITION_INFO));
+              for (i = 0; i < 4; i++) {
+                tmp_best_bmodes[i] = xd->mode_info_context->bmi[i];
+              }
+              pred_exists = 1;
+            }
+      }  // switchable_filter_index loop
+
+      mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
+                             tmp_best_filter : cm->mcomp_filter_type);
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+      if (!pred_exists) {
+        // Handles the special case when a filter that is not in the
+        // switchable list (bilinear, 6-tap) is indicated at the frame level
+        tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
+                                             &mbmi->ref_mvs[mbmi->ref_frame][0],
+                                             second_ref, INT64_MAX, mdcounts,
+                                             &rate, &rate_y, &distortion,
+                                             &skippable,
+                                             (int)this_rd_thresh, seg_mvs);
+      } else {
+        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
+              [vp9_get_pred_context(&cpi->common, xd,
+                                    PRED_SWITCHABLE_INTERP)]
+              [vp9_switchable_interp_map[mbmi->interp_filter]];
+          tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
+        }
+        tmp_rd = tmp_best_rdu;
+        rate = tmp_best_rate;
+        rate_y = tmp_best_ratey;
+        distortion = tmp_best_distortion;
+        skippable = tmp_best_skippable;
+        vpx_memcpy(mbmi, &tmp_best_mbmode, sizeof(MB_MODE_INFO));
+        vpx_memcpy(x->partition_info, &tmp_best_partition,
+                   sizeof(PARTITION_INFO));
+        for (i = 0; i < 4; i++) {
+          xd->mode_info_context->bmi[i] = tmp_best_bmodes[i];
+        }
+      }
+
+      rate2 += rate;
+      distortion2 += distortion;
+
+      if (cpi->common.mcomp_filter_type == SWITCHABLE)
+        rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
+            [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
+            [vp9_switchable_interp_map[mbmi->interp_filter]];
+
+      // If even the 'Y' rd value of split is higher than best so far
+      // then don't bother looking at UV
+      vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
+                                      bsize);
+      vp9_subtract_sbuv(x, bsize);
+      super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
+                                &uv_skippable, bsize, TX_4X4);
+      rate2 += rate_uv;
+      distortion2 += distortion_uv;
+      skippable = skippable && uv_skippable;
+
+      if (!mode_excluded) {
+        if (is_comp_pred)
+          mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
+        else
+          mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
+      }
+
+      compmode_cost =
+          vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
+      mbmi->mode = this_mode;
     } else {
       YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
       int fb;
@@ -4640,6 +3014,13 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         *returndistortion = distortion2;
         best_rd = this_rd;
         vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
+        vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
+
+        if (this_mode == I4X4_PRED || this_mode == SPLITMV) {
+          for (i = 0; i < 4; i++) {
+            best_bmodes[i] = xd->mode_info_context->bmi[i];
+          }
+        }
       }
 #if 0
       // Testing this mode gave rise to an improvement in best error score.
@@ -4693,6 +3074,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     /* keep record of best txfm size */
+    if (bsize < BLOCK_SIZE_SB32X32) {
+      if (bsize < BLOCK_SIZE_MB16X16) {
+        if (this_mode == SPLITMV || this_mode == I4X4_PRED)
+          txfm_cache[ALLOW_8X8] = txfm_cache[ONLY_4X4];
+        txfm_cache[ALLOW_16X16] = txfm_cache[ALLOW_8X8];
+      }
+      txfm_cache[ALLOW_32X32] = txfm_cache[ALLOW_16X16];
+    }
     if (!mode_excluded && this_rd != INT64_MAX) {
       for (i = 0; i < NB_TXFM_MODES; i++) {
         int64_t adj_rd;
@@ -4769,13 +3158,18 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
     mbmi->mode = ZEROMV;
     mbmi->ref_frame = ALTREF_FRAME;
-    mbmi->second_ref_frame = INTRA_FRAME;
+    mbmi->second_ref_frame = NONE;
     mbmi->mv[0].as_int = 0;
     mbmi->uv_mode = DC_PRED;
     mbmi->mb_skip_coeff = 1;
-    mbmi->partitioning = 0;
-    mbmi->txfm_size = cm->txfm_mode == TX_MODE_SELECT ?
-                      TX_32X32 : cm->txfm_mode;
+    if (cm->txfm_mode == TX_MODE_SELECT) {
+      if (bsize >= BLOCK_SIZE_SB32X32)
+        mbmi->txfm_size = TX_32X32;
+      else if (bsize >= BLOCK_SIZE_MB16X16)
+        mbmi->txfm_size = TX_16X16;
+      else
+        mbmi->txfm_size = TX_8X8;
+    }
 
     vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
     vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
@@ -4784,6 +3178,26 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
   // macroblock modes
   vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+  if (best_mbmode.mode == I4X4_PRED) {
+    for (i = 0; i < 4; i++) {
+      xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
+    }
+  }
+
+  if (best_mbmode.mode == SPLITMV) {
+    for (i = 0; i < 4; i++)
+      xd->mode_info_context->bmi[i].as_mv[0].as_int =
+          best_bmodes[i].as_mv[0].as_int;
+    if (mbmi->second_ref_frame > 0)
+      for (i = 0; i < 4; i++)
+        xd->mode_info_context->bmi[i].as_mv[1].as_int =
+            best_bmodes[i].as_mv[1].as_int;
+
+    vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
+
+    mbmi->mv[0].as_int = x->partition_info->bmi[3].mv.as_int;
+    mbmi->mv[1].as_int = x->partition_info->bmi[3].second_mv.as_int;
+  }
 
   for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
     if (best_pred_rd[i] == INT64_MAX)
@@ -4806,7 +3220,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
  end:
   set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
                     scale_factor);
-  store_coding_context(x, ctx, best_mode_index, NULL,
+  store_coding_context(x, ctx, best_mode_index,
+                       &best_partition,
                        &mbmi->ref_mvs[mbmi->ref_frame][0],
                        &mbmi->ref_mvs[mbmi->second_ref_frame < 0 ? 0 :
                                       mbmi->second_ref_frame][0],
@@ -4814,41 +3229,3 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
   return best_rd;
 }
-
-void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mi_row, int mi_col,
-                                    int *totalrate, int *totaldist) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int rate, distortion;
-  int64_t intra_error = 0;
-  unsigned char *segment_id = &mbmi->segment_id;
-
-  if (xd->segmentation_enabled)
-    x->encode_breakout = cpi->segment_encode_breakout[*segment_id];
-  else
-    x->encode_breakout = cpi->oxcf.encode_breakout;
-
-  // if (cpi->sf.RD)
-  // For now this codebase is limited to a single rd encode path
-  {
-    int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
-
-    rd_pick_inter_mode(cpi, x, mi_row, mi_col, &rate,
-                       &distortion, &intra_error);
-
-    /* restore cpi->zbin_mode_boost_enabled */
-    cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
-  }
-  // else
-  // The non rd encode path has been deleted from this code base
-  // to simplify development
-  //    vp9_pick_inter_mode
-
-  // Store metrics so they can be added in to totals if this mode is picked
-  x->mb_context[xd->sb_index][xd->mb_index].distortion  = distortion;
-  x->mb_context[xd->sb_index][xd->mb_index].intra_error = intra_error;
-
-  *totalrate = rate;
-  *totaldist = distortion;
-}
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index eef2a4fe9..dcf5d00e9 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -19,17 +19,10 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex);
 
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
 
-void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            int *r, int *d);
-
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                int *r, int *d, BLOCK_SIZE_TYPE bsize,
                                PICK_MODE_CONTEXT *ctx);
 
-void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mi_row, int mi_col,
-                                    int *r, int *d);
-
 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   int mi_row, int mi_col,
                                   int *r, int *d, BLOCK_SIZE_TYPE bsize,
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index 7f792ae2b..fe995ad72 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -16,18 +16,15 @@
 #include "vp9/common/vp9_tile_common.h"
 
 void vp9_enable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+  VP9_COMP *cpi = (VP9_COMP *)ptr;
 
-  // Set the appropriate feature bit
   cpi->mb.e_mbd.segmentation_enabled = 1;
   cpi->mb.e_mbd.update_mb_segmentation_map = 1;
   cpi->mb.e_mbd.update_mb_segmentation_data = 1;
 }
 
 void vp9_disable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  // Clear the appropriate feature bit
+  VP9_COMP *cpi = (VP9_COMP *)ptr;
   cpi->mb.e_mbd.segmentation_enabled = 0;
 }
 
@@ -60,61 +57,57 @@ void vp9_set_segment_data(VP9_PTR ptr,
 }
 
 // Based on set of segment counts calculate a probability tree
-static void calc_segtree_probs(MACROBLOCKD *xd,
-                               int *segcounts,
+static void calc_segtree_probs(MACROBLOCKD *xd, int *segcounts,
                                vp9_prob *segment_tree_probs) {
   // Work out probabilities of each segment
-  segment_tree_probs[0] =
-    get_binary_prob(segcounts[0] + segcounts[1] + segcounts[2] + segcounts[3],
-                    segcounts[4] + segcounts[5] + segcounts[6] + segcounts[7]);
-  segment_tree_probs[1] =
-    get_binary_prob(segcounts[0] + segcounts[1], segcounts[2] + segcounts[3]);
-  segment_tree_probs[2] = get_binary_prob(segcounts[0], segcounts[1]);
-  segment_tree_probs[3] = get_binary_prob(segcounts[2], segcounts[3]);
-  segment_tree_probs[4] =
-    get_binary_prob(segcounts[4] + segcounts[5], segcounts[6] + segcounts[7]);
+  const int c01 = segcounts[0] + segcounts[1];
+  const int c23 = segcounts[2] + segcounts[3];
+  const int c45 = segcounts[4] + segcounts[5];
+  const int c67 = segcounts[6] + segcounts[7];
+
+  segment_tree_probs[0] = get_binary_prob(c01 + c23, c45 + c67);
+  segment_tree_probs[1] = get_binary_prob(c01, c23);
+  segment_tree_probs[2] = get_binary_prob(c45, c67);
+  segment_tree_probs[3] = get_binary_prob(segcounts[0], segcounts[1]);
+  segment_tree_probs[4] = get_binary_prob(segcounts[2], segcounts[3]);
   segment_tree_probs[5] = get_binary_prob(segcounts[4], segcounts[5]);
   segment_tree_probs[6] = get_binary_prob(segcounts[6], segcounts[7]);
 }
 
 // Based on set of segment counts and probabilities calculate a cost estimate
-static int cost_segmap(MACROBLOCKD *xd,
-                       int *segcounts,
-                       vp9_prob *probs) {
-  int cost;
-  int count1, count2;
+static int cost_segmap(MACROBLOCKD *xd, int *segcounts, vp9_prob *probs) {
+  const int c01 = segcounts[0] + segcounts[1];
+  const int c23 = segcounts[2] + segcounts[3];
+  const int c45 = segcounts[4] + segcounts[5];
+  const int c67 = segcounts[6] + segcounts[7];
+  const int c0123 = c01 + c23;
+  const int c4567 = c45 + c67;
 
   // Cost the top node of the tree
-  count1 = segcounts[0] + segcounts[1] + segcounts[2] + segcounts[3];
-  count2 = segcounts[3] + segcounts[4] + segcounts[5] + segcounts[6];
-  cost = count1 * vp9_cost_zero(probs[0]) +
-         count2 * vp9_cost_one(probs[0]);
+  int cost = c0123 * vp9_cost_zero(probs[0]) +
+             c4567 * vp9_cost_one(probs[0]);
 
   // Cost subsequent levels
-  if (count1 > 0) {
-    count1 = segcounts[0] + segcounts[1];
-    count2 = segcounts[2] + segcounts[3];
-    cost += count1 * vp9_cost_zero(probs[1]) +
-            count2 * vp9_cost_one(probs[1]);
-
-    if (count1 > 0)
-      cost += segcounts[0] * vp9_cost_zero(probs[2]) +
-              segcounts[1] * vp9_cost_one(probs[2]);
-    if (count2 > 0)
-      cost += segcounts[2] * vp9_cost_zero(probs[3]) +
-              segcounts[3] * vp9_cost_one(probs[3]);
+  if (c0123 > 0) {
+    cost += c01 * vp9_cost_zero(probs[1]) +
+            c23 * vp9_cost_one(probs[1]);
+
+    if (c01 > 0)
+      cost += segcounts[0] * vp9_cost_zero(probs[3]) +
+              segcounts[1] * vp9_cost_one(probs[3]);
+    if (c23 > 0)
+      cost += segcounts[2] * vp9_cost_zero(probs[4]) +
+              segcounts[3] * vp9_cost_one(probs[4]);
   }
 
-  if (count2 > 0) {
-    count1 = segcounts[4] + segcounts[5];
-    count2 = segcounts[6] + segcounts[7];
-    cost += count1 * vp9_cost_zero(probs[4]) +
-            count2 * vp9_cost_one(probs[4]);
+  if (c4567 > 0) {
+    cost += c45 * vp9_cost_zero(probs[2]) +
+            c67 * vp9_cost_one(probs[2]);
 
-    if (count1 > 0)
+    if (c45 > 0)
       cost += segcounts[4] * vp9_cost_zero(probs[5]) +
               segcounts[5] * vp9_cost_one(probs[5]);
-    if (count2 > 0)
+    if (c67 > 0)
       cost += segcounts[6] * vp9_cost_zero(probs[6]) +
               segcounts[7] * vp9_cost_one(probs[6]);
   }
@@ -130,11 +123,12 @@ static void count_segs(VP9_COMP *cpi,
                        int bw, int bh, int mi_row, int mi_col) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  const int segment_id = mi->mbmi.segment_id;
+  int segment_id;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
+  segment_id = mi->mbmi.segment_id;
   xd->mode_info_context = mi;
   set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
 
@@ -199,9 +193,11 @@ static void count_segs_sb(VP9_COMP *cpi, MODE_INFO *mi,
     assert(bwl < bsl && bhl < bsl);
     if (bsize == BLOCK_SIZE_SB64X64) {
       subsize = BLOCK_SIZE_SB32X32;
-    } else {
-      assert(bsize == BLOCK_SIZE_SB32X32);
+    } else if (bsize == BLOCK_SIZE_SB32X32) {
       subsize = BLOCK_SIZE_MB16X16;
+    } else {
+      assert(bsize == BLOCK_SIZE_MB16X16);
+      subsize = BLOCK_SIZE_SB8X8;
     }
 
     for (n = 0; n < 4; n++) {
@@ -238,10 +234,8 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
 
   // Set default state for the segment tree probabilities and the
   // temporal coding probabilities
-  vpx_memset(xd->mb_segment_tree_probs, 255,
-             sizeof(xd->mb_segment_tree_probs));
-  vpx_memset(cm->segment_pred_probs, 255,
-             sizeof(cm->segment_pred_probs));
+  vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
+  vpx_memset(cm->segment_pred_probs, 255, sizeof(cm->segment_pred_probs));
 
   vpx_memset(no_pred_segcounts, 0, sizeof(no_pred_segcounts));
   vpx_memset(t_unpred_seg_counts, 0, sizeof(t_unpred_seg_counts));
@@ -249,16 +243,15 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
 
   // First of all generate stats regarding how well the last segment map
   // predicts this one
-
   for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
     vp9_get_tile_col_offsets(cm, tile_col);
     mi_ptr = cm->mi + cm->cur_tile_mi_col_start;
     for (mi_row = 0; mi_row < cm->mi_rows;
-         mi_row += (4 << CONFIG_SB8X8), mi_ptr += (4 << CONFIG_SB8X8) * mis) {
+         mi_row += 8, mi_ptr += 8 * mis) {
       mi = mi_ptr;
       for (mi_col = cm->cur_tile_mi_col_start;
            mi_col < cm->cur_tile_mi_col_end;
-           mi_col += (4 << CONFIG_SB8X8), mi += (4 << CONFIG_SB8X8)) {
+           mi_col += 8, mi += 8) {
         count_segs_sb(cpi, mi, no_pred_segcounts, temporal_predictor_count,
                       t_unpred_seg_counts, mi_row, mi_col, BLOCK_SIZE_SB64X64);
       }
@@ -279,27 +272,24 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
 
     // Add in the cost of the signalling for each prediction context
     for (i = 0; i < PREDICTION_PROBS; i++) {
-      t_nopred_prob[i] = get_binary_prob(temporal_predictor_count[i][0],
-                                         temporal_predictor_count[i][1]);
+      const int count0 = temporal_predictor_count[i][0];
+      const int count1 = temporal_predictor_count[i][1];
+
+      t_nopred_prob[i] = get_binary_prob(count0, count1);
 
       // Add in the predictor signaling cost
-      t_pred_cost += (temporal_predictor_count[i][0] *
-                      vp9_cost_zero(t_nopred_prob[i])) +
-                     (temporal_predictor_count[i][1] *
-                      vp9_cost_one(t_nopred_prob[i]));
+      t_pred_cost += count0 * vp9_cost_zero(t_nopred_prob[i]) +
+                     count1 * vp9_cost_one(t_nopred_prob[i]);
     }
   }
 
   // Now choose which coding method to use.
   if (t_pred_cost < no_pred_cost) {
     cm->temporal_update = 1;
-    vpx_memcpy(xd->mb_segment_tree_probs,
-               t_pred_tree, sizeof(t_pred_tree));
-    vpx_memcpy(&cm->segment_pred_probs,
-               t_nopred_prob, sizeof(t_nopred_prob));
+    vpx_memcpy(xd->mb_segment_tree_probs, t_pred_tree, sizeof(t_pred_tree));
+    vpx_memcpy(cm->segment_pred_probs, t_nopred_prob, sizeof(t_nopred_prob));
   } else {
     cm->temporal_update = 0;
-    vpx_memcpy(xd->mb_segment_tree_probs,
-               no_pred_tree, sizeof(no_pred_tree));
+    vpx_memcpy(xd->mb_segment_tree_probs, no_pred_tree, sizeof(no_pred_tree));
   }
 }
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 1e6b9840b..6bd8b5036 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -481,7 +481,7 @@ void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
   // Note: this_frame->frame has been updated in the loop
   // so it now points at the ARF frame.
   half_gf_int = cpi->baseline_gf_interval >> 1;
-  frames_after_arf = (int)(cpi->twopass.total_stats->count - this_frame - 1);
+  frames_after_arf = (int)(cpi->twopass.total_stats.count - this_frame - 1);
 
   switch (cpi->oxcf.arnr_type) {
     case 1:  // Backward filter
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 3c3367071..9a6598581 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -112,8 +112,6 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
   PLANE_TYPE type = plane ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC;
   TX_SIZE tx_size = ss_txfrm_size / 2;
   int dry_run = args->dry_run;
-  int ib = old_block_idx_4x4(xd, b_width_log2(bsize) + b_height_log2(bsize),
-                             plane, block);
 
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   int pt; /* near block/prev token context index */
@@ -158,7 +156,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
     default:
     case TX_4X4: {
       tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-          get_tx_type_4x4(xd, ib) : DCT_DCT;
+          get_tx_type_4x4(xd, block) : DCT_DCT;
       above_ec = A[0] != 0;
       left_ec = L[0] != 0;
       seg_eob = 16;
@@ -173,7 +171,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
     }
     case TX_8X8: {
       const int sz = 1 + b_width_log2(sb_type);
-      const int x = ib & ((1 << sz) - 1), y = ib - x;
+      const int x = block & ((1 << sz) - 1), y = block - x;
       tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
           get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
       above_ec = (A[0] + A[1]) != 0;
@@ -190,7 +188,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
     }
     case TX_16X16: {
       const int sz = 2 + b_width_log2(sb_type);
-      const int x = ib & ((1 << sz) - 1), y = ib - x;
+      const int x = block & ((1 << sz) - 1), y = block - x;
       tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
           get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
       above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
@@ -376,7 +374,8 @@ int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
 int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
   int result = 1;
   struct is_skippable_args args = {xd, &result};
-  foreach_transformed_block_in_plane(xd, bsize, 0, 0, is_skippable, &args);
+  foreach_transformed_block_in_plane(xd, bsize, 0,
+                                     is_skippable, &args);
   return result;
 }
 
diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c
index c4c70df43..c2a600408 100644
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c
@@ -239,6 +239,32 @@ unsigned int vp9_variance8x8_c(const uint8_t *src_ptr,
   return (var - (((unsigned int)avg * avg) >> 6));
 }
 
+unsigned int vp9_variance8x4_c(const uint8_t *src_ptr,
+                               int  source_stride,
+                               const uint8_t *ref_ptr,
+                               int  recon_stride,
+                               unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4, &var, &avg);
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 5));
+}
+
+unsigned int vp9_variance4x8_c(const uint8_t *src_ptr,
+                               int  source_stride,
+                               const uint8_t *ref_ptr,
+                               int  recon_stride,
+                               unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8, &var, &avg);
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 5));
+}
+
 unsigned int vp9_variance4x4_c(const uint8_t *src_ptr,
                                int  source_stride,
                                const uint8_t *ref_ptr,
diff --git a/vp9/encoder/x86/vp9_quantize_mmx.asm b/vp9/encoder/x86/vp9_quantize_mmx.asm
deleted file mode 100644
index 22e235610..000000000
--- a/vp9/encoder/x86/vp9_quantize_mmx.asm
+++ /dev/null
@@ -1,286 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;int vp9_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
-;                           short *qcoeff_ptr,short *dequant_ptr,
-;                           short *scan_mask, short *round_ptr,
-;                           short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp9_fast_quantize_b_impl_mmx) PRIVATE
-sym(vp9_fast_quantize_b_impl_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;coeff_ptr
-        movq            mm0,        [rsi]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm1,        [rax]
-
-        movq            mm3,        mm0
-        psraw           mm0,        15
-
-        pxor            mm3,        mm0
-        psubw           mm3,        mm0         ; abs
-
-        movq            mm2,        mm3
-        pcmpgtw         mm1,        mm2
-
-        pandn           mm1,        mm2
-        movq            mm3,        mm1
-
-        mov             rdx,        arg(6) ;quant_ptr
-        movq            mm1,        [rdx]
-
-        mov             rcx,        arg(5) ;round_ptr
-        movq            mm2,        [rcx]
-
-        paddw           mm3,        mm2
-        pmulhuw         mm3,        mm1
-
-        pxor            mm3,        mm0
-        psubw           mm3,        mm0     ;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-        movq            mm0,        mm3
-
-        movq            [rdi],      mm3
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm2,        [rax]
-
-        pmullw          mm3,        mm2
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax],      mm3
-
-        ; next 8
-        movq            mm4,        [rsi+8]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+8]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+8]
-        movq            mm6,        [rcx+8]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+8],    mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+8]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+8],    mm7
-
-
-                ; next 8
-        movq            mm4,        [rsi+16]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+16]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+16]
-        movq            mm6,        [rcx+16]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+16],   mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+16]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+16],   mm7
-
-
-                ; next 8
-        movq            mm4,        [rsi+24]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+24]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+24]
-        movq            mm6,        [rcx+24]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+24],   mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+24]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+24],   mm7
-
-
-
-        mov             rdi,        arg(4) ;scan_mask
-        mov             rsi,        arg(2) ;qcoeff_ptr
-
-        pxor            mm5,        mm5
-        pxor            mm7,        mm7
-
-        movq            mm0,        [rsi]
-        movq            mm1,        [rsi+8]
-
-        movq            mm2,        [rdi]
-        movq            mm3,        [rdi+8];
-
-        pcmpeqw         mm0,        mm7
-        pcmpeqw         mm1,        mm7
-
-        pcmpeqw         mm6,        mm6
-        pxor            mm0,        mm6
-
-        pxor            mm1,        mm6
-        psrlw           mm0,        15
-
-        psrlw           mm1,        15
-        pmaddwd         mm0,        mm2
-
-        pmaddwd         mm1,        mm3
-        movq            mm5,        mm0
-
-        paddd           mm5,        mm1
-
-        movq            mm0,        [rsi+16]
-        movq            mm1,        [rsi+24]
-
-        movq            mm2,        [rdi+16]
-        movq            mm3,        [rdi+24];
-
-        pcmpeqw         mm0,        mm7
-        pcmpeqw         mm1,        mm7
-
-        pcmpeqw         mm6,        mm6
-        pxor            mm0,        mm6
-
-        pxor            mm1,        mm6
-        psrlw           mm0,        15
-
-        psrlw           mm1,        15
-        pmaddwd         mm0,        mm2
-
-        pmaddwd         mm1,        mm3
-        paddd           mm5,        mm0
-
-        paddd           mm5,        mm1
-        movq            mm0,        mm5
-
-        psrlq           mm5,        32
-        paddd           mm0,        mm5
-
-        ; eob adjustment begins here
-        movq            rcx,        mm0
-        and             rcx,        0xffff
-
-        xor             rdx,        rdx
-        sub             rdx,        rcx ; rdx=-rcx
-
-        bsr             rax,        rcx
-        inc             rax
-
-        sar             rdx,        31
-        and             rax,        rdx
-        ; Substitute the sse assembly for the old mmx mixed assembly/C. The
-        ; following is kept as reference
-        ;    movq            rcx,        mm0
-        ;    bsr             rax,        rcx
-        ;
-        ;    mov             eob,        rax
-        ;    mov             eee,        rcx
-        ;
-        ;if(eee==0)
-        ;{
-        ;    eob=-1;
-        ;}
-        ;else if(eee<0)
-        ;{
-        ;    eob=15;
-        ;}
-        ;d->eob = eob+1;
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/vp9/encoder/x86/vp9_quantize_sse2.asm b/vp9/encoder/x86/vp9_quantize_sse2.asm
deleted file mode 100644
index 700e64b1f..000000000
--- a/vp9/encoder/x86/vp9_quantize_sse2.asm
+++ /dev/null
@@ -1,379 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-; void vp9_regular_quantize_b_sse2 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp9_regular_quantize_b_sse2) PRIVATE
-sym(vp9_regular_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SAVE_XMM 7
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-
-    ALIGN_STACK 16, rax
-    %define zrun_zbin_boost   0  ;  8
-    %define abs_minus_zbin    8  ; 32
-    %define temp_qcoeff       40 ; 32
-    %define qcoeff            72 ; 32
-    %define stack_size        104
-    sub         rsp, stack_size
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rdx, [rdi + vp9_block_coeff] ; coeff_ptr
-    mov         rcx, [rdi + vp9_block_zbin] ; zbin_ptr
-    movd        xmm7, [rdi + vp9_block_zbin_extra] ; zbin_oq_value
-
-    ; z
-    movdqa      xmm0, [rdx]
-    movdqa      xmm4, [rdx + 16]
-    mov         rdx, [rdi + vp9_block_round] ; round_ptr
-
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; (z ^ sz)
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-
-    ; x = abs(z)
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-    mov         rcx, [rdi + vp9_block_quant] ; quant_ptr
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm2, xmm7
-    paddw       xmm3, xmm7
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm1, xmm2
-    psubw       xmm5, xmm3
-    movdqa      [rsp + abs_minus_zbin], xmm1
-    movdqa      [rsp + abs_minus_zbin + 16], xmm5
-
-    ; add (zbin_ptr + zbin_oq_value) back
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    movdqa      xmm2, [rdx]
-    movdqa      xmm6, [rdx + 16]
-
-    movdqa      xmm3, [rcx]
-    movdqa      xmm7, [rcx + 16]
-
-    ; x + round
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm6
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm3, xmm1
-    pmulhw      xmm7, xmm5
-
-    ; y += x
-    paddw       xmm1, xmm3
-    paddw       xmm5, xmm7
-
-    movdqa      [rsp + temp_qcoeff], xmm1
-    movdqa      [rsp + temp_qcoeff + 16], xmm5
-
-    pxor        xmm6, xmm6
-    ; zero qcoeff
-    movdqa      [rsp + qcoeff], xmm6
-    movdqa      [rsp + qcoeff + 16], xmm6
-
-    mov         rdx, [rdi + vp9_block_zrun_zbin_boost] ; zbin_boost_ptr
-    mov         rax, [rdi + vp9_block_quant_shift] ; quant_shift_ptr
-    mov         [rsp + zrun_zbin_boost], rdx
-
-%macro ZIGZAG_LOOP 1
-    ; x
-    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1           ; x < zbin
-
-    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
-
-    ; downshift by quant_shift[rc]
-    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1           ; !y
-    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-    mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp9_default_zig_zag1d order: see vp9/common/vp9_entropy.c
-ZIGZAG_LOOP  0
-ZIGZAG_LOOP  1
-ZIGZAG_LOOP  4
-ZIGZAG_LOOP  8
-ZIGZAG_LOOP  5
-ZIGZAG_LOOP  2
-ZIGZAG_LOOP  3
-ZIGZAG_LOOP  6
-ZIGZAG_LOOP  9
-ZIGZAG_LOOP 12
-ZIGZAG_LOOP 13
-ZIGZAG_LOOP 10
-ZIGZAG_LOOP  7
-ZIGZAG_LOOP 11
-ZIGZAG_LOOP 14
-ZIGZAG_LOOP 15
-
-    movdqa      xmm2, [rsp + qcoeff]
-    movdqa      xmm3, [rsp + qcoeff + 16]
-
-    mov         rcx, [rsi + vp9_blockd_dequant] ; dequant_ptr
-    mov         rdi, [rsi + vp9_blockd_dqcoeff] ; dqcoeff_ptr
-
-    ; y ^ sz
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm4
-    ; x = (y ^ sz) - sz
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm4
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp9_blockd_qcoeff] ; qcoeff_ptr
-
-    pmullw      xmm0, xmm2
-    pmullw      xmm1, xmm3
-
-    movdqa      [rcx], xmm2        ; store qcoeff
-    movdqa      [rcx + 16], xmm3
-    movdqa      [rdi], xmm0        ; store dqcoeff
-    movdqa      [rdi + 16], xmm1
-
-    ; select the last value (in zig_zag order) for EOB
-    pcmpeqw     xmm2, xmm6
-    pcmpeqw     xmm3, xmm6
-    ; !
-    pcmpeqw     xmm6, xmm6
-    pxor        xmm2, xmm6
-    pxor        xmm3, xmm6
-    ; mask inv_zig_zag
-    pand        xmm2, [GLOBAL(inv_zig_zag)]
-    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
-    ; select the max value
-    pmaxsw      xmm2, xmm3
-    pshufd      xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00000001b
-    pmaxsw      xmm2, xmm3
-    movd        eax, xmm2
-    and         eax, 0xff
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-    add         rsp, stack_size
-    pop         rsp
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-    RESTORE_GOT
-    RESTORE_XMM
-    pop         rbp
-    ret
-
-; void vp9_fast_quantize_b_sse2 | arg
-;  (BLOCK  *b,                  |  0
-;   BLOCKD *d)                  |  1
-
-global sym(vp9_fast_quantize_b_sse2) PRIVATE
-sym(vp9_fast_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %else
-    ; these registers are used for passing arguments
-  %endif
-%endif
-
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_round]
-    mov         rdx, [rdi + vp9_block_quant_fast]
-
-    ; z = coeff
-    movdqa      xmm0, [rax]
-    movdqa      xmm4, [rax + 16]
-
-    ; dup z so we can save sz
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz = z >> 15
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; x = abs(z) = (z ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; x += round
-    paddw       xmm1, [rcx]
-    paddw       xmm5, [rcx + 16]
-
-    mov         rax, [rsi + vp9_blockd_qcoeff]
-    mov         rcx, [rsi + vp9_blockd_dequant]
-    mov         rdi, [rsi + vp9_blockd_dqcoeff]
-
-    ; y = x * quant >> 16
-    pmulhw      xmm1, [rdx]
-    pmulhw      xmm5, [rdx + 16]
-
-    ; x = (y ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; qcoeff = x
-    movdqa      [rax], xmm1
-    movdqa      [rax + 16], xmm5
-
-    ; x * dequant
-    movdqa      xmm2, xmm1
-    movdqa      xmm3, xmm5
-    pmullw      xmm2, [rcx]
-    pmullw      xmm3, [rcx + 16]
-
-    ; dqcoeff = x * dequant
-    movdqa      [rdi], xmm2
-    movdqa      [rdi + 16], xmm3
-
-    pxor        xmm4, xmm4                  ;clear all bits
-    pcmpeqw     xmm1, xmm4
-    pcmpeqw     xmm5, xmm4
-
-    pcmpeqw     xmm4, xmm4                  ;set all bits
-    pxor        xmm1, xmm4
-    pxor        xmm5, xmm4
-
-    pand        xmm1, [GLOBAL(inv_zig_zag)]
-    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]
-
-    pmaxsw      xmm1, xmm5
-
-    ; now down to 8
-    pshufd      xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; only 4 left
-    pshuflw     xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; okay, just 2!
-    pshuflw     xmm5, xmm1, 00000001b
-
-    pmaxsw      xmm1, xmm5
-
-    movd        eax, xmm1
-    and         eax, 0xff
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-inv_zig_zag:
-  dw 0x0001, 0x0002, 0x0006, 0x0007
-  dw 0x0003, 0x0005, 0x0008, 0x000d
-  dw 0x0004, 0x0009, 0x000c, 0x000e
-  dw 0x000a, 0x000b, 0x000f, 0x0010
diff --git a/vp9/encoder/x86/vp9_quantize_sse4.asm b/vp9/encoder/x86/vp9_quantize_sse4.asm
deleted file mode 100644
index 4c14e5ffe..000000000
--- a/vp9/encoder/x86/vp9_quantize_sse4.asm
+++ /dev/null
@@ -1,253 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-; void vp9_regular_quantize_b_sse4 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp9_regular_quantize_b_sse4) PRIVATE
-sym(vp9_regular_quantize_b_sse4):
-
-%if ABI_IS_32BIT
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-    push        rdi
-    push        rsi
-
-    ALIGN_STACK 16, rax
-    %define qcoeff      0 ; 32
-    %define stack_size 32
-    sub         rsp, stack_size
-%else
-  %if LIBVPX_YASM_WIN64
-    SAVE_XMM 8, u
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_zbin]
-    mov         rdx, [rdi + vp9_block_round]
-    movd        xmm7, [rdi + vp9_block_zbin_extra]
-
-    ; z
-    movdqa      xmm0, [rax]
-    movdqa      xmm1, [rax + 16]
-
-    ; duplicate zbin_oq_value
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7
-
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm1, 15
-
-    ; (z ^ sz)
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm1
-
-    ; x = abs(z)
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm1
-
-    ; zbin
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm4, xmm7
-    paddw       xmm5, xmm7
-
-    movdqa      xmm6, xmm2
-    movdqa      xmm7, xmm3
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm6, xmm4
-    psubw       xmm7, xmm5
-
-    ; round
-    movdqa      xmm4, [rdx]
-    movdqa      xmm5, [rdx + 16]
-
-    mov         rax, [rdi + vp9_block_quant_shift]
-    mov         rcx, [rdi + vp9_block_quant]
-    mov         rdx, [rdi + vp9_block_zrun_zbin_boost]
-
-    ; x + round
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    ; quant
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm4, xmm2
-    pmulhw      xmm5, xmm3
-
-    ; y += x
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    pxor        xmm4, xmm4
-%if ABI_IS_32BIT
-    movdqa      [rsp + qcoeff], xmm4
-    movdqa      [rsp + qcoeff + 16], xmm4
-%else
-    pxor        xmm8, xmm8
-%endif
-
-    ; quant_shift
-    movdqa      xmm5, [rax]
-
-    ; zrun_zbin_boost
-    mov         rax, rdx
-
-%macro ZIGZAG_LOOP 5
-    ; x
-    pextrw      ecx, %4, %2
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1          ; x < zbin
-
-    pextrw      edi, %3, %2                 ; y
-
-    ; downshift by quant_shift[rc]
-    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1          ; !y
-%if ABI_IS_32BIT
-    mov         WORD PTR[rsp + qcoeff + %1 *2], di
-%else
-    pinsrw      %5, edi, %2                 ; qcoeff[rc]
-%endif
-    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp9_default_zig_zag1d order: see vp9/common/vp9_entropy.c
-ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
-
-    mov         rcx, [rsi + vp9_blockd_dequant]
-    mov         rdi, [rsi + vp9_blockd_dqcoeff]
-
-%if ABI_IS_32BIT
-    movdqa      xmm4, [rsp + qcoeff]
-    movdqa      xmm5, [rsp + qcoeff + 16]
-%else
-    %define     xmm5 xmm8
-%endif
-
-    ; y ^ sz
-    pxor        xmm4, xmm0
-    pxor        xmm5, xmm1
-    ; x = (y ^ sz) - sz
-    psubw       xmm4, xmm0
-    psubw       xmm5, xmm1
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp9_blockd_qcoeff]
-
-    pmullw      xmm0, xmm4
-    pmullw      xmm1, xmm5
-
-    ; store qcoeff
-    movdqa      [rcx], xmm4
-    movdqa      [rcx + 16], xmm5
-
-    ; store dqcoeff
-    movdqa      [rdi], xmm0
-    movdqa      [rdi + 16], xmm1
-
-    ; select the last value (in zig_zag order) for EOB
-    pxor        xmm6, xmm6
-    pcmpeqw     xmm4, xmm6
-    pcmpeqw     xmm5, xmm6
-
-    packsswb    xmm4, xmm5
-    pshufb      xmm4, [GLOBAL(zig_zag1d)]
-    pmovmskb    edx, xmm4
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax
-    bsr         eax, edx
-    sub         edi, edx
-    sar         edi, 31
-    add         eax, 1
-    and         eax, edi
-
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    add         rsp, stack_size
-    pop         rsp
-
-    pop         rsi
-    pop         rdi
-    RESTORE_GOT
-    pop         rbp
-%else
-  %undef xmm5
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-    RESTORE_XMM
-  %endif
-%endif
-
-    ret
-
-SECTION_RODATA
-align 16
-; vp9/common/vp9_entropy.c: vp9_default_zig_zag1d
-zig_zag1d:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3.asm
deleted file mode 100644
index 1fa052147..000000000
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ /dev/null
@@ -1,137 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-; void vp9_fast_quantize_b_ssse3 | arg
-;  (BLOCK  *b,                   |  0
-;   BLOCKD *d)                   |  1
-;
-
-global sym(vp9_fast_quantize_b_ssse3) PRIVATE
-sym(vp9_fast_quantize_b_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_round]
-    mov         rdx, [rdi + vp9_block_quant_fast]
-
-    ; coeff
-    movdqa      xmm0, [rax]
-    movdqa      xmm4, [rax + 16]
-
-    ; round
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz = z >> 15
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    pabsw       xmm1, xmm1
-    pabsw       xmm5, xmm5
-
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    ; quant_fast
-    pmulhw      xmm1, [rdx]
-    pmulhw      xmm5, [rdx + 16]
-
-    mov         rax, [rsi + vp9_blockd_qcoeff]
-    mov         rdi, [rsi + vp9_blockd_dequant]
-    mov         rcx, [rsi + vp9_blockd_dqcoeff]
-
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      [rax], xmm1
-    movdqa      [rax + 16], xmm5
-
-    movdqa      xmm2, [rdi]
-    movdqa      xmm3, [rdi + 16]
-
-    pxor        xmm4, xmm4
-    pmullw      xmm2, xmm1
-    pmullw      xmm3, xmm5
-
-    pcmpeqw     xmm1, xmm4                  ;non zero mask
-    pcmpeqw     xmm5, xmm4                  ;non zero mask
-    packsswb    xmm1, xmm5
-    pshufb      xmm1, [GLOBAL(zz_shuf)]
-
-    pmovmskb    edx, xmm1
-
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax                      ;flip the bits for bsr
-    bsr         eax, edx
-
-    movdqa      [rcx], xmm2                 ;store dqcoeff
-    movdqa      [rcx + 16], xmm3            ;store dqcoeff
-
-    sub         edi, edx                    ;check for all zeros in bit mask
-    sar         edi, 31                     ;0 or -1
-    add         eax, 1
-    and         eax, edi                    ;if the bit mask was all zero,
-                                            ;then eob = 0
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-zz_shuf:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
diff --git a/vp9/encoder/x86/vp9_quantize_x86.h b/vp9/encoder/x86/vp9_quantize_x86.h
deleted file mode 100644
index d1db17394..000000000
--- a/vp9/encoder/x86/vp9_quantize_x86.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
- */
-
-#ifndef VP9_ENCODER_X86_VP9_QUANTIZE_X86_H_
-#define VP9_ENCODER_X86_VP9_QUANTIZE_X86_H_
-
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-#if HAVE_MMX
-
-#endif /* HAVE_MMX */
-
-
-#if HAVE_SSE2
-extern prototype_quantize_block(vp9_regular_quantize_b_sse2);
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_quantize_quantb
-#define vp9_quantize_quantb vp9_regular_quantize_b_sse2
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_SSE2 */
-
-
-#if HAVE_SSE4_1
-extern prototype_quantize_block(vp9_regular_quantize_b_sse4);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_quantize_quantb
-#define vp9_quantize_quantb vp9_regular_quantize_b_sse4
-
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_SSE4_1 */
-
-#endif /* QUANTIZE_X86_H */