10 files changed, 515 insertions, 894 deletions
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 20154d813..ac29a8e0c 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -905,6 +905,7 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
     case PARTITION_SPLIT:
       for (n = 0; n < 4; n++) {
         int j = n >> 1, i = n & 0x01;
+        *(get_sb_index(xd, subsize)) = n;
         write_modes_sb(cpi, m + j * bs * mis + i * bs, bc, tok, tok_end,
                        mi_row + j * bs, mi_col + i * bs, subsize);
       }
@@ -914,12 +915,11 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
   }
 
   // update partition context
-  if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_MB16X16))
-    return;
-
-  xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
-  xd->above_seg_context = cm->above_seg_context + mi_col;
-  update_partition_context(xd, subsize, bsize);
+  if (bsize > BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) {
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    update_partition_context(xd, subsize, bsize);
+  }
 }
 
 static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
@@ -1242,16 +1242,6 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
 FILE *vpxlogc = 0;
 #endif
 
-static void put_delta_q(vp9_writer *bc, int delta_q) {
-  if (delta_q != 0) {
-    vp9_write_bit(bc, 1);
-    vp9_write_literal(bc, abs(delta_q), 4);
-    vp9_write_bit(bc, delta_q < 0);
-  } else {
-    vp9_write_bit(bc, 0);
-  }
-}
-
 static void decide_kf_ymode_entropy(VP9_COMP *cpi) {
   int mode_cost[MB_MODE_COUNT];
   int bestcost = INT_MAX;
@@ -1298,9 +1288,21 @@ static void segment_reference_frames(VP9_COMP *cpi) {
   }
 }
 
-static void encode_loopfilter(MACROBLOCKD *xd, vp9_writer *w) {
+static void encode_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_writer *w) {
   int i;
 
+  // Encode the loop filter level and type
+  vp9_write_literal(w, pc->filter_level, 6);
+  vp9_write_literal(w, pc->sharpness_level, 3);
+#if CONFIG_LOOP_DERING
+  if (pc->dering_enabled) {
+    vp9_write_bit(w, 1);
+    vp9_write_literal(w, pc->dering_enabled - 1, 4);
+  } else {
+    vp9_write_bit(w, 0);
+  }
+#endif
+
   // Write out loop filter deltas applied at the MB level based on mode or
   // ref frame (if they are enabled).
   vp9_write_bit(w, xd->mode_ref_lf_delta_enabled);
@@ -1354,6 +1356,24 @@ static void encode_loopfilter(MACROBLOCKD *xd, vp9_writer *w) {
   }
 }
 
+static void put_delta_q(vp9_writer *bc, int delta_q) {  // write one Q delta
+  if (delta_q != 0) {
+    vp9_write_bit(bc, 1);  // flag: a non-zero delta follows
+    vp9_write_literal(bc, abs(delta_q), 4);  // magnitude, width argument 4
+    vp9_write_bit(bc, delta_q < 0);  // sign: 1 when the delta is negative
+  } else {
+    vp9_write_bit(bc, 0);  // flag: delta is zero, nothing else is written
+  }
+}
+
+static void encode_quantization(VP9_COMMON *pc, vp9_writer *w) {
+  vp9_write_literal(w, pc->base_qindex, QINDEX_BITS);  // frame base Q index
+  put_delta_q(w, pc->y_dc_delta_q);  // then three deltas via put_delta_q(),
+  put_delta_q(w, pc->uv_dc_delta_q);  // always in this order:
+  put_delta_q(w, pc->uv_ac_delta_q);  // y_dc, uv_dc, uv_ac
+}
+
+
 static void encode_segmentation(VP9_COMP *cpi, vp9_writer *w) {
   int i, j;
   VP9_COMMON *const pc = &cpi->common;
@@ -1495,27 +1515,9 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
   // lossless mode: note this needs to be before loopfilter
   vp9_write_bit(&header_bc, cpi->mb.e_mbd.lossless);
 
-  // Encode the loop filter level and type
-  vp9_write_literal(&header_bc, pc->filter_level, 6);
-  vp9_write_literal(&header_bc, pc->sharpness_level, 3);
-#if CONFIG_LOOP_DERING
-  if (pc->dering_enabled) {
-    vp9_write_bit(&header_bc, 1);
-    vp9_write_literal(&header_bc, pc->dering_enabled - 1, 4);
-  } else {
-    vp9_write_bit(&header_bc, 0);
-  }
-#endif
-
-  encode_loopfilter(xd, &header_bc);
-
-  // Frame Q baseline quantizer index
-  vp9_write_literal(&header_bc, pc->base_qindex, QINDEX_BITS);
+  encode_loopfilter(pc, xd, &header_bc);
 
-  // Transmit Dc, Second order and Uv quantizer delta information
-  put_delta_q(&header_bc, pc->y_dc_delta_q);
-  put_delta_q(&header_bc, pc->uv_dc_delta_q);
-  put_delta_q(&header_bc, pc->uv_ac_delta_q);
+  encode_quantization(pc, &header_bc);
 
   // When there is a key frame all reference buffers are updated using the new key frame
   if (pc->frame_type != KEY_FRAME) {
@@ -1805,6 +1807,8 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
     int scaling = (pc->width != pc->display_width ||
                    pc->height != pc->display_height);
     int v = (oh.first_partition_length_in_bytes << 8) |
+            (pc->subsampling_y << 7) |
+            (pc->subsampling_x << 6) |
             (scaling << 5) |
             (oh.show_frame << 4) |
             (oh.version << 1) |
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 6bc42c7ff..44261481c 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -138,8 +138,8 @@ struct macroblock {
 
   int optimize;
 
-  // Structure to hold context for each of the 4 MBs within a SB:
-  // when encoded as 4 independent MBs:
+  // TODO(jingning): Need to refactor the structure arrays that buffer the
+  // coding mode decisions of each partition type.
   PICK_MODE_CONTEXT sb8_context[4][4][4];
   PICK_MODE_CONTEXT sb8x16_context[4][4][2];
   PICK_MODE_CONTEXT sb16x8_context[4][4][2];
@@ -153,6 +153,10 @@ struct macroblock {
   PICK_MODE_CONTEXT sb64_context;
   int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 
+  BLOCK_SIZE_TYPE mb_partitioning[4][4];
+  BLOCK_SIZE_TYPE sb_partitioning[4];
+  BLOCK_SIZE_TYPE sb64_partitioning;
+
   void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
   void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
   void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 49e8ccefa..2edeb7807 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -539,15 +539,6 @@ void vp9_setup_src_planes(MACROBLOCK *x,
                    x->e_mbd.plane[2].subsampling_y);
 }
 
-static INLINE void set_partition_seg_context(VP9_COMP *cpi,
-                                             int mi_row, int mi_col) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-
-  xd->above_seg_context = cm->above_seg_context + mi_col;
-  xd->left_seg_context  = cm->left_seg_context + (mi_row & MI_MASK);
-}
-
 static void set_offsets(VP9_COMP *cpi,
                         int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) {
   MACROBLOCK *const x = &cpi->mb;
@@ -571,7 +562,7 @@ static void set_offsets(VP9_COMP *cpi,
   }
 
   // partition contexts
-  set_partition_seg_context(cpi, mi_row, mi_col);
+  set_partition_seg_context(cm, xd, mi_row, mi_col);
 
   // Activity map pointer
   x->mb_activity_ptr = &cpi->mb_activity_map[idx_map];
@@ -731,6 +722,9 @@ static void set_block_index(MACROBLOCKD *xd, int idx,
   }
 }
 
+// TODO(jingning): the variables used here are a little complicated. Further
+// refactoring is needed to organize the temporary buffers when recursive
+// partitioning down to 4x4 block size is enabled.
 static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
                                             BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -762,6 +756,52 @@ static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
   }
 }
 
+static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x,
+                                            BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD *xd = &x->e_mbd;  // supplies the current sb_index / mb_index
+  switch (bsize) {
+    case BLOCK_SIZE_SB64X64:
+      return &x->sb64_partitioning;  // single slot, no index involved
+    case BLOCK_SIZE_SB32X32:
+      return &x->sb_partitioning[xd->sb_index];  // slot chosen by sb_index
+    case BLOCK_SIZE_MB16X16:
+      return &x->mb_partitioning[xd->sb_index][xd->mb_index];  // sb + mb index
+    default:
+      assert(0);  // no partitioning slot exists for any other block size
+      return NULL;  // NOTE(review): callers in this patch dereference directly
+  }
+}
+
+static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
+                            ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+                            ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+                            PARTITION_CONTEXT sa[8],
+                            PARTITION_CONTEXT sl[8],
+                            BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const cm = &cpi->common;  // destination of every copy below
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;  // read only for per-plane subsampling
+  int p;
+  int bwl = b_width_log2(bsize), bw = 1 << bwl;  // bw: per-plane stride in a[]
+  int bhl = b_height_log2(bsize), bh = 1 << bhl;  // bh: per-plane stride in l[]
+  int mwl = mi_width_log2(bsize), mw = 1 << mwl;  // mw: entries taken from sa[]
+  int mhl = mi_height_log2(bsize), mh = 1 << mhl;  // mh: entries taken from sl[]
+  for (p = 0; p < MAX_MB_PLANE; p++) {  // entropy contexts, one plane at a time
+    vpx_memcpy(cm->above_context[p] +
+               ((mi_col * 2) >> xd->plane[p].subsampling_x),
+               a + bw * p,
+               sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
+    vpx_memcpy(cm->left_context[p] +
+               ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+               l + bh * p,
+               sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
+  }
+  vpx_memcpy(cm->above_seg_context + mi_col, sa,  // above partition context
+             sizeof(PARTITION_CONTEXT) * mw);
+  vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl,  // left, row masked
+             sizeof(PARTITION_CONTEXT) * mh);
+}
+
 static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
                      int mi_row, int mi_col, int output_enabled,
                      BLOCK_SIZE_TYPE bsize, int sub_index) {
@@ -788,27 +828,28 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
 
 static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
                       int mi_row, int mi_col, int output_enabled,
-                      BLOCK_SIZE_TYPE level,
-                      BLOCK_SIZE_TYPE c1, BLOCK_SIZE_TYPE c2[4],
-                      BLOCK_SIZE_TYPE c3[4][4]
-                      ) {
+                      BLOCK_SIZE_TYPE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const int bsl = mi_width_log2(level), bs = 1 << (bsl - 1);
-  const int bwl = mi_width_log2(c1), bhl = mi_height_log2(c1);
+  BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8;
+  const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1);
+  int bwl, bhl;
   int UNINITIALIZED_IS_SAFE(pl);
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  if (level > BLOCK_SIZE_SB8X8) {
-    set_partition_seg_context(cpi, mi_row, mi_col);
-    pl = partition_plane_context(xd, level);
+  if (bsize > BLOCK_SIZE_SB8X8) {
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    c1 = *(get_sb_partitioning(x, bsize));
   }
 
+  bwl = mi_width_log2(c1), bhl = mi_height_log2(c1);
+
   if (bsl == bwl && bsl == bhl) {
-    if (output_enabled && level > BLOCK_SIZE_SB8X8)
+    if (output_enabled && bsize > BLOCK_SIZE_SB8X8)
       cpi->partition_count[pl][PARTITION_NONE]++;
     encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
   } else if (bsl == bhl && bsl > bwl) {
@@ -826,12 +867,12 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
     int i;
 
     assert(bwl < bsl && bhl < bsl);
-    if (level == BLOCK_SIZE_SB64X64) {
+    if (bsize == BLOCK_SIZE_SB64X64) {
       subsize = BLOCK_SIZE_SB32X32;
-    } else if (level == BLOCK_SIZE_SB32X32) {
+    } else if (bsize == BLOCK_SIZE_SB32X32) {
       subsize = BLOCK_SIZE_MB16X16;
     } else {
-      assert(level == BLOCK_SIZE_MB16X16);
+      assert(bsize == BLOCK_SIZE_MB16X16);
       subsize = BLOCK_SIZE_SB8X8;
     }
 
@@ -843,554 +884,200 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
 
       set_block_index(xd, i, subsize);
       encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
-                output_enabled, subsize,
-                c2 ? c2[i] : c1, c3 ? c3[i] : NULL, NULL);
+                output_enabled, subsize);
     }
   }
 
-  if (level > BLOCK_SIZE_SB8X8 &&
-      (level == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl)) {
-    set_partition_seg_context(cpi, mi_row, mi_col);
-    update_partition_context(xd, c1, level);
+  if (bsize > BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl)) {
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    update_partition_context(xd, c1, bsize);
   }
 }
 
-static void encode_sb_row(VP9_COMP *cpi,
-                          int mi_row,
-                          TOKENEXTRA **tp,
-                          int *totalrate) {
+
+// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
+// unlikely to be selected, based on previous rate-distortion optimization
+// results, to speed up encoding.
+static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
+                              int mi_row, int mi_col,
+                              BLOCK_SIZE_TYPE bsize,
+                              int *rate, int *dist) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int mi_col, pl;
-
-  // Initialize the left context for the new SB row
-  vpx_memset(&cm->left_context, 0, sizeof(cm->left_context));
-  vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context));
-
-  // Code each SB in the row
-  for (mi_col = cm->cur_tile_mi_col_start;
-       mi_col < cm->cur_tile_mi_col_end; mi_col += 8) {
-    int i, p;
-    BLOCK_SIZE_TYPE mb_partitioning[4][4];
-    BLOCK_SIZE_TYPE sb_partitioning[4];
-    BLOCK_SIZE_TYPE sb64_partitioning = BLOCK_SIZE_SB32X32;
-    int sb64_rate = 0, sb64_dist = 0;
-    int sb64_skip = 0;
-    ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
-    PARTITION_CONTEXT seg_l[64 / MI_SIZE], seg_a[64 / MI_SIZE];
-    TOKENEXTRA *tp_orig = *tp;
-
-    for (p = 0; p < MAX_MB_PLANE; p++) {
-      memcpy(a + 16 * p, cm->above_context[p] +
-                 (mi_col * 2 >> xd->plane[p].subsampling_x),
-             sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
-      memcpy(l + 16 * p, cm->left_context[p],
-             sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
-    }
-    vpx_memcpy(&seg_a, cm->above_seg_context + mi_col, sizeof(seg_a));
-    vpx_memcpy(&seg_l, cm->left_seg_context, sizeof(seg_l));
+  int bsl = b_width_log2(bsize), bs = 1 << bsl;
+  int msl = mi_height_log2(bsize), ms = 1 << msl;
+  ENTROPY_CONTEXT   l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+  PARTITION_CONTEXT sl[8], sa[8];
+  TOKENEXTRA *tp_orig = *tp;
+  int i, p, pl;
+  BLOCK_SIZE_TYPE subsize;
+  int srate = INT_MAX, sdist = INT_MAX;
+
+  assert(mi_height_log2(bsize) == mi_width_log2(bsize));
+
+  // buffer the above/left context information of the block in search.
+  for (p = 0; p < MAX_MB_PLANE; ++p) {
+    vpx_memcpy(a + bs * p, cm->above_context[p] +
+               (mi_col * 2 >> xd->plane[p].subsampling_x),
+               sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_x);
+    vpx_memcpy(l + bs * p, cm->left_context[p] +
+               ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+               sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_y);
+  }
+  vpx_memcpy(sa, cm->above_seg_context + mi_col,
+             sizeof(PARTITION_CONTEXT) * ms);
+  vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
+             sizeof(PARTITION_CONTEXT) * ms);
+
+  // PARTITION_SPLIT
+  if (bsize >= BLOCK_SIZE_MB16X16) {
+    int r4 = 0, d4 = 0;
+    subsize = get_subsize(bsize, PARTITION_SPLIT);
+    *(get_sb_partitioning(x, bsize)) = subsize;
+
+    for (i = 0; i < 4; ++i) {
+      int x_idx = (i & 1) * (ms >> 1);
+      int y_idx = (i >> 1) * (ms >> 1);
+      int r, d;
 
-    // FIXME(rbultje): this function should probably be rewritten to be
-    // recursive at some point in the future.
-    for (i = 0; i < 4; i++) {
-      const int x_idx = (i & 1) << 2;
-      const int y_idx = (i & 2) << 1;
-      int sb32_rate = 0, sb32_dist = 0;
-      int splitmodes_used = 0;
-      int sb32_skip = 0;
-      int j;
-      ENTROPY_CONTEXT l2[8 * MAX_MB_PLANE], a2[8 * MAX_MB_PLANE];
-      PARTITION_CONTEXT sl32[32 / MI_SIZE], sa32[32 / MI_SIZE];
-
-      sb_partitioning[i] = BLOCK_SIZE_MB16X16;
-      if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+      if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
         continue;
 
-      xd->sb_index = i;
-
-      /* Function should not modify L & A contexts; save and restore on exit */
-      for (p = 0; p < MAX_MB_PLANE; p++) {
-        vpx_memcpy(l2 + 8 * p,
-                   cm->left_context[p] +
-                       (y_idx * 2 >> xd->plane[p].subsampling_y),
-                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
-        vpx_memcpy(a2 + 8 * p,
-                   cm->above_context[p] +
-                       ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
-                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
-      }
-      vpx_memcpy(&sa32, cm->above_seg_context + mi_col + x_idx, sizeof(sa32));
-      vpx_memcpy(&sl32, cm->left_seg_context + y_idx, sizeof(sl32));
-
-      /* Encode MBs in raster order within the SB */
-      for (j = 0; j < 4; j++) {
-        const int x_idx_m = x_idx + ((j & 1) << 1);
-        const int y_idx_m = y_idx + ((j >> 1) << 1);
-        int r, d;
-        int r2, d2, mb16_rate = 0, mb16_dist = 0, k;
-        ENTROPY_CONTEXT l3[4 * MAX_MB_PLANE], a3[4 * MAX_MB_PLANE];
-        PARTITION_CONTEXT sl16[16 / MI_SIZE], sa16[16 / MI_SIZE];
-
-        mb_partitioning[i][j] = BLOCK_SIZE_SB8X8;
-
-        if (mi_row + y_idx_m >= cm->mi_rows ||
-            mi_col + x_idx_m >= cm->mi_cols) {
-          // MB lies outside frame, move on
-          continue;
-        }
-
-        // Index of the MB in the SB 0..3
-        xd->mb_index = j;
-
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(l3 + 4 * p,
-                     cm->left_context[p] +
-                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(a3 + 4 * p,
-                     cm->above_context[p] +
-                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
-        }
-        vpx_memcpy(&sa16, cm->above_seg_context + mi_col + x_idx_m,
-                   sizeof(sa16));
-        vpx_memcpy(&sl16, cm->left_seg_context + y_idx_m, sizeof(sl16));
-
-        for (k = 0; k < 4; k++) {
-          xd->b_index = k;
-
-          // try 8x8 coding
-          pick_sb_modes(cpi, mi_row + y_idx_m + (k >> 1),
-                        mi_col + x_idx_m + (k & 1),
-                        tp, &r, &d, BLOCK_SIZE_SB8X8,
-                        &x->sb8_context[xd->sb_index][xd->mb_index]
-                                       [xd->b_index]);
-          mb16_rate += r;
-          mb16_dist += d;
-          update_state(cpi, &x->sb8_context[xd->sb_index][xd->mb_index]
-                                           [xd->b_index],
-                       BLOCK_SIZE_SB8X8, 0);
-          encode_superblock(cpi, tp,
-                            0, mi_row + y_idx_m + (k >> 1),
-                            mi_col + x_idx_m + (k & 1),
-                            BLOCK_SIZE_SB8X8);
-        }
-        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
-        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
-        mb16_rate += x->partition_cost[pl][PARTITION_SPLIT];
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(cm->left_context[p] +
-                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
-                     l3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
-                     a3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
-        }
-        vpx_memcpy(cm->above_seg_context + mi_col + x_idx_m,
-                   sa16, sizeof(sa16));
-        vpx_memcpy(cm->left_seg_context + y_idx_m, sl16, sizeof(sl16));
-
-        // try 8x16 coding
-        r2 = 0;
-        d2 = 0;
-        xd->b_index = 0;
-        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
-                      tp, &r, &d, BLOCK_SIZE_SB8X16,
-                      &x->sb8x16_context[xd->sb_index][xd->mb_index]
-                                        [xd->b_index]);
-        r2 += r;
-        d2 += d;
-        update_state(cpi, &x->sb8x16_context[xd->sb_index][xd->mb_index]
-                                            [xd->b_index],
-                     BLOCK_SIZE_SB8X16, 0);
-        encode_superblock(cpi, tp,
-                          0, mi_row + y_idx_m, mi_col + x_idx_m,
-                          BLOCK_SIZE_SB8X16);
-        xd->b_index = 1;
-        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m + 1,
-                      tp, &r, &d, BLOCK_SIZE_SB8X16,
-                      &x->sb8x16_context[xd->sb_index][xd->mb_index]
-                                        [xd->b_index]);
-        r2 += r;
-        d2 += d;
-        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
-        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
-        r2 += x->partition_cost[pl][PARTITION_VERT];
-        if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
-                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
-          mb16_rate = r2;
-          mb16_dist = d2;
-          mb_partitioning[i][j] = BLOCK_SIZE_SB8X16;
-        }
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(cm->left_context[p] +
-                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
-                     l3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
-                     a3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
-        }
-
-        // try 16x8 coding
-        r2 = 0;
-        d2 = 0;
-        xd->b_index = 0;
-        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
-                      tp, &r, &d, BLOCK_SIZE_SB16X8,
-                      &x->sb16x8_context[xd->sb_index][xd->mb_index]
-                                        [xd->b_index]);
-        r2 += r;
-        d2 += d;
-        update_state(cpi, &x->sb16x8_context[xd->sb_index][xd->mb_index]
-                                            [xd->b_index],
-                     BLOCK_SIZE_SB16X8, 0);
-        encode_superblock(cpi, tp,
-                          0, mi_row + y_idx_m, mi_col + x_idx_m,
-                          BLOCK_SIZE_SB16X8);
-        xd->b_index = 1;
-        pick_sb_modes(cpi, mi_row + y_idx_m + 1, mi_col + x_idx_m,
-                      tp, &r, &d, BLOCK_SIZE_SB16X8,
-                      &x->sb16x8_context[xd->sb_index][xd->mb_index]
-                                        [xd->b_index]);
-        r2 += r;
-        d2 += d;
-        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
-        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
-        r2 += x->partition_cost[pl][PARTITION_HORZ];
-        if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
-                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
-          mb16_rate = r2;
-          mb16_dist = d2;
-          mb_partitioning[i][j] = BLOCK_SIZE_SB16X8;
-        }
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(cm->left_context[p] +
-                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
-                     l3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
-                     a3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
-        }
-
-        // try as 16x16
-        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
-                      tp, &r, &d, BLOCK_SIZE_MB16X16,
-                      &x->mb_context[xd->sb_index][xd->mb_index]);
-        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
-        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
-        r += x->partition_cost[pl][PARTITION_NONE];
-        if (RDCOST(x->rdmult, x->rddiv, r, d) <
-                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
-          mb16_rate = r;
-          mb16_dist = d;
-          mb_partitioning[i][j] = BLOCK_SIZE_MB16X16;
-        }
-        sb32_rate += mb16_rate;
-        sb32_dist += mb16_dist;
-
-        // Dummy encode, do not do the tokenization
-        encode_sb(cpi, tp, mi_row + y_idx_m, mi_col + x_idx_m, 0,
-                  BLOCK_SIZE_MB16X16, mb_partitioning[i][j], NULL, NULL);
-      }
-
-      /* Restore L & A coding context to those in place on entry */
-      for (p = 0; p < MAX_MB_PLANE; p++) {
-        vpx_memcpy(cm->left_context[p] +
-                       (y_idx * 2 >> xd->plane[p].subsampling_y),
-                   l2 + 8 * p,
-                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
-        vpx_memcpy(cm->above_context[p] +
-                       ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
-                   a2 + 8 * p,
-                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
-      }
-      // restore partition information context
-      vpx_memcpy(cm->above_seg_context + mi_col + x_idx, sa32, sizeof(sa32));
-      vpx_memcpy(cm->left_seg_context + y_idx, sl32, sizeof(sl32));
-
-      set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
-      pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
-      sb32_rate += x->partition_cost[pl][PARTITION_SPLIT];
-
-      if (cpi->sf.splitmode_breakout) {
-        sb32_skip = splitmodes_used;
-        sb64_skip += splitmodes_used;
-      }
-
-      // check 32x16
-      if (mi_col + x_idx + 4 <= cm->mi_cols) {
-        int r, d;
-
-        xd->mb_index = 0;
-        pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
-                      tp, &r, &d, BLOCK_SIZE_SB32X16,
-                      &x->sb32x16_context[xd->sb_index][xd->mb_index]);
-        if (mi_row + y_idx + 2 < cm->mi_rows) {
-          int r2, d2;
-
-          update_state(cpi, &x->sb32x16_context[xd->sb_index][xd->mb_index],
-                       BLOCK_SIZE_SB32X16, 0);
-          encode_superblock(cpi, tp,
-                            0, mi_row + y_idx, mi_col + x_idx,
-                            BLOCK_SIZE_SB32X16);
-          xd->mb_index = 1;
-          pick_sb_modes(cpi, mi_row + y_idx + 2,
-                        mi_col + x_idx, tp, &r2, &d2, BLOCK_SIZE_SB32X16,
-                        &x->sb32x16_context[xd->sb_index][xd->mb_index]);
-          r += r2;
-          d += d2;
-        }
-
-        set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
-        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
-        r += x->partition_cost[pl][PARTITION_HORZ];
-
-        /* is this better than MB coding? */
-        if (RDCOST(x->rdmult, x->rddiv, r, d) <
-                RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
-          sb32_rate = r;
-          sb32_dist = d;
-          sb_partitioning[i] = BLOCK_SIZE_SB32X16;
-        }
-
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(cm->left_context[p] +
-                         (y_idx * 2 >> xd->plane[p].subsampling_y),
-                     l2 + 8 * p,
-                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
-                     a2 + 8 * p,
-                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
-        }
-      }
-
-      // check 16x32
-      if (mi_row + y_idx + 4 <= cm->mi_rows) {
-        int r, d;
-
-        xd->mb_index = 0;
-        pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
-                      tp, &r, &d, BLOCK_SIZE_SB16X32,
-                      &x->sb16x32_context[xd->sb_index][xd->mb_index]);
-        if (mi_col + x_idx + 2 < cm->mi_cols) {
-          int r2, d2;
-
-          update_state(cpi, &x->sb16x32_context[xd->sb_index][xd->mb_index],
-                       BLOCK_SIZE_SB16X32, 0);
-          encode_superblock(cpi, tp,
-                            0, mi_row + y_idx, mi_col + x_idx,
-                            BLOCK_SIZE_SB16X32);
-          xd->mb_index = 1;
-          pick_sb_modes(cpi, mi_row + y_idx,
-                        mi_col + x_idx + 2,
-                        tp, &r2, &d2, BLOCK_SIZE_SB16X32,
-                        &x->sb16x32_context[xd->sb_index][xd->mb_index]);
-          r += r2;
-          d += d2;
-        }
-
-        set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
-        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
-        r += x->partition_cost[pl][PARTITION_VERT];
-
-        /* is this better than MB coding? */
-        if (RDCOST(x->rdmult, x->rddiv, r, d) <
-                RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
-          sb32_rate = r;
-          sb32_dist = d;
-          sb_partitioning[i] = BLOCK_SIZE_SB16X32;
-        }
-
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(cm->left_context[p] +
-                         (y_idx * 2 >> xd->plane[p].subsampling_y),
-                     l2 + 8 * p,
-                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
-                     a2 + 8 * p,
-                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
-        }
-      }
-
-      if (!sb32_skip &&
-          mi_col + x_idx + 4 <= cm->mi_cols &&
-          mi_row + y_idx + 4 <= cm->mi_rows) {
-        int r, d;
-
-        /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
-        pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
-                      tp, &r, &d, BLOCK_SIZE_SB32X32,
-                      &x->sb32_context[xd->sb_index]);
-
-        set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
-        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
-        r += x->partition_cost[pl][PARTITION_NONE];
-
-        if (RDCOST(x->rdmult, x->rddiv, r, d) <
-                RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
-          sb32_rate = r;
-          sb32_dist = d;
-          sb_partitioning[i] = BLOCK_SIZE_SB32X32;
-        }
-      }
-
-      // If we used 16x16 instead of 32x32 then skip 64x64 (if enabled).
-      if (cpi->sf.mb16_breakout && sb_partitioning[i] != BLOCK_SIZE_SB32X32) {
-        ++sb64_skip;
-      }
-
-      sb64_rate += sb32_rate;
-      sb64_dist += sb32_dist;
-
-      /* Encode SB using best computed mode(s) */
-      // FIXME(rbultje): there really shouldn't be any need to encode_mb/sb
-      // for each level that we go up, we can just keep tokens and recon
-      // pixels of the lower level; also, inverting SB/MB order (big->small
-      // instead of small->big) means we can use as threshold for small, which
-      // may enable breakouts if RD is not good enough (i.e. faster)
-      encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
-                BLOCK_SIZE_SB32X32, sb_partitioning[i], mb_partitioning[i],
-                NULL);
+      *(get_sb_index(xd, subsize)) = i;
+      rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+                        &r, &d);
+      r4 += r;
+      d4 += d;
     }
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    r4 += x->partition_cost[pl][PARTITION_SPLIT];
 
-    for (p = 0; p < MAX_MB_PLANE; p++) {
-      memcpy(cm->above_context[p] +
-                 (mi_col * 2 >> xd->plane[p].subsampling_x),
-             a + 16 * p,
-             sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
-      memcpy(cm->left_context[p], l + 16 * p,
-             sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
-    }
-    memcpy(cm->above_seg_context + mi_col, &seg_a, sizeof(seg_a));
-    memcpy(cm->left_seg_context, &seg_l, sizeof(seg_l));
-
-    set_partition_seg_context(cpi, mi_row, mi_col);
-    pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
-    sb64_rate += x->partition_cost[pl][PARTITION_SPLIT];
+    srate = r4;
+    sdist = d4;
+    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
 
-    // check 64x32
-    if (mi_col + 8 <= cm->mi_cols && !(cm->mb_rows & 1)) {
+  // PARTITION_HORZ: second sub-block (if it fits) is at mi_row + (ms >> 1)
+  if ((mi_col + ms <= cm->mi_cols) && (mi_row + (ms >> 1) <= cm->mi_rows) &&
+      (bsize >= BLOCK_SIZE_MB16X16)) {
+    int r2, d2;
+    int mb_skip = 0;
+    subsize = get_subsize(bsize, PARTITION_HORZ);
+    *(get_sb_index(xd, subsize)) = 0;
+    pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
+                  get_block_context(x, subsize));
+
+    if (mi_row + ms <= cm->mi_rows) {
       int r, d;
-
-      xd->sb_index = 0;
-      pick_sb_modes(cpi, mi_row, mi_col,
-                    tp, &r, &d, BLOCK_SIZE_SB64X32,
-                    &x->sb64x32_context[xd->sb_index]);
-      if (mi_row + 4 != cm->mi_rows) {
-        int r2, d2;
-
-        update_state(cpi, &x->sb64x32_context[xd->sb_index],
-                     BLOCK_SIZE_SB64X32, 0);
-        encode_superblock(cpi, tp,
-                          0, mi_row, mi_col, BLOCK_SIZE_SB64X32);
-        xd->sb_index = 1;
-        pick_sb_modes(cpi, mi_row + 4, mi_col,
-                      tp, &r2, &d2, BLOCK_SIZE_SB64X32,
-                      &x->sb64x32_context[xd->sb_index]);
-        r += r2;
-        d += d2;
-      }
-
-      set_partition_seg_context(cpi, mi_row, mi_col);
-      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
-      r += x->partition_cost[pl][PARTITION_HORZ];
-
-      /* is this better than MB coding? */
-      if (RDCOST(x->rdmult, x->rddiv, r, d) <
-              RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
-        sb64_rate = r;
-        sb64_dist = d;
-        sb64_partitioning = BLOCK_SIZE_SB64X32;
-      }
-
-      for (p = 0; p < MAX_MB_PLANE; p++) {
-        memcpy(cm->above_context[p] +
-                   (mi_col * 2 >> xd->plane[p].subsampling_x),
-               a + 16 * p,
-               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
-        memcpy(cm->left_context[p], l + 16 * p,
-               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
-      }
+      update_state(cpi, get_block_context(x, subsize), subsize, 0);
+      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+      *(get_sb_index(xd, subsize)) = 1;
+      pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      r2 += r;
+      d2 += d;
+    } else {
+      if (mi_row + (ms >> 1) != cm->mi_rows)
+        mb_skip = 1;
+    }
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    r2 += x->partition_cost[pl][PARTITION_HORZ];
+
+    if ((RDCOST(x->rdmult, x->rddiv, r2, d2) <
+         RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) {
+      srate = r2;
+      sdist = d2;
+      *(get_sb_partitioning(x, bsize)) = subsize;
     }
+    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
 
-    // check 32x64
-    if (mi_row + 8 <= cm->mi_rows && !(cm->mb_cols & 1)) {
+  // PARTITION_VERT: second sub-block (if it fits) is at mi_col + (ms >> 1)
+  if ((mi_row + ms <= cm->mi_rows) && (mi_col + (ms >> 1) <= cm->mi_cols) &&
+      (bsize >= BLOCK_SIZE_MB16X16)) {
+    int r2, d2;
+    int mb_skip = 0;
+    subsize = get_subsize(bsize, PARTITION_VERT);
+    *(get_sb_index(xd, subsize)) = 0;
+    pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
+                  get_block_context(x, subsize));
+    if (mi_col + ms <= cm->mi_cols) {
       int r, d;
+      update_state(cpi, get_block_context(x, subsize), subsize, 0);
+      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+      *(get_sb_index(xd, subsize)) = 1;
+      pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      r2 += r;
+      d2 += d;
+    } else {
+      if (mi_col + (ms >> 1) != cm->mi_cols)
+        mb_skip = 1;
+    }
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    r2 += x->partition_cost[pl][PARTITION_VERT];
+
+    if ((RDCOST(x->rdmult, x->rddiv, r2, d2) <
+         RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) {
+      srate = r2;
+      sdist = d2;
+      *(get_sb_partitioning(x, bsize)) = subsize;
+    }
+    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
 
-      xd->sb_index = 0;
-      pick_sb_modes(cpi, mi_row, mi_col,
-                    tp, &r, &d, BLOCK_SIZE_SB32X64,
-                    &x->sb32x64_context[xd->sb_index]);
-      if (mi_col + 4 != cm->mi_cols) {
-        int r2, d2;
-
-        update_state(cpi, &x->sb32x64_context[xd->sb_index],
-                     BLOCK_SIZE_SB32X64, 0);
-        encode_superblock(cpi, tp,
-                          0, mi_row, mi_col, BLOCK_SIZE_SB32X64);
-        xd->sb_index = 1;
-        pick_sb_modes(cpi, mi_row, mi_col + 4,
-                      tp, &r2, &d2, BLOCK_SIZE_SB32X64,
-                      &x->sb32x64_context[xd->sb_index]);
-        r += r2;
-        d += d2;
-      }
-
-      set_partition_seg_context(cpi, mi_row, mi_col);
-      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
-      r += x->partition_cost[pl][PARTITION_VERT];
-
-      /* is this better than MB coding? */
-      if (RDCOST(x->rdmult, x->rddiv, r, d) <
-              RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
-        sb64_rate = r;
-        sb64_dist = d;
-        sb64_partitioning = BLOCK_SIZE_SB32X64;
-      }
+  // PARTITION_NONE: only tried when mi_row + ms and mi_col + ms fit the frame
+  if (mi_row + ms <= cm->mi_rows && mi_col + ms <= cm->mi_cols) {
+    int r, d;
+    pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
+                  get_block_context(x, bsize));
+    if (bsize >= BLOCK_SIZE_MB16X16) {
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      r += x->partition_cost[pl][PARTITION_NONE];
+    }
 
-      for (p = 0; p < MAX_MB_PLANE; p++) {
-        memcpy(cm->above_context[p] +
-                   (mi_col * 2 >> xd->plane[p].subsampling_x),
-               a + 16 * p,
-               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
-        memcpy(cm->left_context[p], l + 16 * p,
-               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
-      }
+    if (RDCOST(x->rdmult, x->rddiv, r, d) <
+        RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+      srate = r;
+      sdist = d;
+      if (bsize >= BLOCK_SIZE_MB16X16)
+        *(get_sb_partitioning(x, bsize)) = bsize;
     }
+  }
 
-    if (!sb64_skip &&
-        mi_col + 8 <= cm->mi_cols &&
-        mi_row + 8 <= cm->mi_rows) {
-      int r, d;
+  assert(srate < INT_MAX && sdist < INT_MAX);
+  *rate = srate;
+  *dist = sdist;
 
-      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d,
-                    BLOCK_SIZE_SB64X64, &x->sb64_context);
+  encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
 
-      set_partition_seg_context(cpi, mi_row, mi_col);
-      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
-      r += x->partition_cost[pl][PARTITION_NONE];
+  if (bsize == BLOCK_SIZE_SB64X64)
+    assert(tp_orig < *tp);
+  else
+    assert(tp_orig == *tp);
+}
 
-      if (RDCOST(x->rdmult, x->rddiv, r, d) <
-              RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
-        sb64_rate = r;
-        sb64_dist = d;
-        sb64_partitioning = BLOCK_SIZE_SB64X64;
-      }
-    }
+static void encode_sb_row(VP9_COMP *cpi, int mi_row,
+                       TOKENEXTRA **tp, int *totalrate) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mi_col;
 
-    assert(tp_orig == *tp);
-    encode_sb(cpi, tp, mi_row, mi_col, 1, BLOCK_SIZE_SB64X64,
-              sb64_partitioning, sb_partitioning, mb_partitioning);
-    assert(tp_orig < *tp);
+  // Zero the left entropy and partition contexts for the new SB row
+  vpx_memset(&cm->left_context, 0, sizeof(cm->left_context));
+  vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context));
+
+  // Code each 64x64 SB of this row within the current tile's columns
+  for (mi_col = cm->cur_tile_mi_col_start;
+       mi_col < cm->cur_tile_mi_col_end; mi_col += 8) {
+    int dummy_rate, dummy_dist;
+    rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+                      &dummy_rate, &dummy_dist);
   }
 }
 
@@ -1423,7 +1110,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
 
   vp9_build_block_offsets(x);
 
-  vp9_setup_block_dptrs(&x->e_mbd);
+  vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
   xd->mode_info_context->mbmi.mode = DC_PRED;
   xd->mode_info_context->mbmi.uv_mode = DC_PRED;
@@ -1559,9 +1246,8 @@ static void encode_frame_internal(VP9_COMP *cpi) {
           vp9_get_tile_col_offsets(cm, tile_col);
           for (mi_row = cm->cur_tile_mi_row_start;
                mi_row < cm->cur_tile_mi_row_end;
-               mi_row += 8) {
+               mi_row += 8)
             encode_sb_row(cpi, mi_row, &tp, &totalrate);
-          }
           cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
           assert(tp - cpi->tok <=
                  get_token_alloc(cm->mb_rows, cm->mb_cols));
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 4665fccd0..e4002d689 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -139,6 +139,7 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
   const int ib = txfrm_block_to_raster_block(xd, bsize, plane,
                                              block, 2 * tx_size);
   const int16_t *dequant_ptr = xd->plane[plane].dequant;
+  const uint8_t * band_translate;
 
   assert((!type && !plane) || (type && plane));
   dqcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16);
@@ -149,23 +150,27 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
       const TX_TYPE tx_type = plane == 0 ? get_tx_type_4x4(xd, ib) : DCT_DCT;
       default_eob = 16;
       scan = get_scan_4x4(tx_type);
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
       const TX_TYPE tx_type = plane == 0 ? get_tx_type_8x8(xd, ib) : DCT_DCT;
       scan = get_scan_8x8(tx_type);
       default_eob = 64;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
       const TX_TYPE tx_type = plane == 0 ? get_tx_type_16x16(xd, ib) : DCT_DCT;
       scan = get_scan_16x16(tx_type);
       default_eob = 256;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
       scan = vp9_default_zig_zag1d_32x32;
       default_eob = 1024;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
   }
   assert(eob <= default_eob);
@@ -204,7 +209,7 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
       t0 = (vp9_dct_value_tokens_ptr + x)->token;
       /* Consider both possible successor states. */
       if (next < default_eob) {
-        band = get_coef_band(scan, tx_size, i + 1);
+        band = get_coef_band(band_translate, i + 1);
         pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
                                        pad, default_eob);
         rate0 +=
@@ -254,7 +259,7 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
         t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
       }
       if (next < default_eob) {
-        band = get_coef_band(scan, tx_size, i + 1);
+        band = get_coef_band(band_translate, i + 1);
         if (t0 != DCT_EOB_TOKEN) {
           pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
                                          pad, default_eob);
@@ -291,7 +296,7 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
      *  add a new trellis node, but we do need to update the costs.
      */
     else {
-      band = get_coef_band(scan, tx_size, i + 1);
+      band = get_coef_band(band_translate, i + 1);
       t0 = tokens[next][0].token;
       t1 = tokens[next][1].token;
       /* Update the cost of each path if we're past the EOB token. */
@@ -310,7 +315,7 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
   }
 
   /* Now pick the best path through the whole trellis. */
-  band = get_coef_band(scan, tx_size, i + 1);
+  band = get_coef_band(band_translate, i + 1);
   pt = combine_entropy_contexts(*a, *l);
   rate0 = tokens[next][0].rate;
   rate1 = tokens[next][1].rate;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 436c8d4e0..ddcf849ce 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -47,7 +47,7 @@
 #define KF_MB_INTRA_MIN 150
 #define GF_MB_INTRA_MIN 100
 
-#define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
 
 #define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
 #define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
@@ -78,8 +78,8 @@ static int select_cq_level(int qindex) {
 
 
 // Resets the first pass file to the given position using a relative seek from the current position
-static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *Position) {
-  cpi->twopass.stats_in = Position;
+static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *position) {
+  cpi->twopass.stats_in = position;
 }
 
 static int lookup_next_frame_stats(VP9_COMP *cpi, FIRSTPASS_STATS *next_frame) {
@@ -252,17 +252,11 @@ static void avg_stats(FIRSTPASS_STATS *section) {
 
 // Calculate a modified Error used in distributing bits between easier and harder frames
 static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  double av_err = (cpi->twopass.total_stats.ssim_weighted_pred_err /
-                   cpi->twopass.total_stats.count);
-  double this_err = this_frame->ssim_weighted_pred_err;
-  double modified_err;
-
-  if (this_err > av_err)
-    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1);
-  else
-    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2);
-
-  return modified_err;
+  const FIRSTPASS_STATS *const stats = &cpi->twopass.total_stats;
+  const double av_err = stats->ssim_weighted_pred_err / stats->count;
+  const double this_err = this_frame->ssim_weighted_pred_err;
+  return av_err * pow(this_err / DOUBLE_DIVIDE_CHECK(av_err),
+                      this_err > av_err ? POW1 : POW2);
 }
 
 static const double weight_table[256] = {
@@ -328,20 +322,14 @@ static double simple_weight(YV12_BUFFER_CONFIG *source) {
 static int frame_max_bits(VP9_COMP *cpi) {
   // Max allocation for a single frame based on the max section guidelines
   // passed in and how many bits are left.
-  int max_bits;
-
   // For VBR base this on the bits and frames left plus the
   // two_pass_vbrmax_section rate passed in by the user.
-  max_bits = (int) (((double) cpi->twopass.bits_left
-      / (cpi->twopass.total_stats.count - (double) cpi->common
-             .current_video_frame))
-                    * ((double) cpi->oxcf.two_pass_vbrmax_section / 100.0));
+  const double max_bits = (1.0 * cpi->twopass.bits_left /
+      (cpi->twopass.total_stats.count - cpi->common.current_video_frame)) *
+      (cpi->oxcf.two_pass_vbrmax_section / 100.0);
 
   // Trap case where we are out of bits.
-  if (max_bits < 0)
-    max_bits = 0;
-
-  return max_bits;
+  return MAX((int)max_bits, 0);
 }
 
 void vp9_init_first_pass(VP9_COMP *cpi) {
@@ -489,7 +477,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
 
   vp9_build_block_offsets(x);
 
-  vp9_setup_block_dptrs(&x->e_mbd);
+  vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
   vp9_frame_init_quantizer(cpi);
 
@@ -854,26 +842,18 @@ static double calc_correction_factor(double err_per_mb,
                                      double err_divisor,
                                      double pt_low,
                                      double pt_high,
-                                     int Q) {
-  double power_term;
-  double error_term = err_per_mb / err_divisor;
-  double correction_factor;
+                                     int q) {
+  const double error_term = err_per_mb / err_divisor;
 
   // Adjustment based on actual quantizer to power term.
-  power_term = (vp9_convert_qindex_to_q(Q) * 0.01) + pt_low;
-  power_term = (power_term > pt_high) ? pt_high : power_term;
+  const double power_term = MIN(vp9_convert_qindex_to_q(q) * 0.01 + pt_low,
+                                pt_high);
 
   // Calculate correction factor
   if (power_term < 1.0)
     assert(error_term >= 0.0);
-  correction_factor = pow(error_term, power_term);
 
-  // Clip range
-  correction_factor =
-    (correction_factor < 0.05)
-    ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor;
-
-  return correction_factor;
+  return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
 
 // Given a current maxQ value sets a range for future values.
@@ -882,10 +862,8 @@ static double calc_correction_factor(double err_per_mb,
 // (now uses the actual quantizer) but has not been tuned.
 static void adjust_maxq_qrange(VP9_COMP *cpi) {
   int i;
-  double q;
-
   // Set the max corresponding to cpi->avg_q * 2.0
-  q = cpi->avg_q * 2.0;
+  double q = cpi->avg_q * 2.0;
   cpi->twopass.maxq_max_limit = cpi->worst_quality;
   for (i = cpi->best_quality; i <= cpi->worst_quality; i++) {
     cpi->twopass.maxq_max_limit = i;
@@ -906,12 +884,11 @@ static void adjust_maxq_qrange(VP9_COMP *cpi) {
 static int estimate_max_q(VP9_COMP *cpi,
                           FIRSTPASS_STATS *fpstats,
                           int section_target_bandwitdh) {
-  int Q;
+  int q;
   int num_mbs = cpi->common.MBs;
   int target_norm_bits_per_mb;
 
-  double section_err = (fpstats->coded_error / fpstats->count);
-  double sr_err_diff;
+  double section_err = fpstats->coded_error / fpstats->count;
   double sr_correction;
   double err_per_mb = section_err / num_mbs;
   double err_correction_factor;
@@ -920,92 +897,74 @@ static int estimate_max_q(VP9_COMP *cpi,
   if (section_target_bandwitdh <= 0)
     return cpi->twopass.maxq_max_limit;          // Highest value allowed
 
-  target_norm_bits_per_mb =
-    (section_target_bandwitdh < (1 << 20))
-    ? (512 * section_target_bandwitdh) / num_mbs
-    : 512 * (section_target_bandwitdh / num_mbs);
+  target_norm_bits_per_mb = section_target_bandwitdh < (1 << 20)
+                              ? (512 * section_target_bandwitdh) / num_mbs
+                              : 512 * (section_target_bandwitdh / num_mbs);
 
   // Look at the drop in prediction quality between the last frame
   // and the GF buffer (which contained an older frame).
   if (fpstats->sr_coded_error > fpstats->coded_error) {
-    sr_err_diff =
-      (fpstats->sr_coded_error - fpstats->coded_error) /
-      (fpstats->count * cpi->common.MBs);
-    sr_correction = (sr_err_diff / 32.0);
-    sr_correction = pow(sr_correction, 0.25);
-    if (sr_correction < 0.75)
-      sr_correction = 0.75;
-    else if (sr_correction > 1.25)
-      sr_correction = 1.25;
+    double sr_err_diff = (fpstats->sr_coded_error - fpstats->coded_error) /
+                             (fpstats->count * cpi->common.MBs);
+    sr_correction = fclamp(pow(sr_err_diff / 32.0, 0.25), 0.75, 1.25);
   } else {
     sr_correction = 0.75;
   }
 
   // Calculate a corrective factor based on a rolling ratio of bits spent
   // vs target bits
-  if ((cpi->rolling_target_bits > 0) &&
-      (cpi->active_worst_quality < cpi->worst_quality)) {
-    double rolling_ratio;
-
-    rolling_ratio = (double)cpi->rolling_actual_bits /
-                    (double)cpi->rolling_target_bits;
+  if (cpi->rolling_target_bits > 0 &&
+      cpi->active_worst_quality < cpi->worst_quality) {
+    double rolling_ratio = (double)cpi->rolling_actual_bits /
+                               (double)cpi->rolling_target_bits;
 
     if (rolling_ratio < 0.95)
       cpi->twopass.est_max_qcorrection_factor -= 0.005;
     else if (rolling_ratio > 1.05)
       cpi->twopass.est_max_qcorrection_factor += 0.005;
 
-    cpi->twopass.est_max_qcorrection_factor =
-      (cpi->twopass.est_max_qcorrection_factor < 0.1)
-      ? 0.1
-      : (cpi->twopass.est_max_qcorrection_factor > 10.0)
-      ? 10.0 : cpi->twopass.est_max_qcorrection_factor;
+    cpi->twopass.est_max_qcorrection_factor = fclamp(
+        cpi->twopass.est_max_qcorrection_factor, 0.1, 10.0);
   }
 
   // Corrections for higher compression speed settings
   // (reduced compression expected)
-  if (cpi->compressor_speed == 1) {
-    if (cpi->oxcf.cpu_used <= 5)
-      speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
-    else
-      speed_correction = 1.25;
-  }
+  if (cpi->compressor_speed == 1)
+    speed_correction = cpi->oxcf.cpu_used <= 5 ?
+                          1.04 + (cpi->oxcf.cpu_used * 0.04) :
+                          1.25;
 
   // Try and pick a max Q that will be high enough to encode the
   // content at the given rate.
-  for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++) {
+  for (q = cpi->twopass.maxq_min_limit; q < cpi->twopass.maxq_max_limit; q++) {
     int bits_per_mb_at_this_q;
 
-    err_correction_factor =
-      calc_correction_factor(err_per_mb, ERR_DIVISOR, 0.4, 0.90, Q) *
-      sr_correction * speed_correction *
-      cpi->twopass.est_max_qcorrection_factor;
+    err_correction_factor = calc_correction_factor(err_per_mb,
+                                                   ERR_DIVISOR, 0.4, 0.90, q) *
+                                sr_correction * speed_correction *
+                                cpi->twopass.est_max_qcorrection_factor;
 
-
-    bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor);
+    bits_per_mb_at_this_q = vp9_bits_per_mb(INTER_FRAME, q,
+                                            err_correction_factor);
 
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
   }
 
   // Restriction on active max q for constrained quality mode.
-  if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-      (Q < cpi->cq_target_quality)) {
-    Q = cpi->cq_target_quality;
-  }
+  if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+      q < cpi->cq_target_quality)
+    q = cpi->cq_target_quality;
 
   // Adjust maxq_min_limit and maxq_max_limit limits based on
   // average q observed in clip for non kf/gf/arf frames
   // Give average a chance to settle though.
   // PGW TODO.. This code is broken for the extended Q range
-  if ((cpi->ni_frames >
-       ((int)cpi->twopass.total_stats.count >> 8)) &&
-      (cpi->ni_frames > 25)) {
+  if (cpi->ni_frames > ((int)cpi->twopass.total_stats.count >> 8) &&
+      cpi->ni_frames > 25)
     adjust_maxq_qrange(cpi);
-  }
 
-  return Q;
+  return q;
 }
 
 // For cq mode estimate a cq level that matches the observed
@@ -1013,7 +972,7 @@ static int estimate_max_q(VP9_COMP *cpi,
 static int estimate_cq(VP9_COMP *cpi,
                        FIRSTPASS_STATS *fpstats,
                        int section_target_bandwitdh) {
-  int Q;
+  int q;
   int num_mbs = cpi->common.MBs;
   int target_norm_bits_per_mb;
 
@@ -1064,29 +1023,29 @@ static int estimate_cq(VP9_COMP *cpi,
     clip_iifactor = 0.80;
 
   // Try and pick a Q that can encode the content at the given rate.
-  for (Q = 0; Q < MAXQ; Q++) {
+  for (q = 0; q < MAXQ; q++) {
     int bits_per_mb_at_this_q;
 
     // Error per MB based correction factor
     err_correction_factor =
-      calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, Q) *
+      calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, q) *
       sr_correction * speed_correction * clip_iifactor;
 
     bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor);
+      vp9_bits_per_mb(INTER_FRAME, q, err_correction_factor);
 
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
   }
 
   // Clip value to range "best allowed to (worst allowed - 1)"
-  Q = select_cq_level(Q);
-  if (Q >= cpi->worst_quality)
-    Q = cpi->worst_quality - 1;
-  if (Q < cpi->best_quality)
-    Q = cpi->best_quality;
+  q = select_cq_level(q);
+  if (q >= cpi->worst_quality)
+    q = cpi->worst_quality - 1;
+  if (q < cpi->best_quality)
+    q = cpi->best_quality;
 
-  return Q;
+  return q;
 }
 
 
@@ -1117,9 +1076,8 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
   // encoded in the second pass is a guess.  However the sum duration is not.
   // Its calculated based on the actual durations of all frames from the first
   // pass.
-  vp9_new_frame_rate(cpi,
-                     10000000.0 * cpi->twopass.total_stats.count /
-                     cpi->twopass.total_stats.duration);
+  vp9_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count /
+                       cpi->twopass.total_stats.duration);
 
   cpi->output_frame_rate = cpi->oxcf.frame_rate;
   cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration *
@@ -1191,9 +1149,8 @@ static double get_prediction_decay_rate(VP9_COMP *cpi,
 
   // Look at the observed drop in prediction quality between the last frame
   // and the GF buffer (which contains an older frame).
-  mb_sr_err_diff =
-    (next_frame->sr_coded_error - next_frame->coded_error) /
-    (cpi->common.MBs);
+  mb_sr_err_diff = (next_frame->sr_coded_error - next_frame->coded_error) /
+                   cpi->common.MBs;
   if (mb_sr_err_diff <= 512.0) {
     second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
     second_ref_decay = pow(second_ref_decay, 0.5);
@@ -1225,9 +1182,9 @@ static int detect_transition_to_still(
   // Break clause to detect very still sections after motion
   // For example a static image after a fade or other transition
   // instead of a clean scene cut.
-  if ((frame_interval > MIN_GF_INTERVAL) &&
-      (loop_decay_rate >= 0.999) &&
-      (last_decay_rate < 0.9)) {
+  if (frame_interval > MIN_GF_INTERVAL &&
+      loop_decay_rate >= 0.999 &&
+      last_decay_rate < 0.9) {
     int j;
     FIRSTPASS_STATS *position = cpi->twopass.stats_in;
     FIRSTPASS_STATS tmp_next_frame;
@@ -1271,10 +1228,9 @@ static int detect_flash(VP9_COMP *cpi, int offset) {
     // are reasonably well predicted by an earlier (pre flash) frame.
     // The recovery after a flash is indicated by a high pcnt_second_ref
     // comapred to pcnt_inter.
-    if ((next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
-        (next_frame.pcnt_second_ref >= 0.5)) {
+    if (next_frame.pcnt_second_ref > next_frame.pcnt_inter &&
+        next_frame.pcnt_second_ref >= 0.5)
       flash_detected = 1;
-    }
   }
 
   return flash_detected;
@@ -1356,13 +1312,9 @@ static double calc_frame_boost(
   return frame_boost;
 }
 
-static int calc_arf_boost(
-  VP9_COMP *cpi,
-  int offset,
-  int f_frames,
-  int b_frames,
-  int *f_boost,
-  int *b_boost) {
+static int calc_arf_boost(VP9_COMP *cpi, int offset,
+                          int f_frames, int b_frames,
+                          int *f_boost, int *b_boost) {
   FIRSTPASS_STATS this_frame;
 
   int i;
@@ -1392,8 +1344,7 @@ static int calc_arf_boost(
 
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
-      decay_accumulator =
-        decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                           ? MIN_DECAY_FACTOR : decay_accumulator;
     }
@@ -1429,10 +1380,9 @@ static int calc_arf_boost(
 
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
-      decay_accumulator =
-        decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
-                          ? MIN_DECAY_FACTOR : decay_accumulator;
+                              ? MIN_DECAY_FACTOR : decay_accumulator;
     }
 
     boost_score += (decay_accumulator *
@@ -1871,26 +1821,20 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   for (i = 0;
       i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME);
       ++i) {
-    int boost;
     int allocation_chunks;
-    int Q =
-        (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+    int q = cpi->oxcf.fixed_q < 0 ? cpi->last_q[INTER_FRAME]
+                                  : cpi->oxcf.fixed_q;
     int gf_bits;
 
-    boost = (cpi->gfu_boost * vp9_gfboost_qadjust(Q)) / 100;
+    int boost = (cpi->gfu_boost * vp9_gfboost_qadjust(q)) / 100;
 
     // Set max and minimum boost and hence minimum allocation
-    if (boost > ((cpi->baseline_gf_interval + 1) * 200))
-      boost = ((cpi->baseline_gf_interval + 1) * 200);
-    else if (boost < 125)
-      boost = 125;
+    boost = clamp(boost, 125, (cpi->baseline_gf_interval + 1) * 200);
 
     if (cpi->source_alt_ref_pending && i == 0)
-      allocation_chunks =
-        ((cpi->baseline_gf_interval + 1) * 100) + boost;
+      allocation_chunks = ((cpi->baseline_gf_interval + 1) * 100) + boost;
     else
-      allocation_chunks =
-        (cpi->baseline_gf_interval * 100) + (boost - 100);
+      allocation_chunks = (cpi->baseline_gf_interval * 100) + (boost - 100);
 
     // Prevent overflow
     if (boost > 1023) {
@@ -1901,41 +1845,34 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
     // Calculate the number of bits to be spent on the gf or arf based on
     // the boost number
-    gf_bits = (int)((double)boost *
-                    (cpi->twopass.gf_group_bits /
-                     (double)allocation_chunks));
+    gf_bits = (int)((double)boost * (cpi->twopass.gf_group_bits /
+                                       (double)allocation_chunks));
 
     // If the frame that is to be boosted is simpler than the average for
     // the gf/arf group then use an alternative calculation
     // based on the error score of the frame itself
     if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval) {
-      double  alt_gf_grp_bits;
-      int     alt_gf_bits;
-
-      alt_gf_grp_bits =
+      double alt_gf_grp_bits =
         (double)cpi->twopass.kf_group_bits  *
         (mod_frame_err * (double)cpi->baseline_gf_interval) /
         DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left);
 
-      alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
+      int alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
                                            (double)allocation_chunks));
 
-      if (gf_bits > alt_gf_bits) {
+      if (gf_bits > alt_gf_bits)
         gf_bits = alt_gf_bits;
-      }
     }
     // Else if it is harder than other frames in the group make sure it at
     // least receives an allocation in keeping with its relative error
     // score, otherwise it may be worse off than an "un-boosted" frame
     else {
-      int alt_gf_bits =
-        (int)((double)cpi->twopass.kf_group_bits *
-              mod_frame_err /
-              DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
+      int alt_gf_bits = (int)((double)cpi->twopass.kf_group_bits *
+                        mod_frame_err /
+                        DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
 
-      if (alt_gf_bits > gf_bits) {
+      if (alt_gf_bits > gf_bits)
         gf_bits = alt_gf_bits;
-      }
     }
 
     // Dont allow a negative value for gf_bits
@@ -1983,14 +1920,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
     // calculation of alt_extra_bits.
     if (cpi->baseline_gf_interval >= 3) {
-      int boost = (cpi->source_alt_ref_pending)
-                  ? b_boost : cpi->gfu_boost;
+      const int boost = cpi->source_alt_ref_pending ? b_boost : cpi->gfu_boost;
 
       if (boost >= 150) {
-        int pct_extra;
         int alt_extra_bits;
-
-        pct_extra = (boost - 100) / 50;
+        int pct_extra = (boost - 100) / 50;
         pct_extra = (pct_extra > 20) ? 20 : pct_extra;
 
         alt_extra_bits = (int)((cpi->twopass.gf_group_bits * pct_extra) / 100);
@@ -2071,33 +2005,21 @@ static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 // Make a damped adjustment to the active max q.
 static int adjust_active_maxq(int old_maxqi, int new_maxqi) {
   int i;
-  int ret_val = new_maxqi;
-  double old_q;
-  double new_q;
-  double target_q;
-
-  old_q = vp9_convert_qindex_to_q(old_maxqi);
-  new_q = vp9_convert_qindex_to_q(new_maxqi);
-
-  target_q = ((old_q * 7.0) + new_q) / 8.0;
+  const double old_q = vp9_convert_qindex_to_q(old_maxqi);
+  const double new_q = vp9_convert_qindex_to_q(new_maxqi);
+  const double target_q = ((old_q * 7.0) + new_q) / 8.0;
 
   if (target_q > old_q) {
-    for (i = old_maxqi; i <= new_maxqi; i++) {
-      if (vp9_convert_qindex_to_q(i) >= target_q) {
-        ret_val = i;
-        break;
-      }
-    }
+    for (i = old_maxqi; i <= new_maxqi; i++)
+      if (vp9_convert_qindex_to_q(i) >= target_q)
+        return i;
   } else {
-    for (i = old_maxqi; i >= new_maxqi; i--) {
-      if (vp9_convert_qindex_to_q(i) <= target_q) {
-        ret_val = i;
-        break;
-      }
-    }
+    for (i = old_maxqi; i >= new_maxqi; i--)
+      if (vp9_convert_qindex_to_q(i) <= target_q)
+        return i;
   }
 
-  return ret_val;
+  return new_maxqi;
 }
 
 void vp9_second_pass(VP9_COMP *cpi) {
@@ -2111,9 +2033,8 @@ void vp9_second_pass(VP9_COMP *cpi) {
   double this_frame_intra_error;
   double this_frame_coded_error;
 
-  if (!cpi->twopass.stats_in) {
+  if (!cpi->twopass.stats_in)
     return;
-  }
 
   vp9_clear_system_state();
 
@@ -2123,12 +2044,8 @@ void vp9_second_pass(VP9_COMP *cpi) {
 
     // Set a cq_level in constrained quality mode.
     if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-      int est_cq;
-
-      est_cq =
-        estimate_cq(cpi,
-                    &cpi->twopass.total_left_stats,
-                    (int)(cpi->twopass.bits_left / frames_left));
+      int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats,
+                               (int)(cpi->twopass.bits_left / frames_left));
 
       cpi->cq_target_quality = cpi->oxcf.cq_level;
       if (est_cq > cpi->cq_target_quality)
@@ -2139,14 +2056,12 @@ void vp9_second_pass(VP9_COMP *cpi) {
     cpi->twopass.maxq_max_limit = cpi->worst_quality;
     cpi->twopass.maxq_min_limit = cpi->best_quality;
 
-    tmp_q = estimate_max_q(
-              cpi,
-              &cpi->twopass.total_left_stats,
-              (int)(cpi->twopass.bits_left / frames_left));
+    tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
+                           (int)(cpi->twopass.bits_left / frames_left));
 
-    cpi->active_worst_quality         = tmp_q;
-    cpi->ni_av_qi                     = tmp_q;
-    cpi->avg_q                        = vp9_convert_qindex_to_q(tmp_q);
+    cpi->active_worst_quality = tmp_q;
+    cpi->ni_av_qi = tmp_q;
+    cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
 
 #ifndef ONE_SHOT_Q_ESTIMATE
     // Limit the maxq value returned subsequently.
@@ -2404,9 +2319,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     if (cpi->oxcf.auto_key
         && lookup_next_frame_stats(cpi, &next_frame) != EOF) {
       // Normal scene cut check
-      if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) {
+      if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
         break;
-      }
+
 
       // How fast is prediction quality decaying
       loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
@@ -2416,19 +2331,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       // quality since the last GF or KF.
       recent_loop_decay[i % 8] = loop_decay_rate;
       decay_accumulator = 1.0;
-      for (j = 0; j < 8; j++) {
-        decay_accumulator = decay_accumulator * recent_loop_decay[j];
-      }
+      for (j = 0; j < 8; j++)
+        decay_accumulator *= recent_loop_decay[j];
 
       // Special check for transition or high motion followed by a
       // to a static scene.
-      if (detect_transition_to_still(cpi, i,
-                                     (cpi->key_frame_frequency - i),
-                                     loop_decay_rate,
-                                     decay_accumulator)) {
+      if (detect_transition_to_still(cpi, i, cpi->key_frame_frequency - i,
+                                     loop_decay_rate, decay_accumulator))
         break;
-      }
-
 
       // Step on to the next frame
       cpi->twopass.frames_to_key++;
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index a89d2547e..708fe4549 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -46,7 +46,7 @@ void vp9_lookahead_destroy(struct lookahead_ctx *ctx) {
       unsigned int i;
 
       for (i = 0; i < ctx->max_sz; i++)
-        vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
+        vp9_free_frame_buffer(&ctx->buf[i].img);
       free(ctx->buf);
     }
     free(ctx);
@@ -56,6 +56,8 @@ void vp9_lookahead_destroy(struct lookahead_ctx *ctx) {
 
 struct lookahead_ctx * vp9_lookahead_init(unsigned int width,
                                           unsigned int height,
+                                          unsigned int subsampling_x,
+                                          unsigned int subsampling_y,
                                           unsigned int depth) {
   struct lookahead_ctx *ctx = NULL;
 
@@ -71,8 +73,9 @@ struct lookahead_ctx * vp9_lookahead_init(unsigned int width,
     if (!ctx->buf)
       goto bail;
     for (i = 0; i < depth; i++)
-      if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img,
-                                      width, height, VP9BORDERINPIXELS))
+      if (vp9_alloc_frame_buffer(&ctx->buf[i].img,
+                                 width, height, subsampling_x, subsampling_y,
+                                 VP9BORDERINPIXELS))
         goto bail;
   }
   return ctx;
diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h
index 2406618b9..81baa2c6f 100644
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h
@@ -31,6 +31,8 @@ struct lookahead_ctx;
  */
 struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
                                          unsigned int height,
+                                         unsigned int subsampling_x,
+                                         unsigned int subsampling_y,
                                          unsigned int depth);
 
 
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 05105d794..3d8003c33 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -313,9 +313,9 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
 
   vp9_free_frame_buffers(&cpi->common);
 
-  vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
-  vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
-  vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);
+  vp9_free_frame_buffer(&cpi->last_frame_uf);
+  vp9_free_frame_buffer(&cpi->scaled_source);
+  vp9_free_frame_buffer(&cpi->alt_ref_buffer);
   vp9_lookahead_destroy(cpi->lookahead);
 
   vpx_free(cpi->tok);
@@ -835,15 +835,19 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
 }
 
 static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
   cpi->lookahead = vp9_lookahead_init(cpi->oxcf.width, cpi->oxcf.height,
+                                      cm->subsampling_x, cm->subsampling_y,
                                       cpi->oxcf.lag_in_frames);
   if (!cpi->lookahead)
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate lag buffers");
 
-  if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer,
-                                  cpi->oxcf.width, cpi->oxcf.height,
-                                  VP9BORDERINPIXELS))
+  if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
+                               cpi->oxcf.width, cpi->oxcf.height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate altref buffer");
 }
@@ -873,13 +877,17 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate partition data");
 
-  if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf,
-                                  cm->width, cm->height, VP9BORDERINPIXELS))
+  if (vp9_alloc_frame_buffer(&cpi->last_frame_uf,
+                             cm->width, cm->height,
+                             cm->subsampling_x, cm->subsampling_y,
+                             VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
 
-  if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source,
-                                  cm->width, cm->height, VP9BORDERINPIXELS))
+  if (vp9_alloc_frame_buffer(&cpi->scaled_source,
+                             cm->width, cm->height,
+                             cm->subsampling_x, cm->subsampling_y,
+                             VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
@@ -914,13 +922,17 @@ static void update_frame_size(VP9_COMP *cpi) {
   vp9_update_frame_size(cm);
 
   // Update size of buffers local to this frame
-  if (vp8_yv12_realloc_frame_buffer(&cpi->last_frame_uf,
-                                    cm->width, cm->height, VP9BORDERINPIXELS))
+  if (vp9_realloc_frame_buffer(&cpi->last_frame_uf,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to reallocate last frame buffer");
 
-  if (vp8_yv12_realloc_frame_buffer(&cpi->scaled_source,
-                                    cm->width, cm->height, VP9BORDERINPIXELS))
+  if (vp9_realloc_frame_buffer(&cpi->scaled_source,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to reallocate scaled source buffer");
 
@@ -1032,6 +1044,9 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
 
   cm->width = oxcf->width;
   cm->height = oxcf->height;
+  cm->subsampling_x = 0;
+  cm->subsampling_y = 0;
+  vp9_alloc_compressor_data(cpi);
 
   // change includes all joint functionality
   vp9_change_config(ptr, oxcf);
@@ -1196,17 +1211,13 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
 
   cm->sharpness_level = cpi->oxcf.Sharpness;
 
-  // Increasing the size of the frame beyond the first seen frame, or some
-  // otherwise signalled maximum size, is not supported.
-  // TODO(jkoleszar): exit gracefully.
-  if (!cpi->initial_width) {
-    alloc_raw_frame_buffers(cpi);
-    vp9_alloc_compressor_data(cpi);
-    cpi->initial_width = cm->width;
-    cpi->initial_height = cm->height;
+  if (cpi->initial_width) {
+    // Increasing the size of the frame beyond the first seen frame, or some
+    // otherwise signalled maximum size, is not supported.
+    // TODO(jkoleszar): exit gracefully.
+    assert(cm->width <= cpi->initial_width);
+    assert(cm->height <= cpi->initial_height);
   }
-  assert(cm->width <= cpi->initial_width);
-  assert(cm->height <= cpi->initial_height);
   update_frame_size(cpi);
 
   if (cpi->oxcf.fixed_q >= 0) {
@@ -1938,8 +1949,8 @@ static void generate_psnr_packet(VP9_COMP *cpi) {
   pkt.data.psnr.samples[0] = width * height;
   pkt.data.psnr.samples[1] = width * height;
 
-  width = (width + 1) / 2;
-  height = (height + 1) / 2;
+  width = orig->uv_width;
+  height = orig->uv_height;
 
   sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
                          recon->u_buffer, recon->uv_stride,
@@ -2093,7 +2104,7 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
   } while (--h);
 
   src = s->u_buffer;
-  h = (cm->height + 1) / 2;
+  h = s->uv_height;
 
   do {
     fwrite(src, s->uv_width, 1,  yuv_rec_file);
@@ -2101,7 +2112,7 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
   } while (--h);
 
   src = s->v_buffer;
-  h = (cm->height + 1) / 2;
+  h = s->uv_height;
 
   do {
     fwrite(src, s->uv_width, 1, yuv_rec_file);
@@ -2117,49 +2128,31 @@ static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb,
   const int in_h = src_fb->y_crop_height;
   const int out_w = dst_fb->y_crop_width;
   const int out_h = dst_fb->y_crop_height;
-  int x, y;
+  int x, y, i;
+
+  uint8_t *srcs[3] = {src_fb->y_buffer, src_fb->u_buffer, src_fb->v_buffer};
+  int src_strides[3] = {src_fb->y_stride, src_fb->uv_stride, src_fb->uv_stride};
+
+  uint8_t *dsts[3] = {dst_fb->y_buffer, dst_fb->u_buffer, dst_fb->v_buffer};
+  int dst_strides[3] = {dst_fb->y_stride, dst_fb->uv_stride, dst_fb->uv_stride};
 
   for (y = 0; y < out_h; y += 16) {
     for (x = 0; x < out_w; x += 16) {
-      int x_q4 = x * 16 * in_w / out_w;
-      int y_q4 = y * 16 * in_h / out_h;
-      uint8_t *src = src_fb->y_buffer + y * in_h / out_h * src_fb->y_stride +
-                     x * in_w / out_w;
-      uint8_t *dst = dst_fb->y_buffer + y * dst_fb->y_stride + x;
-      int src_stride = src_fb->y_stride;
-      int dst_stride = dst_fb->y_stride;
-
-      vp9_convolve8(src, src_stride, dst, dst_stride,
-                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
-                    16, 16);
-
-      x_q4 >>= 1;
-      y_q4 >>= 1;
-      src_stride = src_fb->uv_stride;
-      dst_stride = dst_fb->uv_stride;
-
-      src = src_fb->u_buffer +
-          y / 2 * in_h / out_h * src_fb->uv_stride +
-          x / 2 * in_w / out_w;
-      dst = dst_fb->u_buffer +
-          y / 2 * dst_fb->uv_stride +
-          x / 2;
-      vp9_convolve8(src, src_stride, dst, dst_stride,
-                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
-                    8, 8);
-
-      src = src_fb->v_buffer +
-          y / 2 * in_h / out_h * src_fb->uv_stride +
-          x / 2 * in_w / out_w;
-      dst = dst_fb->v_buffer +
-          y / 2 * dst_fb->uv_stride +
-          x / 2;
-      vp9_convolve8(src, src_stride, dst, dst_stride,
-                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
-                    8, 8);
+      for (i = 0; i < MAX_MB_PLANE; ++i) {
+        const int factor = i == 0 ? 1 : 2;
+        const int x_q4 = x * (16 / factor) * in_w / out_w;
+        const int y_q4 = y * (16 / factor) * in_h / out_h;
+        const int src_stride = src_strides[i];
+        const int dst_stride = dst_strides[i];
+        uint8_t *src = srcs[i] + y / factor * in_h / out_h * src_stride +
+                                 x / factor * in_w / out_w;
+        // x/y are luma coordinates: scale the dst offset by factor like src.
+        uint8_t *dst = dsts[i] + y / factor * dst_stride + x / factor;
+        vp9_convolve8(src, src_stride, dst, dst_stride,
+                      vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
+                      vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
+                      16 / factor, 16 / factor);
+      }
     }
   }
 
@@ -2500,9 +2493,10 @@ static void scale_references(VP9_COMP *cpi) {
         ref->y_crop_height != cm->height) {
       int new_fb = get_free_fb(cm);
 
-      vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[new_fb],
-                                    cm->width, cm->height,
-                                    VP9BORDERINPIXELS);
+      vp9_realloc_frame_buffer(&cm->yv12_fb[new_fb],
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9BORDERINPIXELS);
       scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]);
       cpi->scaled_ref_idx[i] = new_fb;
     } else {
@@ -3587,6 +3581,15 @@ int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags,
   struct vpx_usec_timer  timer;
   int                    res = 0;
 
+  if (!cpi->initial_width) {
+    // TODO(jkoleszar): Support 1/4 subsampling?
+    cm->subsampling_x = sd->uv_width < sd->y_width;
+    cm->subsampling_y = sd->uv_height < sd->y_height;
+    alloc_raw_frame_buffers(cpi);
+
+    cpi->initial_width = cm->width;
+    cpi->initial_height = cm->height;
+  }
   vpx_usec_timer_start(&timer);
   if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
                          cpi->active_map_enabled ? cpi->active_map : NULL))
@@ -3851,9 +3854,10 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
   cm->frame_flags = *frame_flags;
 
   // Reset the frame pointers to the current frame size
-  vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx],
-                                cm->width, cm->height,
-                                VP9BORDERINPIXELS);
+  vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx],
+                           cm->width, cm->height,
+                           cm->subsampling_x, cm->subsampling_y,
+                           VP9BORDERINPIXELS);
 
   // Calculate scaling factors for each of the 3 available references
   for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index ef8cb2bab..5bff383b8 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -272,6 +272,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
                         [ENTROPY_NODES];
   int seg_eob, default_eob;
   uint8_t token_cache[1024];
+  const uint8_t * band_translate;
 
   // Check for consistency of tx_size with mode info
   assert((!type && !plane) || (type && plane));
@@ -291,6 +292,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
       coef_probs = cm->fc.coef_probs_4x4;
       seg_eob = 16;
       scan = get_scan_4x4(tx_type);
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
@@ -304,6 +306,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
       scan = get_scan_8x8(tx_type);
       coef_probs = cm->fc.coef_probs_8x8;
       seg_eob = 64;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
@@ -317,6 +320,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
       seg_eob = 256;
       above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
       left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
@@ -325,6 +329,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
       seg_eob = 1024;
       above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
       left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     default:
       abort();
@@ -347,7 +352,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
     for (c = 0; c < eob; c++) {
       int v = qcoeff_ptr[scan[c]];
       int t = vp9_dct_value_tokens_ptr[v].token;
-      int band = get_coef_band(scan, tx_size, c);
+      int band = get_coef_band(band_translate, c);
       if (c)
         pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
 
@@ -361,7 +366,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
       if (c)
         pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
       cost += mb->token_costs[tx_size][type][ref]
-          [get_coef_band(scan, tx_size, c)]
+          [get_coef_band(band_translate, c)]
           [pt][DCT_EOB_TOKEN];
     }
   }
@@ -1069,9 +1074,7 @@ typedef struct {
   B_PREDICTION_MODE modes[4];
   int_mv mvs[4], second_mvs[4];
   int eobs[4];
-
   int mvthresh;
-  int *mdcounts;
 } BEST_SEG_INFO;
 
 static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
@@ -1322,7 +1325,6 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
                                        int_mv *best_ref_mv,
                                        int_mv *second_best_ref_mv,
                                        int64_t best_rd,
-                                       int *mdcounts,
                                        int *returntotrate,
                                        int *returnyrate,
                                        int *returndistortion,
@@ -1339,7 +1341,6 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
   bsi.second_ref_mv = second_best_ref_mv;
   bsi.mvp.as_int = best_ref_mv->as_int;
   bsi.mvthresh = mvthresh;
-  bsi.mdcounts = mdcounts;
 
   for (i = 0; i < 4; i++)
     bsi.modes[i] = ZERO4X4;
@@ -1612,7 +1613,6 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                                int mi_row, int mi_col,
                                int_mv frame_nearest_mv[MAX_REF_FRAMES],
                                int_mv frame_near_mv[MAX_REF_FRAMES],
-                               int frame_mdcounts[4][4],
                                struct buf_2d yv12_mb[4][MAX_MB_PLANE],
                                struct scale_factors scale[MAX_REF_FRAMES]) {
   VP9_COMMON *cm = &cpi->common;
@@ -1797,7 +1797,7 @@ static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) {
 
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE_TYPE bsize,
-                                 int mdcounts[4], int64_t txfm_cache[],
+                                 int64_t txfm_cache[],
                                  int *rate2, int *distortion, int *skippable,
                                  int *compmode_cost,
                                  int *rate_y, int *distortion_y,
@@ -2305,7 +2305,9 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   mode = xd->mode_info_context->mbmi.mode;
   txfm_size = xd->mode_info_context->mbmi.txfm_size;
   rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
-                          &dist_uv, &uv_skip, bsize);
+                          &dist_uv, &uv_skip,
+                          (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 :
+                                                       bsize);
   if (bsize == BLOCK_SIZE_SB8X8)
     err4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4_y,
                                        &rate4x4_y_tokenonly,
@@ -2357,7 +2359,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
   int comp_pred, i;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  int frame_mdcounts[4][4];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   int_mv single_newmv[MAX_REF_FRAMES];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
@@ -2366,7 +2367,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                      cpi->lst_fb_idx,
                      cpi->gld_fb_idx,
                      cpi->alt_fb_idx};
-  int mdcounts[4];
   int64_t best_rd = INT64_MAX;
   int64_t best_txfm_rd[NB_TXFM_MODES];
   int64_t best_txfm_diff[NB_TXFM_MODES];
@@ -2449,7 +2449,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
                          mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV],
-                         frame_mdcounts, yv12_mb, scale_factor);
+                         yv12_mb, scale_factor);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
@@ -2576,8 +2576,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         xd->plane[i].pre[1] = yv12_mb[second_ref][i];
     }
 
-    vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
-
     // If the segment reference frame feature is enabled....
     // then do nothing if the current ref frame is not allowed..
     if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
@@ -2675,7 +2673,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
         tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
                                              &mbmi->ref_mvs[mbmi->ref_frame][0],
-                                             second_ref, INT64_MAX, mdcounts,
+                                             second_ref, INT64_MAX,
                                              &rate, &rate_y, &distortion,
                                              &skippable,
                                              (int)this_rd_thresh, seg_mvs);
@@ -2714,7 +2712,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         // switchable list (bilinear, 6-tap) is indicated at the frame level
         tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
                                              &mbmi->ref_mvs[mbmi->ref_frame][0],
-                                             second_ref, INT64_MAX, mdcounts,
+                                             second_ref, INT64_MAX,
                                              &rate, &rate_y, &distortion,
                                              &skippable,
                                              (int)this_rd_thresh, seg_mvs);
@@ -2745,10 +2743,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       // If even the 'Y' rd value of split is higher than best so far
       // then dont bother looking at UV
       vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
-                                      bsize);
-      vp9_subtract_sbuv(x, bsize);
+                                      BLOCK_SIZE_SB8X8);
+      vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
       super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
-                                &uv_skippable, bsize, TX_4X4);
+                                &uv_skippable, BLOCK_SIZE_SB8X8, TX_4X4);
       rate2 += rate_uv;
       distortion2 += distortion_uv;
       skippable = skippable && uv_skippable;
@@ -2792,7 +2790,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       }
 
       this_rd = handle_inter_mode(cpi, x, bsize,
-                                  mdcounts, txfm_cache,
+                                  txfm_cache,
                                   &rate2, &distortion2, &skippable,
                                   &compmode_cost,
                                   &rate_y, &distortion_y,
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index cb670dab0..4420d49e3 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -136,6 +136,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
   ENTROPY_CONTEXT above_ec, left_ec;
   uint8_t token_cache[1024];
   TX_TYPE tx_type = DCT_DCT;
+  const uint8_t * band_translate;
   assert((!type && !plane) || (type && plane));
 
   switch (tx_size) {
@@ -149,6 +150,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
       scan = get_scan_4x4(tx_type);
       counts = cpi->coef_counts_4x4;
       coef_probs = cpi->common.fc.coef_probs_4x4;
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
@@ -162,6 +164,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
       scan = get_scan_8x8(tx_type);
       counts = cpi->coef_counts_8x8;
       coef_probs = cpi->common.fc.coef_probs_8x8;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
@@ -175,6 +178,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
       scan = get_scan_16x16(tx_type);
       counts = cpi->coef_counts_16x16;
       coef_probs = cpi->common.fc.coef_probs_16x16;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
@@ -184,6 +188,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
       scan = vp9_default_zig_zag1d_32x32;
       counts = cpi->coef_counts_32x32;
       coef_probs = cpi->common.fc.coef_probs_32x32;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
   }
 
@@ -196,7 +201,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
 
   c = 0;
   do {
-    const int band = get_coef_band(scan, tx_size, c);
+    const int band = get_coef_band(band_translate, c);
     int token;
     int v = 0;
     rc = scan[c];