29 files changed, 2005 insertions, 759 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index aebf4a1ae..426699e31 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -83,7 +83,9 @@ typedef enum {
   D27_PRED,           /* Directional 22 deg prediction  [anti-clockwise from 0 deg hor] */
   D63_PRED,           /* Directional 67 deg prediction  [anti-clockwise from 0 deg hor] */
   TM_PRED,            /* Truemotion prediction */
+#if !CONFIG_SB8X8
   I8X8_PRED,          /* 8x8 based prediction, each 8x8 has its own mode */
+#endif
   I4X4_PRED,          /* 4x4 based prediction, each 4x4 has its own mode */
   NEARESTMV,
   NEARMV,
@@ -126,7 +128,9 @@ typedef enum {
 
 #define VP9_YMODES  (I4X4_PRED + 1)
 #define VP9_UV_MODES (TM_PRED + 1)
+#if !CONFIG_SB8X8
 #define VP9_I8X8_MODES (TM_PRED + 1)
+#endif
 #define VP9_I32X32_MODES (TM_PRED + 1)
 
 #define VP9_MVREFS (1 + SPLITMV - NEARESTMV)
@@ -169,6 +173,7 @@ typedef enum {
 #define VP9_NKF_BINTRAMODES (VP9_BINTRAMODES)  /* 10 */
 #endif
 
+#if !CONFIG_SB8X8
 typedef enum {
   PARTITIONING_16X8 = 0,
   PARTITIONING_8X16,
@@ -176,6 +181,7 @@ typedef enum {
   PARTITIONING_4X4,
   NB_PARTITIONINGS,
 } SPLITMV_PARTITIONING_TYPE;
+#endif
 
 /* For keyframes, intra block modes are predicted by the (already decoded)
    modes for the Y blocks to the left and above us; for interframes, there
@@ -271,7 +277,9 @@ typedef struct {
 
   int mb_mode_context[MAX_REF_FRAMES];
 
+#if !CONFIG_SB8X8
   SPLITMV_PARTITIONING_TYPE partitioning;
+#endif
   unsigned char mb_skip_coeff;                                /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
   unsigned char need_to_clamp_mvs;
   unsigned char need_to_clamp_secondmv;
@@ -293,7 +301,7 @@ typedef struct {
 
 typedef struct {
   MB_MODE_INFO mbmi;
-  union b_mode_info bmi[16];
+  union b_mode_info bmi[16 >> (CONFIG_SB8X8 * 2)];
 } MODE_INFO;
 
 struct scale_factors {
@@ -368,7 +376,7 @@ typedef struct macroblockd {
   PARTITION_CONTEXT *above_seg_context;
   PARTITION_CONTEXT *left_seg_context;
 
-  /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
+  /* 0 (disable) 1 (enable) segmentation */
   unsigned char segmentation_enabled;
 
   /* 0 (do not update) 1 (update) the macroblock segmentation map. */
@@ -433,8 +441,11 @@ typedef struct macroblockd {
 
   int corrupted;
 
-  int sb_index;
-  int mb_index;   // Index of the MB in the SB (0..3)
+  int sb_index;   // index of 32x32 block inside the 64x64 block
+  int mb_index;   // index of 16x16 block inside the 32x32 block
+#if CONFIG_SB8X8
+  int b_index;    // index of 8x8 block inside the 16x16 block
+#endif
   int q_index;
 
 } MACROBLOCKD;
@@ -442,10 +453,10 @@ typedef struct macroblockd {
 static INLINE void update_partition_context(MACROBLOCKD *xd,
                                             BLOCK_SIZE_TYPE sb_type,
                                             BLOCK_SIZE_TYPE sb_size) {
-  int bsl = mi_width_log2(sb_size) - CONFIG_SB8X8, bs = 1 << bsl;
-  int bwl = mi_width_log2(sb_type) - CONFIG_SB8X8;
-  int bhl = mi_height_log2(sb_type) - CONFIG_SB8X8;
-  int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - CONFIG_SB8X8 - bsl;
+  int bsl = mi_width_log2(sb_size), bs = 1 << bsl;
+  int bwl = mi_width_log2(sb_type);
+  int bhl = mi_height_log2(sb_type);
+  int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl;
   int i;
   // skip macroblock partition
   if (bsl == 0)
@@ -481,9 +492,9 @@ static INLINE void update_partition_context(MACROBLOCKD *xd,
 
 static INLINE int partition_plane_context(MACROBLOCKD *xd,
                                           BLOCK_SIZE_TYPE sb_type) {
-  int bsl = mi_width_log2(sb_type) - CONFIG_SB8X8, bs = 1 << bsl;
+  int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
   int above = 0, left = 0, i;
-  int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl - CONFIG_SB8X8;
+  int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl;
 
   assert(mi_width_log2(sb_type) == mi_height_log2(sb_type));
   assert(bsl >= 0);
@@ -581,6 +592,7 @@ static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) {
           xd->mode_info_context->bmi[ib].as_mode.context :
 #endif
         xd->mode_info_context->bmi[ib].as_mode.first);
+#if !CONFIG_SB8X8
   } else if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
              xd->q_index < ACTIVE_HT) {
     const int ic = (ib & 10);
@@ -615,7 +627,8 @@ static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) {
     // Use 2D DCT
     tx_type = DCT_DCT;
 #endif
-  } else if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
+#endif  // !CONFIG_SB8X8
+  } else if (xd->mode_info_context->mbmi.mode <= TM_PRED &&
              xd->q_index < ACTIVE_HT) {
 #if USE_ADST_FOR_I16X16_4X4
 #if USE_ADST_PERIPHERY_ONLY
@@ -659,14 +672,17 @@ static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, int ib) {
 #endif
   if (ib >= (1 << (wb + hb)))  // no chroma adst
     return tx_type;
+#if !CONFIG_SB8X8
   if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
       xd->q_index < ACTIVE_HT8) {
     // TODO(rbultje): MB_PREDICTION_MODE / B_PREDICTION_MODE should be merged
     // or the relationship otherwise modified to address this type conversion.
     tx_type = txfm_map(pred_mode_conv(
            (MB_PREDICTION_MODE)xd->mode_info_context->bmi[ib].as_mode.first));
-  } else if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
-             xd->q_index < ACTIVE_HT8) {
+  } else
+#endif  // !CONFIG_SB8X8
+  if (xd->mode_info_context->mbmi.mode <= TM_PRED &&
+      xd->q_index < ACTIVE_HT8) {
 #if USE_ADST_FOR_I16X16_8X8
 #if USE_ADST_PERIPHERY_ONLY
     const int hmax = 1 << wb;
@@ -707,7 +723,7 @@ static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, int ib) {
 #endif
   if (ib >= (1 << (wb + hb)))
     return tx_type;
-  if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
+  if (xd->mode_info_context->mbmi.mode <= TM_PRED &&
       xd->q_index < ACTIVE_HT16) {
     tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
 #if USE_ADST_PERIPHERY_ONLY
@@ -738,7 +754,9 @@ void vp9_setup_block_dptrs(MACROBLOCKD *xd);
 static TX_SIZE get_uv_tx_size(const MACROBLOCKD *xd) {
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   const TX_SIZE size = mbmi->txfm_size;
+#if !CONFIG_SB8X8
   const MB_PREDICTION_MODE mode = mbmi->mode;
+#endif  // !CONFIG_SB8X8
 
   switch (mbmi->sb_type) {
     case BLOCK_SIZE_SB64X64:
@@ -750,6 +768,17 @@ static TX_SIZE get_uv_tx_size(const MACROBLOCKD *xd) {
         return TX_16X16;
       else
         return size;
+#if CONFIG_SB8X8
+    case BLOCK_SIZE_SB32X16:
+    case BLOCK_SIZE_SB16X32:
+    case BLOCK_SIZE_MB16X16:
+      if (size == TX_16X16)
+        return TX_8X8;
+      else
+        return size;
+    default:
+      return TX_4X4;
+#else  // CONFIG_SB8X8
     default:
       if (size == TX_16X16)
         return TX_8X8;
@@ -757,6 +786,7 @@ static TX_SIZE get_uv_tx_size(const MACROBLOCKD *xd) {
         return TX_4X4;
       else
         return size;
+#endif  // CONFIG_SB8X8
   }
 
   return size;
@@ -812,7 +842,10 @@ typedef void (*foreach_transformed_block_visitor)(int plane, int block,
                                                   void *arg);
 static INLINE void foreach_transformed_block_in_plane(
     const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane,
-    int is_split, foreach_transformed_block_visitor visit, void *arg) {
+#if !CONFIG_SB8X8
+    int is_split,
+#endif  // !CONFIG_SB8X8
+    foreach_transformed_block_visitor visit, void *arg) {
   const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
 
   // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
@@ -830,7 +863,10 @@ static INLINE void foreach_transformed_block_in_plane(
   // than the size of the subsampled data, or forced externally by the mb mode.
   const int ss_max = MAX(xd->plane[plane].subsampling_x,
                          xd->plane[plane].subsampling_y);
-  const int ss_txfrm_size = txfrm_size_b > ss_block_size || is_split
+  const int ss_txfrm_size = txfrm_size_b > ss_block_size
+#if !CONFIG_SB8X8
+                            || is_split
+#endif  // !CONFIG_SB8X8
                                 ? txfrm_size_b - ss_max * 2
                                 : txfrm_size_b;
   const int step = 1 << ss_txfrm_size;
@@ -847,17 +883,24 @@ static INLINE void foreach_transformed_block_in_plane(
 static INLINE void foreach_transformed_block(
     const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
     foreach_transformed_block_visitor visit, void *arg) {
+#if !CONFIG_SB8X8
   const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
   const int is_split =
       xd->mode_info_context->mbmi.txfm_size == TX_8X8 &&
       (mode == I8X8_PRED || mode == SPLITMV);
+#endif  // !CONFIG_SB8X8
   int plane;
 
   for (plane = 0; plane < MAX_MB_PLANE; plane++) {
+#if !CONFIG_SB8X8
     const int is_split_chroma = is_split &&
          xd->plane[plane].plane_type == PLANE_TYPE_UV;
+#endif  // !CONFIG_SB8X8
 
-    foreach_transformed_block_in_plane(xd, bsize, plane, is_split_chroma,
+    foreach_transformed_block_in_plane(xd, bsize, plane,
+#if !CONFIG_SB8X8
+                                       is_split_chroma,
+#endif  // !CONFIG_SB8X8
                                        visit, arg);
   }
 }
@@ -865,14 +908,19 @@ static INLINE void foreach_transformed_block(
 static INLINE void foreach_transformed_block_uv(
     const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
     foreach_transformed_block_visitor visit, void *arg) {
+#if !CONFIG_SB8X8
   const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
   const int is_split =
       xd->mode_info_context->mbmi.txfm_size == TX_8X8 &&
       (mode == I8X8_PRED || mode == SPLITMV);
+#endif  // !CONFIG_SB8X8
   int plane;
 
   for (plane = 1; plane < MAX_MB_PLANE; plane++) {
-    foreach_transformed_block_in_plane(xd, bsize, plane, is_split,
+    foreach_transformed_block_in_plane(xd, bsize, plane,
+#if !CONFIG_SB8X8
+                                       is_split,
+#endif  // !CONFIG_SB8X8
                                        visit, arg);
   }
 }
@@ -900,11 +948,16 @@ static INLINE void foreach_predicted_block_in_plane(
   int pred_w, pred_h;
 
   if (mode == SPLITMV) {
+#if CONFIG_SB8X8
+    pred_w = 0;
+    pred_h = 0;
+#else
     // 4x4 or 8x8
     const int is_4x4 =
         (xd->mode_info_context->mbmi.partitioning == PARTITIONING_4X4);
     pred_w = is_4x4 ? 0 : 1 >> xd->plane[plane].subsampling_x;
     pred_h = is_4x4 ? 0 : 1 >> xd->plane[plane].subsampling_y;
+#endif
   } else {
     pred_w = bw;
     pred_h = bh;
@@ -961,6 +1014,74 @@ static uint8_t* raster_block_offset_uint8(MACROBLOCKD *xd,
   return base + raster_block_offset(xd, bsize, plane, block, stride);
 }
 
+static int txfrm_block_to_raster_block(MACROBLOCKD *xd,
+                                       BLOCK_SIZE_TYPE bsize,
+                                       int plane, int block,
+                                       int ss_txfrm_size) {
+  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int txwl = ss_txfrm_size / 2;  // log2 of the transform's side length
+  const int tx_cols_lg2 = bwl - txwl;  // log2 of transforms per row
+  const int tx_cols = 1 << tx_cols_lg2;
+  const int raster_mb = block >> ss_txfrm_size;  // ordinal of the transform
+  const int x = (raster_mb & (tx_cols - 1)) << (txwl);  // column offset
+  const int y = raster_mb >> tx_cols_lg2 << (txwl);  // row offset
+  return x + (y << bwl);  // raster-order index: y * (1 << bwl) + x
+}
+
+static void txfrm_block_to_raster_xy(MACROBLOCKD *xd,
+                                     BLOCK_SIZE_TYPE bsize,
+                                     int plane, int block,
+                                     int ss_txfrm_size,
+                                     int *x, int *y) {
+  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int txwl = ss_txfrm_size / 2;  // log2 of the transform's side length
+  const int tx_cols_lg2 = bwl - txwl;  // log2 of transforms per row
+  const int tx_cols = 1 << tx_cols_lg2;
+  const int raster_mb = block >> ss_txfrm_size;  // ordinal of the transform
+  *x = (raster_mb & (tx_cols - 1)) << (txwl);  // out: column offset
+  *y = raster_mb >> tx_cols_lg2 << (txwl);  // out: row offset
+}
+
+static TX_SIZE tx_size_for_plane(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+                                 int plane) {
+  // TODO(jkoleszar): This duplicates a ton of code, but we're going to be
+  // moving this to a per-plane lookup shortly, and this will go away then.
+  if (!plane) {  // plane 0: use the block's transform size as-is
+    return xd->mode_info_context->mbmi.txfm_size;
+  } else {  // other planes: may need a smaller transform, computed below
+    const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
+#if !CONFIG_SB8X8
+    const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
+    const int is_split =
+        xd->mode_info_context->mbmi.txfm_size == TX_8X8 &&
+        (mode == I8X8_PRED || mode == SPLITMV);
+#endif
+
+    // block and transform sizes, as log2 of the number of 4x4 blocks ("*_b")
+    // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
+    const TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+    const int block_size_b = bw + bh;
+    const int txfrm_size_b = tx_size * 2;
+
+    // subsampled size of the block (same "*_b" log2 units)
+    const int ss_sum = xd->plane[plane].subsampling_x +
+                       xd->plane[plane].subsampling_y;
+    const int ss_block_size = block_size_b - ss_sum;
+
+    // size of the transform to use. scale the transform down if it's larger
+    // than the size of the subsampled data, or forced externally by the mb mode
+    const int ss_max = MAX(xd->plane[plane].subsampling_x,
+                           xd->plane[plane].subsampling_y);
+    const int ss_txfrm_size = txfrm_size_b > ss_block_size
+#if !CONFIG_SB8X8
+                            || is_split
+#endif  // !CONFIG_SB8X8
+                                  ? txfrm_size_b - ss_max * 2
+                                  : txfrm_size_b;
+    return (TX_SIZE)(ss_txfrm_size / 2);  // "*_b" units back to TX_SIZE
+  }
+}
+
 #if CONFIG_CODE_ZEROGROUP
 static int get_zpc_used(TX_SIZE tx_size) {
   return (tx_size >= TX_16X16);
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 8d5577f24..ed5441cc1 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -16,6 +16,17 @@
 #include "vpx_mem/vpx_mem.h"
 
 static const unsigned int kf_y_mode_cts[8][VP9_YMODES] = {
+#if CONFIG_SB8X8
+  /* DC V   H  D45 135 117 153 D27 D63 TM i4X4 */
+  {12,  6,  5,  5,  5,  5,  5,  5,  5,  2, 200},
+  {25, 13, 13,  7,  7,  7,  7,  7,  7,  6, 160},
+  {31, 17, 18,  8,  8,  8,  8,  8,  8,  9, 139},
+  {40, 22, 23,  8,  8,  8,  8,  8,  8, 12, 116},
+  {53, 26, 28,  8,  8,  8,  8,  8,  8, 13,  94},
+  {68, 33, 35,  8,  8,  8,  8,  8,  8, 17,  68},
+  {78, 38, 38,  8,  8,  8,  8,  8,  8, 19,  52},
+  {89, 42, 42,  8,  8,  8,  8,  8,  8, 21,  34},
+#else
   /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 i4X4 */
   {12,  6,  5,  5,  5,  5,  5,  5,  5,  2, 22, 200},
   {25, 13, 13,  7,  7,  7,  7,  7,  7,  6, 27, 160},
@@ -25,11 +36,17 @@ static const unsigned int kf_y_mode_cts[8][VP9_YMODES] = {
   {68, 33, 35,  8,  8,  8,  8,  8,  8, 17, 20,  68},
   {78, 38, 38,  8,  8,  8,  8,  8,  8, 19, 16,  52},
   {89, 42, 42,  8,  8,  8,  8,  8,  8, 21, 12,  34},
+#endif
 };
 
 static const unsigned int y_mode_cts  [VP9_YMODES] = {
+#if CONFIG_SB8X8
+  /* DC V   H  D45 135 117 153 D27 D63 TM i4X4 */
+  98, 19, 15, 14, 14, 14, 14, 12, 12, 13, 70
+#else
   /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 i4X4 */
   98, 19, 15, 14, 14, 14, 14, 12, 12, 13, 16, 70
+#endif
 };
 
 static const unsigned int uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
@@ -44,14 +61,18 @@ static const unsigned int uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
   { 150, 15, 10, 10, 10, 10, 10, 75, 10,  6}, /* D27 */
   { 150, 15, 10, 10, 10, 10, 10, 10, 75,  6}, /* D63 */
   { 160, 30, 30, 10, 10, 10, 10, 10, 10, 16}, /* TM */
+#if !CONFIG_SB8X8
   { 132, 46, 40, 10, 10, 10, 10, 10, 10, 18}, /* i8x8 - never used */
+#endif
   { 150, 35, 41, 10, 10, 10, 10, 10, 10, 10}, /* i4X4 */
 };
 
+#if !CONFIG_SB8X8
 static const unsigned int i8x8_mode_cts  [VP9_I8X8_MODES] = {
   /* DC V  H D45 135 117 153 D27 D63  TM */
   73, 49, 61, 30, 30, 30, 30, 30, 30, 13
 };
+#endif
 
 static const unsigned int kf_uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
   // DC   V   H  D45 135 117 153 D27 D63 TM
@@ -65,7 +86,9 @@ static const unsigned int kf_uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
   { 102, 33, 20, 20, 20, 20, 20, 64, 20, 14}, /* D27 */
   { 102, 33, 20, 20, 20, 20, 20, 20, 64, 14}, /* D63 */
   { 132, 36, 30, 20, 20, 20, 20, 20, 20, 18}, /* TM */
+#if !CONFIG_SB8X8
   { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* i8x8 - never used */
+#endif
   { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* I4X4 */
 };
 
@@ -123,6 +146,7 @@ const vp9_prob vp9_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP9_SUBMVREFS - 1] = {
   { 208, 1, 1  }
 };
 
+#if !CONFIG_SB8X8
 vp9_mbsplit vp9_mbsplits [VP9_NUMMBSPLITS] = {
   {
     0,  0,  0,  0,
@@ -150,9 +174,17 @@ vp9_mbsplit vp9_mbsplits [VP9_NUMMBSPLITS] = {
 const int vp9_mbsplit_count [VP9_NUMMBSPLITS] = { 2, 2, 4, 16};
 
 const vp9_prob vp9_mbsplit_probs [VP9_NUMMBSPLITS - 1] = { 110, 111, 150};
+#endif
 
 const vp9_prob vp9_partition_probs[NUM_PARTITION_CONTEXTS]
                                   [PARTITION_TYPES - 1] = {
+#if CONFIG_SB8X8
+  // FIXME(jingning,rbultje) put real probabilities here
+  {202, 162, 107},
+  {16,  2,   169},
+  {3,   246,  19},
+  {104, 90,  134},
+#endif
   {202, 162, 107},
   {16,  2,   169},
   {3,   246,  19},
@@ -228,8 +260,12 @@ const vp9_tree_index vp9_ymode_tree[VP9_YMODES * 2 - 2] = {
   -D27_PRED, -D63_PRED,
   16, 18,
   -V_PRED, -H_PRED,
+#if CONFIG_SB8X8
+  -TM_PRED, -I4X4_PRED
+#else
   -TM_PRED, 20,
   -I4X4_PRED, -I8X8_PRED
+#endif
 };
 
 const vp9_tree_index vp9_kf_ymode_tree[VP9_YMODES * 2 - 2] = {
@@ -242,10 +278,15 @@ const vp9_tree_index vp9_kf_ymode_tree[VP9_YMODES * 2 - 2] = {
   -D27_PRED, -D63_PRED,
   16, 18,
   -V_PRED, -H_PRED,
+#if CONFIG_SB8X8
+  -TM_PRED, -I4X4_PRED
+#else
   -TM_PRED, 20,
   -I4X4_PRED, -I8X8_PRED
+#endif
 };
 
+#if !CONFIG_SB8X8
 const vp9_tree_index vp9_i8x8_mode_tree[VP9_I8X8_MODES * 2 - 2] = {
   2, 14,
   -DC_PRED, 4,
@@ -257,6 +298,7 @@ const vp9_tree_index vp9_i8x8_mode_tree[VP9_I8X8_MODES * 2 - 2] = {
   -V_PRED, 16,
   -H_PRED, -TM_PRED
 };
+#endif
 
 const vp9_tree_index vp9_uv_mode_tree[VP9_UV_MODES * 2 - 2] = {
   2, 14,
@@ -270,11 +312,13 @@ const vp9_tree_index vp9_uv_mode_tree[VP9_UV_MODES * 2 - 2] = {
   -H_PRED, -TM_PRED
 };
 
+#if !CONFIG_SB8X8
 const vp9_tree_index vp9_mbsplit_tree[6] = {
   -PARTITIONING_4X4,   2,
   -PARTITIONING_8X8,   4,
   -PARTITIONING_16X8, -PARTITIONING_8X16,
 };
+#endif
 
 const vp9_tree_index vp9_mv_ref_tree[8] = {
   -ZEROMV, 2,
@@ -308,8 +352,10 @@ struct vp9_token vp9_sb_ymode_encodings[VP9_I32X32_MODES];
 struct vp9_token vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
 struct vp9_token vp9_kf_ymode_encodings[VP9_YMODES];
 struct vp9_token vp9_uv_mode_encodings[VP9_UV_MODES];
+#if !CONFIG_SB8X8
 struct vp9_token vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
 struct vp9_token vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
+#endif
 
 struct vp9_token vp9_mv_ref_encoding_array[VP9_MVREFS];
 struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_MVREFS];
@@ -340,12 +386,16 @@ void vp9_init_mbmode_probs(VP9_COMMON *x) {
                                      bct, uv_mode_cts[i], 0);
   }
 
+#if !CONFIG_SB8X8
   vp9_tree_probs_from_distribution(vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob,
                                    bct, i8x8_mode_cts, 0);
+#endif
 
   vpx_memcpy(x->fc.sub_mv_ref_prob, vp9_sub_mv_ref_prob2,
              sizeof(vp9_sub_mv_ref_prob2));
+#if !CONFIG_SB8X8
   vpx_memcpy(x->fc.mbsplit_prob, vp9_mbsplit_probs, sizeof(vp9_mbsplit_probs));
+#endif
   vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob,
              sizeof(vp9_switchable_interp_prob));
 
@@ -449,8 +499,10 @@ void vp9_entropy_mode_init() {
   vp9_tokens_from_tree(vp9_sb_ymode_encodings, vp9_sb_ymode_tree);
   vp9_tokens_from_tree(vp9_sb_kf_ymode_encodings, vp9_sb_kf_ymode_tree);
   vp9_tokens_from_tree(vp9_uv_mode_encodings,  vp9_uv_mode_tree);
+#if !CONFIG_SB8X8
   vp9_tokens_from_tree(vp9_i8x8_mode_encodings,  vp9_i8x8_mode_tree);
   vp9_tokens_from_tree(vp9_mbsplit_encodings, vp9_mbsplit_tree);
+#endif
   vp9_tokens_from_tree(vp9_switchable_interp_encodings,
                        vp9_switchable_interp_tree);
   vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree);
@@ -629,9 +681,11 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
   update_mode_probs(VP9_NKF_BINTRAMODES, vp9_bmode_tree,
                     fc->bmode_counts, fc->pre_bmode_prob,
                     fc->bmode_prob, 0);
+#if !CONFIG_SB8X8
   update_mode_probs(VP9_I8X8_MODES,
                     vp9_i8x8_mode_tree, fc->i8x8_mode_counts,
                     fc->pre_i8x8_mode_prob, fc->i8x8_mode_prob, 0);
+#endif
 
   for (i = 0; i < SUBMVREF_COUNT; ++i)
     update_mode_probs(VP9_SUBMVREFS,
@@ -639,9 +693,11 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
                       fc->pre_sub_mv_ref_prob[i], fc->sub_mv_ref_prob[i],
                       LEFT4X4);
 
+#if !CONFIG_SB8X8
   update_mode_probs(VP9_NUMMBSPLITS, vp9_mbsplit_tree,
                     fc->mbsplit_counts, fc->pre_mbsplit_prob,
                     fc->mbsplit_prob, 0);
+#endif
 #if CONFIG_COMP_INTERINTRA_PRED
   if (cm->use_interintra) {
     int factor, interintra_prob, count;
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index 665569578..24f988f25 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -15,7 +15,9 @@
 #include "vp9/common/vp9_treecoder.h"
 
 #define SUBMVREF_COUNT 5
+#if !CONFIG_SB8X8
 #define VP9_NUMMBSPLITS 4
+#endif
 
 #if CONFIG_COMP_INTERINTRA_PRED
 #define VP9_DEF_INTERINTRA_PROB 248
@@ -24,6 +26,7 @@
 #define SEPARATE_INTERINTRA_UV  0
 #endif
 
+#if !CONFIG_SB8X8
 typedef const int vp9_mbsplit[16];
 
 extern vp9_mbsplit vp9_mbsplits[VP9_NUMMBSPLITS];
@@ -31,6 +34,7 @@ extern vp9_mbsplit vp9_mbsplits[VP9_NUMMBSPLITS];
 extern const int vp9_mbsplit_count[VP9_NUMMBSPLITS];    /* # of subsets */
 
 extern const vp9_prob vp9_mbsplit_probs[VP9_NUMMBSPLITS - 1];
+#endif
 
 extern int vp9_mv_cont(const int_mv *l, const int_mv *a);
 
@@ -48,8 +52,10 @@ extern const vp9_tree_index  vp9_kf_ymode_tree[];
 extern const vp9_tree_index  vp9_uv_mode_tree[];
 #define vp9_sb_ymode_tree vp9_uv_mode_tree
 #define vp9_sb_kf_ymode_tree vp9_uv_mode_tree
+#if !CONFIG_SB8X8
 extern const vp9_tree_index  vp9_i8x8_mode_tree[];
 extern const vp9_tree_index  vp9_mbsplit_tree[];
+#endif
 extern const vp9_tree_index  vp9_mv_ref_tree[];
 extern const vp9_tree_index  vp9_sb_mv_ref_tree[];
 extern const vp9_tree_index  vp9_sub_mv_ref_tree[];
@@ -60,9 +66,11 @@ extern struct vp9_token vp9_ymode_encodings[VP9_YMODES];
 extern struct vp9_token vp9_sb_ymode_encodings[VP9_I32X32_MODES];
 extern struct vp9_token vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
 extern struct vp9_token vp9_kf_ymode_encodings[VP9_YMODES];
-extern struct vp9_token vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
 extern struct vp9_token vp9_uv_mode_encodings[VP9_UV_MODES];
+#if !CONFIG_SB8X8
+extern struct vp9_token vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
 extern struct vp9_token vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
+#endif
 
 /* Inter mode values do not start at zero */
 
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index b72b41e95..3f00ba496 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -47,6 +47,6 @@ typedef enum PARTITION_TYPE {
 } PARTITION_TYPE;
 
 #define PARTITION_PLOFFSET   4  // number of probability models per block size
-#define NUM_PARTITION_CONTEXTS (2 * PARTITION_PLOFFSET)
+#define NUM_PARTITION_CONTEXTS ((2 + CONFIG_SB8X8) * PARTITION_PLOFFSET)
 
 #endif  // VP9_COMMON_VP9_ENUMS_H_
diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h
index 085454512..df1ab73e8 100644
--- a/vp9/common/vp9_findnearmv.h
+++ b/vp9/common/vp9_findnearmv.h
@@ -74,11 +74,13 @@ vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
                            vp9_prob p[VP9_MVREFS - 1],
                            const int context);
 
+#if !CONFIG_SB8X8
 extern const uint8_t vp9_mbsplit_offset[4][16];
+#endif
 
 static int left_block_mv(const MACROBLOCKD *xd,
                          const MODE_INFO *cur_mb, int b) {
-  if (!(b & 3)) {
+  if (!(b & (3 >> CONFIG_SB8X8))) {
     if (!xd->left_available)
       return 0;
 
@@ -88,7 +90,7 @@ static int left_block_mv(const MACROBLOCKD *xd,
     if (cur_mb->mbmi.mode != SPLITMV)
       return cur_mb->mbmi.mv[0].as_int;
 
-    b += 4;
+    b += 4 >> CONFIG_SB8X8;
   }
 
   return (cur_mb->bmi + b - 1)->as_mv[0].as_int;
@@ -96,7 +98,7 @@ static int left_block_mv(const MACROBLOCKD *xd,
 
 static int left_block_second_mv(const MACROBLOCKD *xd,
                                 const MODE_INFO *cur_mb, int b) {
-  if (!(b & 3)) {
+  if (!(b & (3 >> CONFIG_SB8X8))) {
     if (!xd->left_available)
       return 0;
 
@@ -106,7 +108,7 @@ static int left_block_second_mv(const MACROBLOCKD *xd,
     if (cur_mb->mbmi.mode != SPLITMV)
       return cur_mb->mbmi.second_ref_frame > 0 ?
           cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int;
-    b += 4;
+    b += 4 >> CONFIG_SB8X8;
   }
 
   return cur_mb->mbmi.second_ref_frame > 0 ?
@@ -115,72 +117,85 @@ static int left_block_second_mv(const MACROBLOCKD *xd,
 }
 
 static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
-  if (!(b >> 2)) {
+  if (!(b >> (2 >> CONFIG_SB8X8))) {  // first row: b < 4, or b < 2 if SB8X8
     /* On top edge, get from MB above us */
     cur_mb -= mi_stride;
 
     if (cur_mb->mbmi.mode != SPLITMV)
       return cur_mb->mbmi.mv[0].as_int;
-    b += 16;
+    b += 16 >> (2 * CONFIG_SB8X8);  // the row step below then hits its last row
   }
 
-  return (cur_mb->bmi + b - 4)->as_mv[0].as_int;
+  return (cur_mb->bmi + b - (4 >> CONFIG_SB8X8))->as_mv[0].as_int;
 }
 
 static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
-  if (!(b >> 2)) {
+  if (!(b >> (2 >> CONFIG_SB8X8))) {  // first row: b < 4, or b < 2 if SB8X8
     /* On top edge, get from MB above us */
     cur_mb -= mi_stride;
 
     if (cur_mb->mbmi.mode != SPLITMV)
       return cur_mb->mbmi.second_ref_frame > 0 ?
           cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int;
-    b += 16;
+    b += 16 >> (2 * CONFIG_SB8X8);  // the row step below then hits its last row
   }
 
   return cur_mb->mbmi.second_ref_frame > 0 ?
-      (cur_mb->bmi + b - 4)->as_mv[1].as_int :
-      (cur_mb->bmi + b - 4)->as_mv[0].as_int;
+      (cur_mb->bmi + b - (4 >> CONFIG_SB8X8))->as_mv[1].as_int :
+      (cur_mb->bmi + b - (4 >> CONFIG_SB8X8))->as_mv[0].as_int;
 }
 
 static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
-  if (!(b & 3)) {
+#if CONFIG_SB8X8
+  // FIXME(rbultje, jingning): temporary hack because jenkins doesn't
+  // understand the !(b & 1) form of this condition. This will go away soon.
+  if (b == 0 || b == 2) {
+#else
+  if (!(b & (3 >> CONFIG_SB8X8))) {
+#endif
     /* On L edge, get from MB to left of us */
     --cur_mb;
 
-    if (cur_mb->mbmi.mode < I8X8_PRED) {
+    if (cur_mb->mbmi.mode <= TM_PRED) {  // modes ordered before I8X8/I4X4
       return pred_mode_conv(cur_mb->mbmi.mode);
+#if !CONFIG_SB8X8
     } else if (cur_mb->mbmi.mode == I8X8_PRED) {
       return pred_mode_conv(
           (MB_PREDICTION_MODE)(cur_mb->bmi + 3 + b)->as_mode.first);
+#endif  // !CONFIG_SB8X8
     } else if (cur_mb->mbmi.mode == I4X4_PRED) {
-      return ((cur_mb->bmi + 3 + b)->as_mode.first);
+      return ((cur_mb->bmi + (3 >> CONFIG_SB8X8) + b)->as_mode.first);
     } else {
       return B_DC_PRED;
     }
   }
+#if CONFIG_SB8X8
+  assert(b == 1 || b == 3);  // b == 0 and b == 2 returned above
+#endif
   return (cur_mb->bmi + b - 1)->as_mode.first;
 }
 
 static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
                                           int b, int mi_stride) {
-  if (!(b >> 2)) {
+  if (!(b >> (2 >> CONFIG_SB8X8))) {  // first row: b < 4, or b < 2 if SB8X8
     /* On top edge, get from MB above us */
     cur_mb -= mi_stride;
 
-    if (cur_mb->mbmi.mode < I8X8_PRED) {
+    if (cur_mb->mbmi.mode <= TM_PRED) {  // modes ordered before I8X8/I4X4
       return pred_mode_conv(cur_mb->mbmi.mode);
+#if !CONFIG_SB8X8
     } else if (cur_mb->mbmi.mode == I8X8_PRED) {
       return pred_mode_conv(
           (MB_PREDICTION_MODE)(cur_mb->bmi + 12 + b)->as_mode.first);
+#endif
     } else if (cur_mb->mbmi.mode == I4X4_PRED) {
-      return ((cur_mb->bmi + 12 + b)->as_mode.first);
+      return ((cur_mb->bmi + (CONFIG_SB8X8 ? 2 : 12) + b)->as_mode.first);
     } else {
       return B_DC_PRED;
     }
   }
 
-  return (cur_mb->bmi + b - 4)->as_mode.first;
+  return (cur_mb->bmi + b - (4 >> CONFIG_SB8X8))->as_mode.first;  // row above
 }
 
 #endif  // VP9_COMMON_VP9_FINDNEARMV_H_
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index ed0c35463..edb0c540b 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -27,7 +27,9 @@ static void lf_init_lut(loop_filter_info_n *lfi) {
   lfi->mode_lf_lut[H_PRED] = 1;
   lfi->mode_lf_lut[TM_PRED] = 1;
   lfi->mode_lf_lut[I4X4_PRED]  = 0;
+#if !CONFIG_SB8X8
   lfi->mode_lf_lut[I8X8_PRED] = 0;
+#endif
   lfi->mode_lf_lut[ZEROMV]  = 1;
   lfi->mode_lf_lut[NEARESTMV] = 2;
   lfi->mode_lf_lut[NEARMV] = 2;
@@ -165,10 +167,14 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm,
 // the MB uses a prediction size of 16x16 and either 16x16 transform
 // is used or there is no residue at all.
 static int mb_lf_skip(const MB_MODE_INFO *const mbmi) {
-  const MB_PREDICTION_MODE mode = mbmi->mode;
   const int skip_coef = mbmi->mb_skip_coeff;
   const int tx_size = mbmi->txfm_size;
+#if CONFIG_SB8X8
+  return mbmi->sb_type >= BLOCK_SIZE_MB16X16 &&  // block size test, not mode
+#else  // legacy path: rule out the sub-16x16 prediction modes by name
+  const MB_PREDICTION_MODE mode = mbmi->mode;
   return mode != I4X4_PRED && mode != I8X8_PRED && mode != SPLITMV &&
+#endif
          (tx_size >= TX_16X16 || skip_coef);
 }
 
@@ -220,7 +226,13 @@ static void lpf_mb(VP9_COMMON *cm, const MODE_INFO *mi,
 
       if (!skip_lf) {
         if (tx_size >= TX_8X8) {
-          if (tx_size == TX_8X8 && (mode == I8X8_PRED || mode == SPLITMV))
+          if (tx_size == TX_8X8 &&
+#if CONFIG_SB8X8
+              (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16)
+#else
+              (mode == I8X8_PRED || mode == SPLITMV)
+#endif
+              )
             vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr,
                                   y_stride, uv_stride, &lfi);
           else
@@ -244,7 +256,13 @@ static void lpf_mb(VP9_COMMON *cm, const MODE_INFO *mi,
 
       if (!skip_lf) {
         if (tx_size >= TX_8X8) {
-          if (tx_size == TX_8X8 && (mode == I8X8_PRED || mode == SPLITMV))
+          if (tx_size == TX_8X8 &&
+#if CONFIG_SB8X8
+              (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16)
+#else
+              (mode == I8X8_PRED || mode == SPLITMV)
+#endif
+              )
             vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr,
                                   y_stride, uv_stride, &lfi);
           else
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index b6ccb8bd9..7a7ebe64f 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -13,6 +13,11 @@
 #define MVREF_NEIGHBOURS 8
 
 #if CONFIG_SB8X8
+static int b_mv_ref_search[MVREF_NEIGHBOURS][2] = {  // sb_type below MB16X16
+  {0, -1}, {-1, 0}, {-1, -1}, {0, -2},
+  {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}
+};  // NOTE(review): presumably neighbour offsets; confirm units and x/y order
+
 static int mb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
     {0, -1}, {-1, 0}, {-1, -1}, {0, -3},
     {-3, 0}, {-1, -3}, {-3, -1}, {-3, -3}
@@ -185,8 +190,15 @@ void vp9_find_mv_refs(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here,
     mv_ref_search = sb64_mv_ref_search;
   } else if (mbmi->sb_type >= BLOCK_SIZE_SB32X32) {
     mv_ref_search = sb_mv_ref_search;
+#if CONFIG_SB8X8
+  } else if (mbmi->sb_type >= BLOCK_SIZE_MB16X16) {
+    mv_ref_search = mb_mv_ref_search;
+  } else {
+    mv_ref_search = b_mv_ref_search;
+#else
   } else {
     mv_ref_search = mb_mv_ref_search;
+#endif
   }
 
   // We first scan for candidate vectors that match the current reference frame
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 211783e51..bb873c185 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -59,9 +59,13 @@ typedef struct frame_contexts {
   vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
   vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1];
   vp9_prob uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
+#if !CONFIG_SB8X8
   vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
+#endif
   vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
+#if !CONFIG_SB8X8
   vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
+#endif
   vp9_prob partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
 
   vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES];
@@ -81,17 +85,25 @@ typedef struct frame_contexts {
   vp9_prob pre_ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
   vp9_prob pre_sb_ymode_prob[VP9_I32X32_MODES - 1];
   vp9_prob pre_uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
+#if !CONFIG_SB8X8
   vp9_prob pre_i8x8_mode_prob[VP9_I8X8_MODES - 1];
+#endif
   vp9_prob pre_sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
+#if !CONFIG_SB8X8
   vp9_prob pre_mbsplit_prob[VP9_NUMMBSPLITS - 1];
+#endif
   vp9_prob pre_partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
   unsigned int bmode_counts[VP9_NKF_BINTRAMODES];
   unsigned int ymode_counts[VP9_YMODES];   /* interframe intra mode probs */
   unsigned int sb_ymode_counts[VP9_I32X32_MODES];
   unsigned int uv_mode_counts[VP9_YMODES][VP9_UV_MODES];
+#if !CONFIG_SB8X8
   unsigned int i8x8_mode_counts[VP9_I8X8_MODES];   /* interframe intra probs */
+#endif
   unsigned int sub_mv_ref_counts[SUBMVREF_COUNT][VP9_SUBMVREFS];
+#if !CONFIG_SB8X8
   unsigned int mbsplit_counts[VP9_NUMMBSPLITS];
+#endif
   unsigned int partition_counts[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 
   vp9_coeff_probs pre_coef_probs_4x4[BLOCK_TYPES];
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 6efe2465e..042006354 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -265,19 +265,27 @@ static INLINE int round_mv_comp_q4(int value) {
   return (value < 0 ? value - 2 : value + 2) / 4;
 }
 
+#if CONFIG_SB8X8
+#define IDX1 2
+#define IDX2 3
+#else
+#define IDX1 4
+#define IDX2 5
+#endif
+
 static int mi_mv_pred_row_q4(MACROBLOCKD *mb, int off, int idx) {
   const int temp = mb->mode_info_context->bmi[off + 0].as_mv[idx].as_mv.row +
                    mb->mode_info_context->bmi[off + 1].as_mv[idx].as_mv.row +
-                   mb->mode_info_context->bmi[off + 4].as_mv[idx].as_mv.row +
-                   mb->mode_info_context->bmi[off + 5].as_mv[idx].as_mv.row;
+                   mb->mode_info_context->bmi[off + IDX1].as_mv[idx].as_mv.row +
+                   mb->mode_info_context->bmi[off + IDX2].as_mv[idx].as_mv.row;
   return round_mv_comp_q4(temp);
 }
 
 static int mi_mv_pred_col_q4(MACROBLOCKD *mb, int off, int idx) {
   const int temp = mb->mode_info_context->bmi[off + 0].as_mv[idx].as_mv.col +
                    mb->mode_info_context->bmi[off + 1].as_mv[idx].as_mv.col +
-                   mb->mode_info_context->bmi[off + 4].as_mv[idx].as_mv.col +
-                   mb->mode_info_context->bmi[off + 5].as_mv[idx].as_mv.col;
+                   mb->mode_info_context->bmi[off + IDX1].as_mv[idx].as_mv.col +
+                   mb->mode_info_context->bmi[off + IDX2].as_mv[idx].as_mv.col;
   return round_mv_comp_q4(temp);
 }
 
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 4b62c1cce..a0155d9a9 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -15,25 +15,6 @@
 #include "vp9/common/vp9_reconintra.h"
 #include "vpx_mem/vpx_mem.h"
 
-// Using multiplication and shifting instead of division in diagonal prediction.
-// iscale table is calculated from ((1 << 16) + (i + 2) / 2) / (i+2) and used as
-// ((A + B) * iscale[i] + (1 << 15)) >> 16;
-// where A and B are weighted pixel values.
-static const unsigned int iscale[64] = {
-  32768, 21845, 16384, 13107, 10923,  9362,  8192,  7282,
-   6554,  5958,  5461,  5041,  4681,  4369,  4096,  3855,
-   3641,  3449,  3277,  3121,  2979,  2849,  2731,  2621,
-   2521,  2427,  2341,  2260,  2185,  2114,  2048,  1986,
-   1928,  1872,  1820,  1771,  1725,  1680,  1638,  1598,
-   1560,  1524,  1489,  1456,  1425,  1394,  1365,  1337,
-   1311,  1285,  1260,  1237,  1214,  1192,  1170,  1150,
-   1130,  1111,  1092,  1074,  1057,  1040,  1024,  1008,
-};
-
-static INLINE int iscale_round(int value, int i) {
-  return ROUND_POWER_OF_TWO(value * iscale[i], 16);
-}
-
 static void d27_predictor(uint8_t *ypred_ptr, int y_stride,
                           int bw, int bh,
                           uint8_t *yabove_row, uint8_t *yleft_col) {
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index c51d0b243..474250cf7 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -65,9 +65,11 @@ static MB_PREDICTION_MODE read_kf_mb_ymode(vp9_reader *r, const vp9_prob *p) {
   return (MB_PREDICTION_MODE)treed_read(r, vp9_kf_ymode_tree, p);
 }
 
+#if !CONFIG_SB8X8
 static int read_i8x8_mode(vp9_reader *r, const vp9_prob *p) {
   return treed_read(r, vp9_i8x8_mode_tree, p);
 }
+#endif
 
 static MB_PREDICTION_MODE read_uv_mode(vp9_reader *r, const vp9_prob *p) {
   return (MB_PREDICTION_MODE)treed_read(r, vp9_uv_mode_tree, p);
@@ -161,6 +163,7 @@ static void kfread_modes(VP9D_COMP *pbi, MODE_INFO *m,
     }
   }
 
+#if !CONFIG_SB8X8
   if (m->mbmi.mode == I8X8_PRED) {
     int i;
     for (i = 0; i < 4; ++i) {
@@ -175,14 +178,25 @@ static void kfread_modes(VP9D_COMP *pbi, MODE_INFO *m,
   }
 
   // chroma mode
-  if (m->mbmi.mode != I8X8_PRED) {
+  if (m->mbmi.mode != I8X8_PRED)
+#endif
+  {
     m->mbmi.uv_mode = read_uv_mode(r, cm->kf_uv_mode_prob[m->mbmi.mode]);
   }
 
   if (cm->txfm_mode == TX_MODE_SELECT &&
       !m->mbmi.mb_skip_coeff &&
-      m->mbmi.mode <= I8X8_PRED) {
+#if CONFIG_SB8X8
+      m->mbmi.mode != I4X4_PRED
+#else
+      m->mbmi.mode <= I8X8_PRED
+#endif
+      ) {
+#if CONFIG_SB8X8
+    const int allow_16x16 = m->mbmi.sb_type >= BLOCK_SIZE_MB16X16;
+#else
     const int allow_16x16 = m->mbmi.mode != I8X8_PRED;
+#endif
     const int allow_32x32 = m->mbmi.sb_type >= BLOCK_SIZE_SB32X32;
     m->mbmi.txfm_size = select_txfm_size(cm, r, allow_16x16, allow_32x32);
   } else if (cm->txfm_mode >= ALLOW_32X32 &&
@@ -767,19 +781,29 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
     mbmi->uv_mode = DC_PRED;
     switch (mbmi->mode) {
       case SPLITMV: {
+#if CONFIG_SB8X8
+        const int num_p = 4;
+#else
         const int s = treed_read(r, vp9_mbsplit_tree, cm->fc.mbsplit_prob);
         const int num_p = vp9_mbsplit_count[s];
+#endif
         int j = 0;
 
+#if !CONFIG_SB8X8
         cm->fc.mbsplit_counts[s]++;
-        mbmi->need_to_clamp_mvs = 0;
         mbmi->partitioning = s;
+#endif
+        mbmi->need_to_clamp_mvs = 0;
         do {  // for each subset j
           int_mv leftmv, abovemv, second_leftmv, second_abovemv;
           int_mv blockmv, secondmv;
           int mv_contz;
           int blockmode;
+#if CONFIG_SB8X8
+          int k = j;
+#else
           int k = vp9_mbsplit_offset[s][j];  // first block in subset j
+#endif
 
           leftmv.as_int = left_block_mv(xd, mi, k);
           abovemv.as_int = above_block_mv(mi, k, mis);
@@ -851,6 +875,7 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
           }
           */
 
+#if !CONFIG_SB8X8
           {
             /* Fill (uniform) modes, mvs of jth subset.
              Must do it here because ensuing subsets can
@@ -866,12 +891,12 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
               fill_offset++;
             } while (--fill_count);
           }
-
+#endif
         } while (++j < num_p);
       }
 
-      mv0->as_int = mi->bmi[15].as_mv[0].as_int;
-      mv1->as_int = mi->bmi[15].as_mv[1].as_int;
+      mv0->as_int = mi->bmi[15 >> (2 * CONFIG_SB8X8)].as_mv[0].as_int;
+      mv1->as_int = mi->bmi[15 >> (2 * CONFIG_SB8X8)].as_mv[1].as_int;
 
       break;  /* done with SPLITMV */
 
@@ -957,6 +982,7 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
       } while (++j < 16);
     }
 
+#if !CONFIG_SB8X8
     if (mbmi->mode == I8X8_PRED) {
       int i;
       for (i = 0; i < 4; i++) {
@@ -969,7 +995,9 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
         mi->bmi[ib + 5].as_mode.first = mode8x8;
         cm->fc.i8x8_mode_counts[mode8x8]++;
       }
-    } else {
+    } else
+#endif
+    {
       mbmi->uv_mode = read_uv_mode(r, cm->fc.uv_mode_prob[mbmi->mode]);
       cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
     }
@@ -980,23 +1008,44 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
     */
 
   if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
-      ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= I8X8_PRED) ||
-       (mbmi->ref_frame != INTRA_FRAME && !(mbmi->mode == SPLITMV &&
-                           mbmi->partitioning == PARTITIONING_4X4)))) {
+      ((mbmi->ref_frame == INTRA_FRAME &&
+#if CONFIG_SB8X8
+        mbmi->mode != I4X4_PRED
+#else
+        mbmi->mode <= I8X8_PRED
+#endif
+        ) ||
+       (mbmi->ref_frame != INTRA_FRAME &&
+#if CONFIG_SB8X8
+        mbmi->mode != SPLITMV
+#else
+        !(mbmi->mode == SPLITMV && mbmi->partitioning == PARTITIONING_4X4)
+#endif
+        ))) {
+#if CONFIG_SB8X8
+    const int allow_16x16 = mbmi->sb_type >= BLOCK_SIZE_MB16X16;
+#else
     const int allow_16x16 = mbmi->mode != I8X8_PRED && mbmi->mode != SPLITMV;
+#endif
     const int allow_32x32 = mbmi->sb_type >= BLOCK_SIZE_SB32X32;
     mbmi->txfm_size = select_txfm_size(cm, r, allow_16x16, allow_32x32);
   } else if (mbmi->sb_type >= BLOCK_SIZE_SB32X32 &&
              cm->txfm_mode >= ALLOW_32X32) {
     mbmi->txfm_size = TX_32X32;
   } else if (cm->txfm_mode >= ALLOW_16X16 &&
+#if CONFIG_SB8X8
+             mbmi->sb_type >= BLOCK_SIZE_MB16X16 &&
+#endif
       ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
        (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
     mbmi->txfm_size = TX_16X16;
   } else if (cm->txfm_mode >= ALLOW_8X8 &&
       (!(mbmi->ref_frame == INTRA_FRAME && mbmi->mode == I4X4_PRED) &&
-       !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV &&
-         mbmi->partitioning == PARTITIONING_4X4))) {
+       !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV
+#if !CONFIG_SB8X8
+         && mbmi->partitioning == PARTITIONING_4X4
+#endif
+         ))) {
     mbmi->txfm_size = TX_8X8;
   } else {
     mbmi->txfm_size = TX_4X4;
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 01e9a2b89..5ef48d9e7 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -191,86 +191,57 @@ static void mb_init_dequantizer(VP9_COMMON *pc, MACROBLOCKD *xd) {
     xd->plane[i].dequant = pc->uv_dequant[xd->q_index];
 }
 
-static void decode_16x16(MACROBLOCKD *xd) {
-  const TX_TYPE tx_type = get_tx_type_16x16(xd, 0);
-
-  vp9_iht_add_16x16_c(tx_type, xd->plane[0].qcoeff, xd->plane[0].dst.buf,
-                      xd->plane[0].dst.stride, xd->plane[0].eobs[0]);
-
-  vp9_idct_add_8x8(xd->plane[1].qcoeff, xd->plane[1].dst.buf,
-                   xd->plane[1].dst.stride, xd->plane[1].eobs[0]);
-
-  vp9_idct_add_8x8(xd->plane[2].qcoeff, xd->plane[2].dst.buf,
-                   xd->plane[1].dst.stride, xd->plane[2].eobs[0]);
-}
-
+#if !CONFIG_SB8X8
 static void decode_8x8(MACROBLOCKD *xd) {
   const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
   // luma
   // if the first one is DCT_DCT assume all the rest are as well
   TX_TYPE tx_type = get_tx_type_8x8(xd, 0);
-  if (tx_type != DCT_DCT || mode == I8X8_PRED) {
-    int i;
-    for (i = 0; i < 4; i++) {
-      int ib = vp9_i8x8_block[i];
-      int idx = (ib & 0x02) ? (ib + 2) : ib;
-      int16_t *q  = BLOCK_OFFSET(xd->plane[0].qcoeff, idx, 16);
-      uint8_t* const dst =
+  int i;
+  assert(mode == I8X8_PRED);
+  for (i = 0; i < 4; i++) {
+    int ib = vp9_i8x8_block[i];
+    int idx = (ib & 0x02) ? (ib + 2) : ib;
+    int16_t *q  = BLOCK_OFFSET(xd->plane[0].qcoeff, idx, 16);
+    uint8_t* const dst =
           raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
                                     xd->plane[0].dst.buf,
                                     xd->plane[0].dst.stride);
-      int stride = xd->plane[0].dst.stride;
-      if (mode == I8X8_PRED) {
-        int i8x8mode = xd->mode_info_context->bmi[ib].as_mode.first;
-        vp9_intra8x8_predict(xd, ib, i8x8mode, dst, stride);
-      }
-      tx_type = get_tx_type_8x8(xd, ib);
-      vp9_iht_add_8x8_c(tx_type, q, dst, stride, xd->plane[0].eobs[idx]);
+    int stride = xd->plane[0].dst.stride;
+    if (mode == I8X8_PRED) {
+      int i8x8mode = xd->mode_info_context->bmi[ib].as_mode.first;
+      vp9_intra8x8_predict(xd, ib, i8x8mode, dst, stride);
     }
-  } else {
-    vp9_idct_add_y_block_8x8(xd->plane[0].qcoeff, xd->plane[0].dst.buf,
-                             xd->plane[0].dst.stride, xd);
+    tx_type = get_tx_type_8x8(xd, ib);
+    vp9_iht_add_8x8_c(tx_type, q, dst, stride, xd->plane[0].eobs[idx]);
   }
 
   // chroma
-  if (mode == I8X8_PRED) {
-    int i;
-    for (i = 0; i < 4; i++) {
-      int ib = vp9_i8x8_block[i];
-      int i8x8mode = xd->mode_info_context->bmi[ib].as_mode.first;
-      uint8_t* dst;
-
-      dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 1, i,
-                                      xd->plane[1].dst.buf,
-                                      xd->plane[1].dst.stride);
-      vp9_intra_uv4x4_predict(xd, 16 + i, i8x8mode,
-                              dst, xd->plane[1].dst.stride);
-      xd->itxm_add(BLOCK_OFFSET(xd->plane[1].qcoeff, i, 16),
-                   dst, xd->plane[1].dst.stride,
-                   xd->plane[1].eobs[i]);
-
-      dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 2, i,
-                                      xd->plane[2].dst.buf,
-                                      xd->plane[1].dst.stride);
-      vp9_intra_uv4x4_predict(xd, 20 + i, i8x8mode,
-                              dst, xd->plane[1].dst.stride);
-      xd->itxm_add(BLOCK_OFFSET(xd->plane[2].qcoeff, i, 16),
-                   dst, xd->plane[1].dst.stride,
-                   xd->plane[2].eobs[i]);
-    }
-  } else if (mode == SPLITMV) {
-    xd->itxm_add_uv_block(xd->plane[1].qcoeff, xd->plane[1].dst.buf,
-        xd->plane[1].dst.stride, xd->plane[1].eobs);
-    xd->itxm_add_uv_block(xd->plane[2].qcoeff, xd->plane[2].dst.buf,
-        xd->plane[1].dst.stride, xd->plane[2].eobs);
-  } else {
-    vp9_idct_add_8x8(xd->plane[1].qcoeff, xd->plane[1].dst.buf,
-                     xd->plane[1].dst.stride, xd->plane[1].eobs[0]);
+  for (i = 0; i < 4; i++) {
+    int ib = vp9_i8x8_block[i];
+    int i8x8mode = xd->mode_info_context->bmi[ib].as_mode.first;
+    uint8_t* dst;
 
-    vp9_idct_add_8x8(xd->plane[2].qcoeff, xd->plane[2].dst.buf,
-                     xd->plane[1].dst.stride, xd->plane[2].eobs[0]);
+    dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 1, i,
+                                    xd->plane[1].dst.buf,
+                                    xd->plane[1].dst.stride);
+    vp9_intra_uv4x4_predict(xd, 16 + i, i8x8mode,
+                            dst, xd->plane[1].dst.stride);
+    xd->itxm_add(BLOCK_OFFSET(xd->plane[1].qcoeff, i, 16),
+                 dst, xd->plane[1].dst.stride,
+                 xd->plane[1].eobs[i]);
+
+    dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 2, i,
+                                    xd->plane[2].dst.buf,
+                                    xd->plane[1].dst.stride);
+    vp9_intra_uv4x4_predict(xd, 20 + i, i8x8mode,
+                            dst, xd->plane[1].dst.stride);
+    xd->itxm_add(BLOCK_OFFSET(xd->plane[2].qcoeff, i, 16),
+                 dst, xd->plane[1].dst.stride,
+                 xd->plane[2].eobs[i]);
   }
 }
+#endif
 
 static INLINE void dequant_add_y(MACROBLOCKD *xd, TX_TYPE tx_type, int idx) {
   struct macroblockd_plane *const y = &xd->plane[0];
@@ -286,76 +257,46 @@ static INLINE void dequant_add_y(MACROBLOCKD *xd, TX_TYPE tx_type, int idx) {
   }
 }
 
+#if !CONFIG_SB8X8
 static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd, vp9_reader *r) {
   TX_TYPE tx_type;
   int i = 0;
   const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
-  if (mode == I8X8_PRED) {
-    for (i = 0; i < 4; i++) {
-      int ib = vp9_i8x8_block[i];
-      const int iblock[4] = {0, 1, 4, 5};
-      int j;
-      uint8_t* dst;
-      int i8x8mode = xd->mode_info_context->bmi[ib].as_mode.first;
+  assert(mode == I8X8_PRED);
+  for (i = 0; i < 4; i++) {
+    int ib = vp9_i8x8_block[i];
+    const int iblock[4] = {0, 1, 4, 5};
+    int j;
+    uint8_t* dst;
+    int i8x8mode = xd->mode_info_context->bmi[ib].as_mode.first;
 
-      dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                      xd->plane[0].dst.buf,
-                                      xd->plane[0].dst.stride);
-      vp9_intra8x8_predict(xd, ib, i8x8mode, dst, xd->plane[0].dst.stride);
-      for (j = 0; j < 4; j++) {
-        tx_type = get_tx_type_4x4(xd, ib + iblock[j]);
-        dequant_add_y(xd, tx_type, ib + iblock[j]);
-      }
-      dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 1, i,
-                                      xd->plane[1].dst.buf,
-                                      xd->plane[1].dst.stride);
-      vp9_intra_uv4x4_predict(xd, 16 + i, i8x8mode,
-                              dst, xd->plane[1].dst.stride);
-      xd->itxm_add(BLOCK_OFFSET(xd->plane[1].qcoeff, i, 16),
-                   dst, xd->plane[1].dst.stride,
-                   xd->plane[1].eobs[i]);
-      dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 2, i,
-                                      xd->plane[2].dst.buf,
-                                      xd->plane[2].dst.stride);
-      vp9_intra_uv4x4_predict(xd, 20 + i, i8x8mode,
-                              dst, xd->plane[1].dst.stride);
-      xd->itxm_add(BLOCK_OFFSET(xd->plane[2].qcoeff, i, 16),
-                   dst, xd->plane[1].dst.stride,
-                   xd->plane[2].eobs[i]);
-    }
-  } else if (mode == SPLITMV || get_tx_type_4x4(xd, 0) == DCT_DCT) {
-    xd->itxm_add_y_block(xd->plane[0].qcoeff, xd->plane[0].dst.buf,
-        xd->plane[0].dst.stride, xd);
-    xd->itxm_add_uv_block(xd->plane[1].qcoeff, xd->plane[1].dst.buf,
-        xd->plane[1].dst.stride, xd->plane[1].eobs);
-    xd->itxm_add_uv_block(xd->plane[2].qcoeff, xd->plane[2].dst.buf,
-        xd->plane[1].dst.stride, xd->plane[2].eobs);
-  } else {
-    for (i = 0; i < 16; i++) {
-      tx_type = get_tx_type_4x4(xd, i);
-      dequant_add_y(xd, tx_type, i);
+    dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
+                                    xd->plane[0].dst.buf,
+                                    xd->plane[0].dst.stride);
+    vp9_intra8x8_predict(xd, ib, i8x8mode, dst, xd->plane[0].dst.stride);
+    for (j = 0; j < 4; j++) {
+      tx_type = get_tx_type_4x4(xd, ib + iblock[j]);
+      dequant_add_y(xd, tx_type, ib + iblock[j]);
     }
-    xd->itxm_add_uv_block(xd->plane[1].qcoeff, xd->plane[1].dst.buf,
-                          xd->plane[1].dst.stride, xd->plane[1].eobs);
-    xd->itxm_add_uv_block(xd->plane[2].qcoeff, xd->plane[2].dst.buf,
-                          xd->plane[1].dst.stride, xd->plane[2].eobs);
+    dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 1, i,
+                                    xd->plane[1].dst.buf,
+                                    xd->plane[1].dst.stride);
+    vp9_intra_uv4x4_predict(xd, 16 + i, i8x8mode,
+                            dst, xd->plane[1].dst.stride);
+    xd->itxm_add(BLOCK_OFFSET(xd->plane[1].qcoeff, i, 16),
+                 dst, xd->plane[1].dst.stride,
+                 xd->plane[1].eobs[i]);
+    dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 2, i,
+                                    xd->plane[2].dst.buf,
+                                    xd->plane[2].dst.stride);
+    vp9_intra_uv4x4_predict(xd, 20 + i, i8x8mode,
+                            dst, xd->plane[1].dst.stride);
+    xd->itxm_add(BLOCK_OFFSET(xd->plane[2].qcoeff, i, 16),
+                 dst, xd->plane[1].dst.stride,
+                 xd->plane[2].eobs[i]);
   }
 }
-
-static int txfrm_block_to_raster_block(MACROBLOCKD *xd,
-                                       BLOCK_SIZE_TYPE bsize,
-                                       int plane, int block,
-                                       int ss_txfrm_size) {
-  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
-  const int txwl = ss_txfrm_size / 2;
-  const int tx_cols_lg2 = bwl - txwl;
-  const int tx_cols = 1 << tx_cols_lg2;
-  const int raster_mb = block >> ss_txfrm_size;
-  const int x = (raster_mb & (tx_cols - 1)) << (txwl);
-  const int y = raster_mb >> tx_cols_lg2 << (txwl);
-  return x + (y << bwl);
-}
-
+#endif
 
 static void decode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
                          int ss_txfrm_size, void *arg) {
@@ -428,6 +369,41 @@ static void decode_atom_intra(VP9D_COMP *pbi, MACROBLOCKD *xd,
   foreach_transformed_block_uv(xd, bsize, decode_block, xd);
 }
 
+static void decode_atom(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                        int mi_row, int mi_col,
+                        vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+
+  if (pbi->common.frame_type != KEY_FRAME)
+    vp9_setup_interp_filters(xd, mbmi->interp_filter, &pbi->common);
+
+  // prediction
+  if (mbmi->ref_frame == INTRA_FRAME)
+    vp9_build_intra_predictors_sbuv_s(xd, bsize);
+  else
+    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+
+  if (mbmi->mb_skip_coeff) {
+    vp9_reset_sb_tokens_context(xd, bsize);
+  } else {
+    // re-initialize macroblock dequantizer before detokenization
+    if (xd->segmentation_enabled)
+      mb_init_dequantizer(&pbi->common, xd);
+
+    if (!vp9_reader_has_error(r)) {
+#if CONFIG_NEWBINTRAMODES
+    if (mbmi->mode != I4X4_PRED)
+#endif
+      vp9_decode_tokens(pbi, xd, r, bsize);
+    }
+  }
+
+  if (mbmi->ref_frame == INTRA_FRAME)
+    decode_atom_intra(pbi, xd, r, bsize);
+  else
+    foreach_transformed_block(xd, bsize, decode_block, xd);
+}
+
 static void decode_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, int mi_row, int mi_col,
                       vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
   const int bwl = mi_width_log2(bsize), bhl = mi_height_log2(bsize);
@@ -473,39 +449,17 @@ static void decode_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, int mi_row, int mi_col,
   }
 }
 
-// TODO(jingning): Need to merge SB and MB decoding. The MB decoding currently
-// couples special handles on I8x8, B_PRED, and splitmv modes.
+#if !CONFIG_SB8X8
+// TODO(jingning): This only performs the I8X8_PRED decoding process, which
+// will be automatically covered by decode_sb when SB8X8 is on.
 static void decode_mb(VP9D_COMP *pbi, MACROBLOCKD *xd,
                      int mi_row, int mi_col,
                      vp9_reader *r) {
-  int eobtotal = 0;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  const MB_PREDICTION_MODE mode = mbmi->mode;
   const int tx_size = mbmi->txfm_size;
 
   assert(mbmi->sb_type == BLOCK_SIZE_MB16X16);
 
-  //mode = xd->mode_info_context->mbmi.mode;
-  if (pbi->common.frame_type != KEY_FRAME)
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, &pbi->common);
-
-  // do prediction
-  if (mbmi->ref_frame == INTRA_FRAME) {
-    if (mode != I8X8_PRED) {
-      vp9_build_intra_predictors_sbuv_s(xd, BLOCK_SIZE_MB16X16);
-      if (mode != I4X4_PRED)
-        vp9_build_intra_predictors_sby_s(xd, BLOCK_SIZE_MB16X16);
-    }
-  } else {
-#if 0  // def DEC_DEBUG
-  if (dec_debug)
-    printf("Decoding mb:  %d %d interp %d\n",
-           xd->mode_info_context->mbmi.mode, tx_size,
-           xd->mode_info_context->mbmi.interp_filter);
-#endif
-    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_SIZE_MB16X16);
-  }
-
   if (mbmi->mb_skip_coeff) {
     vp9_reset_sb_tokens_context(xd, BLOCK_SIZE_MB16X16);
   } else {
@@ -513,73 +467,16 @@ static void decode_mb(VP9D_COMP *pbi, MACROBLOCKD *xd,
     if (xd->segmentation_enabled)
       mb_init_dequantizer(&pbi->common, xd);
 
-    if (!vp9_reader_has_error(r)) {
-#if CONFIG_NEWBINTRAMODES
-    if (mode != I4X4_PRED)
-#endif
-      eobtotal = vp9_decode_tokens(pbi, xd, r, BLOCK_SIZE_MB16X16);
-    }
-  }
-
-  if (eobtotal == 0 &&
-      mode != I4X4_PRED && mode != I8X8_PRED && mode != SPLITMV &&
-      !vp9_reader_has_error(r)) {
-    mbmi->mb_skip_coeff = 1;
-  } else {
-#if 0  // def DEC_DEBUG
-  if (dec_debug)
-    printf("Decoding mb:  %d %d\n", xd->mode_info_context->mbmi.mode, tx_size);
-#endif
-
-    if (tx_size == TX_16X16) {
-      decode_16x16(xd);
-    } else if (tx_size == TX_8X8) {
-      decode_8x8(xd);
-    } else {
-      if (mbmi->mode == I4X4_PRED)
-        // TODO(jingning): we need to move this to decode_atom later and
-        // deprecate decode_mb, when SB8X8 is on.
-        decode_atom_intra(pbi, xd, r, BLOCK_SIZE_MB16X16);
-      else
-        decode_4x4(pbi, xd, r);
-    }
+    if (!vp9_reader_has_error(r))
+      vp9_decode_tokens(pbi, xd, r, BLOCK_SIZE_MB16X16);
   }
 
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int i, j;
-    printf("\n");
-    printf("predictor y\n");
-    for (i = 0; i < 16; i++) {
-      for (j = 0; j < 16; j++)
-        printf("%3d ", xd->predictor[i * 16 + j]);
-      printf("\n");
-    }
-    printf("\n");
-    printf("final y\n");
-    for (i = 0; i < 16; i++) {
-      for (j = 0; j < 16; j++)
-        printf("%3d ", xd->plane[0].dst.buf[i * xd->plane[0].dst.stride + j]);
-      printf("\n");
-    }
-    printf("\n");
-    printf("final u\n");
-    for (i = 0; i < 8; i++) {
-      for (j = 0; j < 8; j++)
-        printf("%3d ", xd->plane[1].dst.buf[i * xd->plane[1].dst.stride + j]);
-      printf("\n");
-    }
-    printf("\n");
-    printf("final v\n");
-    for (i = 0; i < 8; i++) {
-      for (j = 0; j < 8; j++)
-        printf("%3d ", xd->plane[2].dst.buf[i * xd->plane[1].dst.stride + j]);
-      printf("\n");
-    }
-    fflush(stdout);
-  }
-#endif
+  if (tx_size == TX_8X8)
+    decode_8x8(xd);
+  else
+    decode_4x4(pbi, xd, r);
 }
+#endif
 
 static int get_delta_q(vp9_reader *r, int *dq) {
   const int old_value = *dq;
@@ -666,12 +563,31 @@ static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col,
   vp9_decode_mb_mode_mv(pbi, xd, mi_row, mi_col, r);
   set_refs(pbi, mi_row, mi_col);
 
+#if CONFIG_SB8X8
+  if (bsize >= BLOCK_SIZE_SB8X8)
+    decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
+  else
+    decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8);
+#else
   // TODO(jingning): merge decode_sb_ and decode_mb_
   if (bsize > BLOCK_SIZE_MB16X16) {
     decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
   } else {
-    decode_mb(pbi, xd, mi_row, mi_col, r);
+    // TODO(jingning): In transition: the functionality of decode_mb is being
+    // split between decode_sb and decode_atom. Will remove decode_mb and clean
+    // this up when SB8X8 is on.
+    if (xd->mode_info_context->mbmi.mode == I4X4_PRED ||
+        (xd->mode_info_context->mbmi.mode == SPLITMV &&
+         xd->mode_info_context->mbmi.partitioning == PARTITIONING_4X4))
+      decode_atom(pbi, xd, mi_row, mi_col, r, bsize);
+    else if (xd->mode_info_context->mbmi.mode != I8X8_PRED)
+      decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
+    else
+      // TODO(jingning): decode_mb still carries the decoding process of
+      // I8X8_PRED. This will be covered by decode_sb when SB8X8 is on.
+      decode_mb(pbi, xd, mi_row, mi_col, r);
   }
+#endif
 
   xd->corrupted |= vp9_reader_has_error(r);
 }
@@ -1090,9 +1006,13 @@ static void update_frame_context(FRAME_CONTEXT *fc) {
   vp9_copy(fc->pre_sb_ymode_prob, fc->sb_ymode_prob);
   vp9_copy(fc->pre_uv_mode_prob, fc->uv_mode_prob);
   vp9_copy(fc->pre_bmode_prob, fc->bmode_prob);
+#if !CONFIG_SB8X8
   vp9_copy(fc->pre_i8x8_mode_prob, fc->i8x8_mode_prob);
+#endif
   vp9_copy(fc->pre_sub_mv_ref_prob, fc->sub_mv_ref_prob);
+#if !CONFIG_SB8X8
   vp9_copy(fc->pre_mbsplit_prob, fc->mbsplit_prob);
+#endif
   vp9_copy(fc->pre_partition_prob, fc->partition_prob);
   fc->pre_nmvc = fc->nmvc;
 
@@ -1105,9 +1025,13 @@ static void update_frame_context(FRAME_CONTEXT *fc) {
   vp9_zero(fc->sb_ymode_counts);
   vp9_zero(fc->uv_mode_counts);
   vp9_zero(fc->bmode_counts);
+#if !CONFIG_SB8X8
   vp9_zero(fc->i8x8_mode_counts);
+#endif
   vp9_zero(fc->sub_mv_ref_counts);
+#if !CONFIG_SB8X8
   vp9_zero(fc->mbsplit_counts);
+#endif
   vp9_zero(fc->NMVcount);
   vp9_zero(fc->mv_ref_ct);
   vp9_zero(fc->partition_counts);
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 3ab67cd8c..3c0bab2ce 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -281,9 +281,11 @@ static void sb_kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
   write_token(bc, vp9_uv_mode_tree, p, vp9_sb_kf_ymode_encodings + m);
 }
 
+#if !CONFIG_SB8X8
 static void write_i8x8_mode(vp9_writer *bc, int m, const vp9_prob *p) {
   write_token(bc, vp9_i8x8_mode_tree, p, vp9_i8x8_mode_encodings + m);
 }
+#endif
 
 static void write_uv_mode(vp9_writer *bc, int m, const vp9_prob *p) {
   write_token(bc, vp9_uv_mode_tree, p, vp9_uv_mode_encodings + m);
@@ -302,9 +304,11 @@ static void write_kf_bmode(vp9_writer *bc, int m, const vp9_prob *p) {
   write_token(bc, vp9_kf_bmode_tree, p, vp9_kf_bmode_encodings + m);
 }
 
+#if !CONFIG_SB8X8
 static void write_split(vp9_writer *bc, int x, const vp9_prob *p) {
   write_token(bc, vp9_mbsplit_tree, p, vp9_mbsplit_encodings + x);
 }
+#endif
 
 static int prob_update_savings(const unsigned int *ct,
                                const vp9_prob oldp, const vp9_prob newp,
@@ -728,8 +732,9 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
       do {
         write_bmode(bc, m->bmi[j].as_mode.first,
                     pc->fc.bmode_prob);
-      } while (++j < 16);
+      } while (++j < (16 >> (CONFIG_SB8X8 * 2)));
     }
+#if !CONFIG_SB8X8
     if (mode == I8X8_PRED) {
       write_i8x8_mode(bc, m->bmi[0].as_mode.first,
                       pc->fc.i8x8_mode_prob);
@@ -739,7 +744,9 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
                       pc->fc.i8x8_mode_prob);
       write_i8x8_mode(bc, m->bmi[10].as_mode.first,
                       pc->fc.i8x8_mode_prob);
-    } else {
+    } else
+#endif
+    {
       write_uv_mode(bc, mi->uv_mode,
                     pc->fc.uv_mode_prob[mode]);
     }
@@ -824,19 +831,26 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
         ++count_mb_seg[mi->partitioning];
 #endif
 
+#if !CONFIG_SB8X8
         write_split(bc, mi->partitioning, cpi->common.fc.mbsplit_prob);
         cpi->mbsplit_count[mi->partitioning]++;
+#endif
 
         do {
           B_PREDICTION_MODE blockmode;
           int_mv blockmv;
+#if !CONFIG_SB8X8
           const int *const  L = vp9_mbsplits[mi->partitioning];
+#endif
           int k = -1;  /* first block in subset j */
           int mv_contz;
           int_mv leftmv, abovemv;
 
           blockmode = cpi->mb.partition_info->bmi[j].mode;
           blockmv = cpi->mb.partition_info->bmi[j].mv;
+#if CONFIG_SB8X8
+          k = j;
+#else
 #if CONFIG_DEBUG
           while (j != L[++k])
             if (k >= 16)
@@ -844,6 +858,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
 #else
           while (j != L[++k]);
 #endif
+#endif
           leftmv.as_int = left_block_mv(xd, m, k);
           abovemv.as_int = above_block_mv(m, k, mis);
           mv_contz = vp9_mv_cont(&leftmv, &abovemv);
@@ -875,6 +890,22 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
     }
   }
 
+#if CONFIG_SB8X8
+  if (((rf == INTRA_FRAME && mode != I4X4_PRED) ||
+       (rf != INTRA_FRAME && mode != SPLITMV)) &&
+      pc->txfm_mode == TX_MODE_SELECT &&
+      !(skip_coeff || vp9_segfeature_active(xd, segment_id,
+                                            SEG_LVL_SKIP))) {
+    TX_SIZE sz = mi->txfm_size;
+    // FIXME(rbultje) code ternary symbol once all experiments are merged
+    vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);  // 0: 4x4, 1: larger
+    if (mi->sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) {
+      vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);  // 0: 8x8, 1: larger
+      if (mi->sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
+        vp9_write(bc, sz != TX_16X16, pc->prob_tx[2]);  // 0: 16x16, 1: larger
+    }
+  }
+#else
   if (((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
        (rf != INTRA_FRAME && !(mode == SPLITMV &&
                                mi->partitioning == PARTITIONING_4X4))) &&
@@ -890,6 +921,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
         vp9_write(bc, sz != TX_16X16, pc->prob_tx[2]);
     }
   }
+#endif
 }
 
 static void write_mb_modes_kf(const VP9_COMP *cpi,
@@ -930,8 +962,9 @@ static void write_mb_modes_kf(const VP9_COMP *cpi,
 #endif
 
       write_kf_bmode(bc, bm, c->kf_bmode_prob[a][l]);
-    } while (++i < 16);
+    } while (++i < (16 >> (CONFIG_SB8X8 * 2)));
   }
+#if !CONFIG_SB8X8
   if (ym == I8X8_PRED) {
     write_i8x8_mode(bc, m->bmi[0].as_mode.first, c->fc.i8x8_mode_prob);
     // printf("    mode: %d\n", m->bmi[0].as_mode.first); fflush(stdout);
@@ -942,8 +975,22 @@ static void write_mb_modes_kf(const VP9_COMP *cpi,
     write_i8x8_mode(bc, m->bmi[10].as_mode.first, c->fc.i8x8_mode_prob);
     // printf("    mode: %d\n", m->bmi[10].as_mode.first); fflush(stdout);
   } else
+#endif
     write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
 
+#if CONFIG_SB8X8
+  if (ym != I4X4_PRED && c->txfm_mode == TX_MODE_SELECT &&
+      !(skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))) {
+    TX_SIZE sz = m->mbmi.txfm_size;
+    // FIXME(rbultje) code ternary symbol once all experiments are merged
+    vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);  // 0: 4x4, 1: larger
+    if (m->mbmi.sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) {
+      vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);  // 0: 8x8, 1: larger
+      if (m->mbmi.sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
+        vp9_write(bc, sz != TX_16X16, c->prob_tx[2]);  // 0: 16x16, 1: larger
+    }
+  }
+#else
   if (ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
       !(skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))) {
     TX_SIZE sz = m->mbmi.txfm_size;
@@ -955,6 +1002,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi,
         vp9_write(bc, sz != TX_16X16, c->prob_tx[2]);
     }
   }
+#endif
 }
 
 
@@ -1719,16 +1767,91 @@ static void segment_reference_frames(VP9_COMP *cpi) {
   }
 }
 
-void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
-                        unsigned long *size) {
+static void encode_segmentation(VP9_COMP *cpi, vp9_writer *w) {
   int i, j;
+  VP9_COMMON *const pc = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+
+  vp9_write_bit(w, xd->segmentation_enabled);  // 0 => nothing else is coded
+  if (!xd->segmentation_enabled)
+    return;
+
+  // Segmentation map: update flag, then tree and prediction probabilities
+  vp9_write_bit(w, xd->update_mb_segmentation_map);
+#if CONFIG_IMPLICIT_SEGMENTATION
+  vp9_write_bit(w, xd->allow_implicit_segment_update);
+#endif
+  if (xd->update_mb_segmentation_map) {
+    // Select the coding strategy (temporal or spatial)
+    vp9_choose_segmap_coding_method(cpi);
+    // Write out probabilities used to decode unpredicted macro-block segments
+    for (i = 0; i < MB_SEG_TREE_PROBS; i++) {
+      const int prob = xd->mb_segment_tree_probs[i];
+      if (prob != MAX_PROB) {  // 1 bit, then the explicit probability
+        vp9_write_bit(w, 1);
+        vp9_write_prob(w, prob);
+      } else {  // a lone 0 bit stands for MAX_PROB
+        vp9_write_bit(w, 0);
+      }
+    }
+
+    // Write out the chosen coding method (the temporal_update flag).
+    vp9_write_bit(w, pc->temporal_update);
+    if (pc->temporal_update) {
+      for (i = 0; i < PREDICTION_PROBS; i++) {
+        const int prob = pc->segment_pred_probs[i];
+        if (prob != MAX_PROB) {  // 1 bit, then the explicit probability
+          vp9_write_bit(w, 1);
+          vp9_write_prob(w, prob);
+        } else {  // a lone 0 bit stands for MAX_PROB
+          vp9_write_bit(w, 0);
+        }
+      }
+    }
+  }
+
+  // Segmentation data: update flag, then per-segment feature values
+  vp9_write_bit(w, xd->update_mb_segmentation_data);
+  // segment_reference_frames(cpi);
+  if (xd->update_mb_segmentation_data) {
+    vp9_write_bit(w, xd->mb_segment_abs_delta);
+
+    for (i = 0; i < MAX_MB_SEGMENTS; i++) {  // each segment id
+      for (j = 0; j < SEG_LVL_MAX; j++) {  // each codable feature
+        const int data = vp9_get_segdata(xd, i, j);
+        const int data_max = vp9_seg_feature_data_max(j);
+
+        if (vp9_segfeature_active(xd, i, j)) {
+          vp9_write_bit(w, 1);  // feature enabled for this segment
+
+          if (vp9_is_segfeature_signed(j)) {
+            if (data < 0) {  // magnitude first, then a sign bit (1 = negative)
+              vp9_encode_unsigned_max(w, -data, data_max);
+              vp9_write_bit(w, 1);
+            } else {
+              vp9_encode_unsigned_max(w, data, data_max);
+              vp9_write_bit(w, 0);
+            }
+          } else {
+            vp9_encode_unsigned_max(w, data, data_max);  // no sign bit needed
+          }
+        } else {
+          vp9_write_bit(w, 0);  // feature disabled for this segment
+        }
+      }
+    }
+  }
+}
+
+void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
+  int i;
   VP9_HEADER oh;
   VP9_COMMON *const pc = &cpi->common;
   vp9_writer header_bc, residual_bc;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   int extra_bytes_packed = 0;
 
-  unsigned char *cx_data = dest;
+  uint8_t *cx_data = dest;
 
   oh.show_frame = (int) pc->show_frame;
   oh.type = (int)pc->frame_type;
@@ -1960,87 +2083,7 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
     active_section = 7;
 #endif
 
-  // Signal whether or not Segmentation is enabled
-  vp9_write_bit(&header_bc, (xd->segmentation_enabled) ? 1 : 0);
-
-  // Indicate which features are enabled
-  if (xd->segmentation_enabled) {
-    // Indicate whether or not the segmentation map is being updated.
-    vp9_write_bit(&header_bc, (xd->update_mb_segmentation_map) ? 1 : 0);
-#if CONFIG_IMPLICIT_SEGMENTATION
-    vp9_write_bit(&header_bc, (xd->allow_implicit_segment_update) ? 1 : 0);
-#endif
-
-    // If it is, then indicate the method that will be used.
-    if (xd->update_mb_segmentation_map) {
-      // Select the coding strategy (temporal or spatial)
-      vp9_choose_segmap_coding_method(cpi);
-      // Send the tree probabilities used to decode unpredicted
-      // macro-block segments
-      for (i = 0; i < MB_SEG_TREE_PROBS; i++) {
-        const int prob = xd->mb_segment_tree_probs[i];
-        if (prob != 255) {
-          vp9_write_bit(&header_bc, 1);
-          vp9_write_prob(&header_bc, prob);
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
-
-      // Write out the chosen coding method.
-      vp9_write_bit(&header_bc, (pc->temporal_update) ? 1 : 0);
-      if (pc->temporal_update) {
-        for (i = 0; i < PREDICTION_PROBS; i++) {
-          const int prob = pc->segment_pred_probs[i];
-          if (prob != 255) {
-            vp9_write_bit(&header_bc, 1);
-            vp9_write_prob(&header_bc, prob);
-          } else {
-            vp9_write_bit(&header_bc, 0);
-          }
-        }
-      }
-    }
-
-    vp9_write_bit(&header_bc, (xd->update_mb_segmentation_data) ? 1 : 0);
-
-    // segment_reference_frames(cpi);
-
-    if (xd->update_mb_segmentation_data) {
-      vp9_write_bit(&header_bc, (xd->mb_segment_abs_delta) ? 1 : 0);
-
-      // For each segments id...
-      for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-        // For each segmentation codable feature...
-        for (j = 0; j < SEG_LVL_MAX; j++) {
-          const int8_t data = vp9_get_segdata(xd, i, j);
-          const int data_max = vp9_seg_feature_data_max(j);
-
-          // If the feature is enabled...
-          if (vp9_segfeature_active(xd, i, j)) {
-            vp9_write_bit(&header_bc, 1);
-
-            // Is the segment data signed..
-            if (vp9_is_segfeature_signed(j)) {
-              // Encode the relevant feature data
-              if (data < 0) {
-                vp9_encode_unsigned_max(&header_bc, -data, data_max);
-                vp9_write_bit(&header_bc, 1);
-              } else {
-                vp9_encode_unsigned_max(&header_bc, data, data_max);
-                vp9_write_bit(&header_bc, 0);
-              }
-            } else {
-              // Unsigned data element so no sign bit needed
-              vp9_encode_unsigned_max(&header_bc, data, data_max);
-            }
-          } else {
-            vp9_write_bit(&header_bc, 0);
-          }
-        }
-      }
-    }
-  }
+  encode_segmentation(cpi, &header_bc);
 
   // Encode the common prediction model status flag probability updates for
   // the reference frame
@@ -2153,15 +2196,19 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
   vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);
   vp9_copy(cpi->common.fc.pre_bmode_prob, cpi->common.fc.bmode_prob);
   vp9_copy(cpi->common.fc.pre_sub_mv_ref_prob, cpi->common.fc.sub_mv_ref_prob);
+#if !CONFIG_SB8X8
   vp9_copy(cpi->common.fc.pre_mbsplit_prob, cpi->common.fc.mbsplit_prob);
   vp9_copy(cpi->common.fc.pre_i8x8_mode_prob, cpi->common.fc.i8x8_mode_prob);
+#endif
   vp9_copy(cpi->common.fc.pre_partition_prob, cpi->common.fc.partition_prob);
   cpi->common.fc.pre_nmvc = cpi->common.fc.nmvc;
 #if CONFIG_COMP_INTERINTRA_PRED
   cpi->common.fc.pre_interintra_prob = cpi->common.fc.interintra_prob;
 #endif
   vp9_zero(cpi->sub_mv_ref_count);
+#if !CONFIG_SB8X8
   vp9_zero(cpi->mbsplit_count);
+#endif
   vp9_zero(cpi->common.fc.mv_ref_ct);
 
   update_coef_probs(cpi, &header_bc);
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 2c06457e7..40ad680b0 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -117,7 +117,9 @@ struct macroblock {
   int mbmode_cost[2][MB_MODE_COUNT];
   int intra_uv_mode_cost[2][MB_MODE_COUNT];
   int bmode_costs[VP9_KF_BINTRAMODES][VP9_KF_BINTRAMODES][VP9_KF_BINTRAMODES];
+#if !CONFIG_SB8X8
   int i8x8_mode_costs[MB_MODE_COUNT];
+#endif
   int inter_bmode_costs[B_MODE_COUNT];
   int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
                              [VP9_SWITCHABLE_FILTERS];
@@ -141,6 +143,11 @@ struct macroblock {
 
   // Structure to hold context for each of the 4 MBs within a SB:
   // when encoded as 4 independent MBs:
+#if CONFIG_SB8X8
+  PICK_MODE_CONTEXT sb8_context[4][4][4];
+  PICK_MODE_CONTEXT sb8x16_context[4][4][2];
+  PICK_MODE_CONTEXT sb16x8_context[4][4][2];
+#endif
   PICK_MODE_CONTEXT mb_context[4][4];
   PICK_MODE_CONTEXT sb32x16_context[4][2];
   PICK_MODE_CONTEXT sb16x32_context[4][2];
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index d40c604a4..95bba21a9 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -47,8 +47,10 @@ int enc_debug = 0;
 
 void vp9_select_interp_filter_type(VP9_COMP *cpi);
 
+#if !CONFIG_SB8X8
 static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
                               int output_enabled, int mi_row, int mi_col);
+#endif
 
 static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
                               int output_enabled, int mi_row, int mi_col,
@@ -380,6 +382,8 @@ static void update_state(VP9_COMP *cpi,
     }
   }
   if (bsize < BLOCK_SIZE_SB32X32) {
+    if (bsize < BLOCK_SIZE_MB16X16)
+      ctx->txfm_rd_diff[ALLOW_16X16] = ctx->txfm_rd_diff[ALLOW_8X8];
     ctx->txfm_rd_diff[ALLOW_32X32] = ctx->txfm_rd_diff[ALLOW_16X16];
   }
 
@@ -387,8 +391,10 @@ static void update_state(VP9_COMP *cpi,
     vpx_memcpy(x->partition_info, &ctx->partition_info,
                sizeof(PARTITION_INFO));
 
-    mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
-    mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
+    mbmi->mv[0].as_int =
+        x->partition_info->bmi[15 >> (CONFIG_SB8X8 * 2)].mv.as_int;
+    mbmi->mv[1].as_int =
+        x->partition_info->bmi[15 >> (CONFIG_SB8X8 * 2)].second_mv.as_int;
 #if CONFIG_SB8X8
     vpx_memcpy(x->partition_info + mis, &ctx->partition_info,
                sizeof(PARTITION_INFO));
@@ -453,7 +459,9 @@ static void update_state(VP9_COMP *cpi,
       THR_D27_PRED /*D27_PRED*/,
       THR_D63_PRED /*D63_PRED*/,
       THR_TM /*TM_PRED*/,
+#if !CONFIG_SB8X8
       THR_I8X8_PRED /*I8X8_PRED*/,
+#endif
       THR_B_PRED /*I4X4_PRED*/,
     };
     cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++;
@@ -631,13 +639,11 @@ static void set_offsets(VP9_COMP *cpi,
 
   /* segment ID */
   if (xd->segmentation_enabled) {
-    if (xd->update_mb_segmentation_map) {
-      mbmi->segment_id = find_seg_id(cpi->segmentation_map, bsize,
-                                     mi_row, cm->mi_rows, mi_col, cm->mi_cols);
-    } else {
-      mbmi->segment_id = find_seg_id(cm->last_frame_seg_map, bsize,
-                                     mi_row, cm->mi_rows, mi_col, cm->mi_cols);
-    }
+    uint8_t *map = xd->update_mb_segmentation_map ? cpi->segmentation_map
+                                                  : cm->last_frame_seg_map;
+    mbmi->segment_id = find_seg_id(map, bsize, mi_row,
+                                   cm->mi_rows, mi_col, cm->mi_cols);
+
     assert(mbmi->segment_id <= (MAX_MB_SEGMENTS-1));
     vp9_mb_init_quantizer(cpi, x);
 
@@ -667,6 +673,7 @@ static void set_offsets(VP9_COMP *cpi,
   }
 }
 
+#if !CONFIG_SB8X8
 static int pick_mb_mode(VP9_COMP *cpi,
                         int mi_row,
                         int mi_col,
@@ -707,6 +714,7 @@ static int pick_mb_mode(VP9_COMP *cpi,
 
   return splitmodes_used;
 }
+#endif
 
 static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
                           TOKENEXTRA **tp, int *totalrate, int *totaldist,
@@ -790,11 +798,15 @@ static void set_block_index(MACROBLOCKD *xd, int idx,
                             BLOCK_SIZE_TYPE bsize) {
   if (bsize >= BLOCK_SIZE_SB32X32) {
     xd->sb_index = idx;
-  } else {
 #if CONFIG_SB8X8
-    assert(bsize >= BLOCK_SIZE_MB16X16);
-#endif
+  } else if (bsize >= BLOCK_SIZE_MB16X16) {
     xd->mb_index = idx;
+  } else {
+    xd->b_index = idx;
+#else
+  } else {
+    xd->mb_index = idx;
+#endif
   }
 }
 
@@ -817,6 +829,14 @@ static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
       return &x->sb16x32_context[xd->sb_index][xd->mb_index];
     case BLOCK_SIZE_MB16X16:
       return &x->mb_context[xd->sb_index][xd->mb_index];
+#if CONFIG_SB8X8
+    case BLOCK_SIZE_SB16X8:
+      return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_SIZE_SB8X16:
+      return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_SIZE_SB8X8:
+      return &x->sb8_context[xd->sb_index][xd->mb_index][xd->b_index];
+#endif
     default:
       assert(0);
       return NULL;
@@ -837,12 +857,15 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
     set_block_index(xd, sub_index, bsize);
   set_offsets(cpi, mi_row, mi_col, bsize);
   update_state(cpi, get_block_context(x, bsize), bsize, output_enabled);
+#if !CONFIG_SB8X8
   if (bsize == BLOCK_SIZE_MB16X16) {
     if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
       vp9_activity_masking(cpi, x);
 
     encode_macroblock(cpi, tp, output_enabled, mi_row, mi_col);
-  } else {
+  } else
+#endif
+  {
     encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
   }
 
@@ -857,22 +880,38 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
 static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
                       int mi_row, int mi_col, int output_enabled,
                       BLOCK_SIZE_TYPE level,
-                      BLOCK_SIZE_TYPE c1, BLOCK_SIZE_TYPE c2[4]) {
+                      BLOCK_SIZE_TYPE c1, BLOCK_SIZE_TYPE c2[4]
+#if CONFIG_SB8X8
+                      , BLOCK_SIZE_TYPE c3[4][4]
+#endif
+                      ) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int bsl = mi_width_log2(level), bs = 1 << (bsl - 1);
   const int bwl = mi_width_log2(c1), bhl = mi_height_log2(c1);
-  int pl;
+  int UNINITIALIZED_IS_SAFE(pl);
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  set_partition_seg_context(cpi, mi_row, mi_col);
-  pl = partition_plane_context(xd, level);
+#if CONFIG_SB8X8
+  if (level > BLOCK_SIZE_SB8X8) {
+#endif
+    set_partition_seg_context(cpi, mi_row, mi_col);
+    pl = partition_plane_context(xd, level);
+#if CONFIG_SB8X8
+  }
+#endif
 
   if (bsl == bwl && bsl == bhl) {
-    if (output_enabled && level > BLOCK_SIZE_MB16X16)
+    if (output_enabled &&
+#if CONFIG_SB8X8
+        level > BLOCK_SIZE_SB8X8
+#else
+        level > BLOCK_SIZE_MB16X16
+#endif
+        )
       cpi->partition_count[pl][PARTITION_NONE]++;
     encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
   } else if (bsl == bhl && bsl > bwl) {
@@ -892,9 +931,17 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
     assert(bwl < bsl && bhl < bsl);
     if (level == BLOCK_SIZE_SB64X64) {
       subsize = BLOCK_SIZE_SB32X32;
+#if CONFIG_SB8X8
+    } else if (level == BLOCK_SIZE_SB32X32) {
+      subsize = BLOCK_SIZE_MB16X16;
+    } else {
+      assert(level == BLOCK_SIZE_MB16X16);
+      subsize = BLOCK_SIZE_SB8X8;
+#else
     } else {
       assert(level == BLOCK_SIZE_SB32X32);
       subsize = BLOCK_SIZE_MB16X16;
+#endif
     }
 
     if (output_enabled)
@@ -906,12 +953,22 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
       set_block_index(xd, i, subsize);
       encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
                 output_enabled, subsize,
-                subsize == BLOCK_SIZE_MB16X16 ? c1 : c2[i], c2);
+#if CONFIG_SB8X8
+                c2 ? c2[i] : c1, c3 ? c3[i] : NULL, NULL);
+#else
+                c2 ? c2[i] : c1, NULL);
+#endif
     }
   }
 
+#if CONFIG_SB8X8
+  if (level > BLOCK_SIZE_SB8X8 &&
+      (level == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl))
+#else
   if (level > BLOCK_SIZE_MB16X16 &&
-      (level == BLOCK_SIZE_SB32X32 || bsl == bwl || bsl == bhl)) {
+      (level == BLOCK_SIZE_SB32X32 || bsl == bwl || bsl == bhl))
+#endif
+  {
     set_partition_seg_context(cpi, mi_row, mi_col);
     update_partition_context(xd, c1, level);
   }
@@ -934,7 +991,11 @@ static void encode_sb_row(VP9_COMP *cpi,
   for (mi_col = cm->cur_tile_mi_col_start;
        mi_col < cm->cur_tile_mi_col_end; mi_col += (4 << CONFIG_SB8X8)) {
     int i, p;
+#if CONFIG_SB8X8
+    BLOCK_SIZE_TYPE mb_partitioning[4][4];
+#endif
     BLOCK_SIZE_TYPE sb_partitioning[4];
+    BLOCK_SIZE_TYPE sb64_partitioning = BLOCK_SIZE_SB32X32;
     int sb64_rate = 0, sb64_dist = 0;
     int sb64_skip = 0;
     ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
@@ -951,6 +1012,9 @@ static void encode_sb_row(VP9_COMP *cpi,
     memcpy(&seg_a, cm->above_seg_context + (mi_col >> CONFIG_SB8X8),
            sizeof(seg_a));
     memcpy(&seg_l, cm->left_seg_context, sizeof(seg_l));
+
+    // FIXME(rbultje): this function should probably be rewritten to be
+    // recursive at some point in the future.
     for (i = 0; i < 4; i++) {
       const int x_idx = (i & 1) << (1 + CONFIG_SB8X8);
       const int y_idx = (i & 2) << CONFIG_SB8X8;
@@ -985,6 +1049,10 @@ static void encode_sb_row(VP9_COMP *cpi,
         const int x_idx_m = x_idx + ((j & 1) << CONFIG_SB8X8);
         const int y_idx_m = y_idx + ((j >> 1) << CONFIG_SB8X8);
         int r, d;
+#if CONFIG_SB8X8
+        int r2, d2, mb16_rate = 0, mb16_dist = 0, k;
+        ENTROPY_CONTEXT l3[4 * MAX_MB_PLANE], a3[4 * MAX_MB_PLANE];
+#endif
 
         if (mi_row + y_idx_m >= cm->mi_rows ||
             mi_col + x_idx_m >= cm->mi_cols) {
@@ -995,18 +1063,175 @@ static void encode_sb_row(VP9_COMP *cpi,
         // Index of the MB in the SB 0..3
         xd->mb_index = j;
 
+#if CONFIG_SB8X8
+        for (p = 0; p < MAX_MB_PLANE; p++) {
+          vpx_memcpy(l3 + 4 * p,
+                     cm->left_context[p] +
+                         (y_idx_m * 4 >> (CONFIG_SB8X8 +
+                                          xd->plane[p].subsampling_y)),
+                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
+          vpx_memcpy(a3 + 4 * p,
+                     cm->above_context[p] +
+                         ((mi_col + x_idx_m) * 4 >> (CONFIG_SB8X8 +
+                                                   xd->plane[p].subsampling_x)),
+                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
+        }
+
+        mb_partitioning[i][j] = BLOCK_SIZE_SB8X8;
+        for (k = 0; k < 4; k++) {
+          xd->b_index = k;
+
+          // try 8x8 coding; k is a raster index (bit 0 = column, bit 1 = row)
+          pick_sb_modes(cpi, mi_row + y_idx_m + (k >> 1),
+                        mi_col + x_idx_m + (k & 1),
+                        tp, &r, &d, BLOCK_SIZE_SB8X8,
+                        &x->sb8_context[xd->sb_index][xd->mb_index]
+                                       [xd->b_index]);
+          mb16_rate += r;
+          mb16_dist += d;
+          update_state(cpi, &x->sb8_context[xd->sb_index][xd->mb_index]
+                                           [xd->b_index],
+                       BLOCK_SIZE_SB8X8, 0);
+          // Dummy-encode at this 8x8 block's own position, not the MB origin.
+          encode_superblock(cpi, tp, 0, mi_row + y_idx_m + (k >> 1),
+                            mi_col + x_idx_m + (k & 1), BLOCK_SIZE_SB8X8);
+        }
+        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
+        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
+        mb16_rate += x->partition_cost[pl][PARTITION_SPLIT];
+        for (p = 0; p < MAX_MB_PLANE; p++) {
+          vpx_memcpy(cm->left_context[p] +
+                         (y_idx_m * 4 >> (CONFIG_SB8X8 +
+                                          xd->plane[p].subsampling_y)),
+                     l3 + 4 * p,
+                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
+          vpx_memcpy(cm->above_context[p] +
+                         ((mi_col + x_idx_m) * 4 >> (CONFIG_SB8X8 +
+                                                   xd->plane[p].subsampling_x)),
+                     a3 + 4 * p,
+                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
+        }
+
+        // try 8x16 coding
+        r2 = 0;
+        d2 = 0;
+        xd->b_index = 0;
+        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
+                      tp, &r, &d, BLOCK_SIZE_SB8X16,
+                      &x->sb8x16_context[xd->sb_index][xd->mb_index]
+                                        [xd->b_index]);
+        r2 += r;
+        d2 += d;
+        update_state(cpi, &x->sb8x16_context[xd->sb_index][xd->mb_index]
+                                            [xd->b_index],
+                     BLOCK_SIZE_SB8X16, 0);
+        encode_superblock(cpi, tp,
+                          0, mi_row + y_idx_m, mi_col + x_idx_m,
+                          BLOCK_SIZE_SB8X16);
+        xd->b_index = 1;
+        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m + 1,
+                      tp, &r, &d, BLOCK_SIZE_SB8X16,
+                      &x->sb8x16_context[xd->sb_index][xd->mb_index]
+                                        [xd->b_index]);
+        r2 += r;
+        d2 += d;
+        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
+        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
+        r2 += x->partition_cost[pl][PARTITION_VERT];
+        if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
+                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
+          mb16_rate = r2;  // keep the summed cost of both 8x16 halves
+          mb16_dist = d2;
+          mb_partitioning[i][j] = BLOCK_SIZE_SB8X16;
+        }
+        for (p = 0; p < MAX_MB_PLANE; p++) {
+          vpx_memcpy(cm->left_context[p] +
+                         (y_idx_m * 4 >> (CONFIG_SB8X8 +
+                                          xd->plane[p].subsampling_y)),
+                     l3 + 4 * p,
+                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
+          vpx_memcpy(cm->above_context[p] +
+                         ((mi_col + x_idx_m) * 4 >> (CONFIG_SB8X8 +
+                                                   xd->plane[p].subsampling_x)),
+                     a3 + 4 * p,
+                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
+        }
+
+        // try 16x8 coding
+        r2 = 0;
+        d2 = 0;
+        xd->b_index = 0;
+        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
+                      tp, &r, &d, BLOCK_SIZE_SB16X8,
+                      &x->sb16x8_context[xd->sb_index][xd->mb_index]
+                                        [xd->b_index]);
+        r2 += r;
+        d2 += d;
+        update_state(cpi, &x->sb16x8_context[xd->sb_index][xd->mb_index]
+                                            [xd->b_index],
+                     BLOCK_SIZE_SB16X8, 0);
+        encode_superblock(cpi, tp,
+                          0, mi_row + y_idx_m, mi_col + x_idx_m,
+                          BLOCK_SIZE_SB16X8);
+        xd->b_index = 1;
+        pick_sb_modes(cpi, mi_row + y_idx_m + 1, mi_col + x_idx_m,
+                      tp, &r, &d, BLOCK_SIZE_SB16X8,
+                      &x->sb16x8_context[xd->sb_index][xd->mb_index]
+                                        [xd->b_index]);
+        r2 += r;
+        d2 += d;
+        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
+        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
+        r2 += x->partition_cost[pl][PARTITION_HORZ];
+        if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
+                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
+          mb16_rate = r2;  // keep the summed cost of both 16x8 halves
+          mb16_dist = d2;
+          mb_partitioning[i][j] = BLOCK_SIZE_SB16X8;
+        }
+        for (p = 0; p < MAX_MB_PLANE; p++) {
+          vpx_memcpy(cm->left_context[p] +
+                         (y_idx_m * 4 >> (CONFIG_SB8X8 +
+                                          xd->plane[p].subsampling_y)),
+                     l3 + 4 * p,
+                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
+          vpx_memcpy(cm->above_context[p] +
+                         ((mi_col + x_idx_m) * 4 >> (CONFIG_SB8X8 +
+                                                   xd->plane[p].subsampling_x)),
+                     a3 + 4 * p,
+                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
+        }
+
+        // try as 16x16
+        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
+                      tp, &r, &d, BLOCK_SIZE_MB16X16,
+                      &x->mb_context[xd->sb_index][xd->mb_index]);
+        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
+        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
+        r += x->partition_cost[pl][PARTITION_NONE];
+        if (RDCOST(x->rdmult, x->rddiv, r, d) <
+                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
+          mb16_rate = r;
+          mb16_dist = d;
+          mb_partitioning[i][j] = BLOCK_SIZE_MB16X16;
+        }
+        sb32_rate += mb16_rate;
+        sb32_dist += mb16_dist;
+#else
         splitmodes_used += pick_mb_mode(cpi, mi_row + y_idx_m,
                                         mi_col + x_idx_m, tp, &r, &d);
         sb32_rate += r;
         sb32_dist += d;
+#endif
 
         // Dummy encode, do not do the tokenization
 #if CONFIG_SB8X8
-        update_state(cpi, &x->mb_context[xd->sb_index][xd->mb_index],
-                     BLOCK_SIZE_MB16X16, 0);
-#endif
+        encode_sb(cpi, tp, mi_row + y_idx_m, mi_col + x_idx_m, 0,
+                  BLOCK_SIZE_MB16X16, mb_partitioning[i][j], NULL, NULL);
+#else
         encode_macroblock(cpi, tp, 0, mi_row + y_idx_m,
                           mi_col + x_idx_m);
+#endif
       }
 
       /* Restore L & A coding context to those in place on entry */
@@ -1170,7 +1395,12 @@ static void encode_sb_row(VP9_COMP *cpi,
       // instead of small->big) means we can use as threshold for small, which
       // may enable breakouts if RD is not good enough (i.e. faster)
       encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
-                BLOCK_SIZE_SB32X32, sb_partitioning[i], sb_partitioning);
+#if CONFIG_SB8X8
+                BLOCK_SIZE_SB32X32, sb_partitioning[i], mb_partitioning[i],
+                NULL);
+#else
+                BLOCK_SIZE_SB32X32, sb_partitioning[i], NULL);
+#endif
     }
 
     for (p = 0; p < MAX_MB_PLANE; p++) {
@@ -1221,7 +1451,7 @@ static void encode_sb_row(VP9_COMP *cpi,
               RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
         sb64_rate = r;
         sb64_dist = d;
-        sb_partitioning[0] = BLOCK_SIZE_SB64X32;
+        sb64_partitioning = BLOCK_SIZE_SB64X32;
       }
 
       for (p = 0; p < MAX_MB_PLANE; p++) {
@@ -1266,7 +1496,7 @@ static void encode_sb_row(VP9_COMP *cpi,
               RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
         sb64_rate = r;
         sb64_dist = d;
-        sb_partitioning[0] = BLOCK_SIZE_SB32X64;
+        sb64_partitioning = BLOCK_SIZE_SB32X64;
       }
 
       for (p = 0; p < MAX_MB_PLANE; p++) {
@@ -1295,13 +1525,17 @@ static void encode_sb_row(VP9_COMP *cpi,
               RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
         sb64_rate = r;
         sb64_dist = d;
-        sb_partitioning[0] = BLOCK_SIZE_SB64X64;
+        sb64_partitioning = BLOCK_SIZE_SB64X64;
       }
     }
 
     assert(tp_orig == *tp);
-    encode_sb(cpi, tp, mi_row, mi_col, 1,
-              BLOCK_SIZE_SB64X64, sb_partitioning[0], sb_partitioning);
+    encode_sb(cpi, tp, mi_row, mi_col, 1, BLOCK_SIZE_SB64X64,
+#if CONFIG_SB8X8
+              sb64_partitioning, sb_partitioning, mb_partitioning);
+#else
+              sb64_partitioning, sb_partitioning);
+#endif
     assert(tp_orig < *tp);
   }
 }
@@ -1346,10 +1580,14 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
   vp9_zero(cpi->count_mb_ref_frame_usage)
   vp9_zero(cpi->bmode_count)
   vp9_zero(cpi->ymode_count)
+#if !CONFIG_SB8X8
   vp9_zero(cpi->i8x8_mode_count)
+#endif
   vp9_zero(cpi->y_uv_mode_count)
   vp9_zero(cpi->sub_mv_ref_count)
+#if !CONFIG_SB8X8
   vp9_zero(cpi->mbsplit_count)
+#endif
   vp9_zero(cpi->common.fc.mv_ref_ct)
   vp9_zero(cpi->sb_ymode_count)
   vp9_zero(cpi->partition_count);
@@ -1616,9 +1854,17 @@ static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
     assert(bwl < bsl && bhl < bsl);
     if (bsize == BLOCK_SIZE_SB64X64) {
       subsize = BLOCK_SIZE_SB32X32;
+#if CONFIG_SB8X8
+    } else if (bsize == BLOCK_SIZE_SB32X32) {
+      subsize = BLOCK_SIZE_MB16X16;
+    } else {
+      assert(bsize == BLOCK_SIZE_MB16X16);
+      subsize = BLOCK_SIZE_SB8X8;
+#else
     } else {
       assert(bsize == BLOCK_SIZE_SB32X32);
       subsize = BLOCK_SIZE_MB16X16;
+#endif
     }
 
     for (n = 0; n < 4; n++) {
@@ -1823,9 +2069,10 @@ static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
 
     do {
       ++ bct[xd->block[b].bmi.as_mode.first];
-    } while (++b < 16);
+    } while (++b < (16 >> (CONFIG_SB8X8 * 2)));
   }
 
+#if !CONFIG_SB8X8
   if (m == I8X8_PRED) {
     i8x8_modes[xd->block[0].bmi.as_mode.first]++;
     i8x8_modes[xd->block[2].bmi.as_mode.first]++;
@@ -1833,20 +2080,25 @@ static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
     i8x8_modes[xd->block[10].bmi.as_mode.first]++;
   }
 #endif
+#endif
 
   if (xd->mode_info_context->mbmi.sb_type > BLOCK_SIZE_MB16X16) {
     ++cpi->sb_ymode_count[m];
   } else {
     ++cpi->ymode_count[m];
   }
+#if !CONFIG_SB8X8
   if (m != I8X8_PRED)
+#endif
     ++cpi->y_uv_mode_count[m][uvm];
+#if !CONFIG_SB8X8
   else {
     cpi->i8x8_mode_count[xd->mode_info_context->bmi[0].as_mode.first]++;
     cpi->i8x8_mode_count[xd->mode_info_context->bmi[2].as_mode.first]++;
     cpi->i8x8_mode_count[xd->mode_info_context->bmi[8].as_mode.first]++;
     cpi->i8x8_mode_count[xd->mode_info_context->bmi[10].as_mode.first]++;
   }
+#endif
   if (m == I4X4_PRED) {
     int b = 0;
     do {
@@ -1855,7 +2107,7 @@ static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
       if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
 #endif
       ++cpi->bmode_count[m];
-    } while (++b < 16);
+    } while (++b < (16 >> (CONFIG_SB8X8 * 2)));
   }
 }
 
@@ -1880,6 +2132,7 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) {
 #endif
 }
 
+#if !CONFIG_SB8X8
 static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
                               int output_enabled,
                               int mi_row, int mi_col) {
@@ -2103,7 +2356,6 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
 #if CONFIG_SB8X8
       int y, x;
 #endif
-
       if (mbmi->mode != I4X4_PRED && mbmi->mode != I8X8_PRED &&
           mbmi->mode != SPLITMV && cpi->common.txfm_mode >= ALLOW_16X16) {
         mbmi->txfm_size = TX_16X16;
@@ -2128,6 +2380,7 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
     }
   }
 }
+#endif
 
 static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
                               int output_enabled, int mi_row, int mi_col,
@@ -2178,6 +2431,24 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
     vp9_update_zbin_extra(cpi, x);
   }
 
+#if CONFIG_SB8X8
+  if (xd->mode_info_context->mbmi.mode == I4X4_PRED) {
+    assert(bsize == BLOCK_SIZE_SB8X8 &&
+           xd->mode_info_context->mbmi.txfm_size == TX_4X4);
+
+    vp9_encode_intra4x4mby(x, bsize);
+    vp9_build_intra_predictors_sbuv_s(&x->e_mbd, bsize);
+    vp9_subtract_sbuv(x, bsize);
+    vp9_transform_sbuv_4x4(x, bsize);
+    vp9_quantize_sbuv_4x4(x, bsize);
+    vp9_optimize_sbuv(cm, x, bsize);
+    vp9_inverse_transform_sbuv_4x4(xd, bsize);
+    vp9_recon_sbuv(xd, bsize);
+
+    if (output_enabled)
+      sum_intra_stats(cpi, x);
+  } else
+#endif
   if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
     vp9_build_intra_predictors_sby_s(&x->e_mbd, bsize);
     vp9_build_intra_predictors_sbuv_s(&x->e_mbd, bsize);
@@ -2213,6 +2484,12 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
   }
 
+#if CONFIG_SB8X8
+  if (xd->mode_info_context->mbmi.mode == I4X4_PRED) {
+    assert(bsize == BLOCK_SIZE_SB8X8);
+    vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled, bsize);
+  } else
+#endif
   if (!x->skip) {
     vp9_subtract_sb(x, bsize);
 
@@ -2228,11 +2505,11 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
           vp9_quantize_sbuv_16x16(x, bsize);
         }
         if (x->optimize) {
-          vp9_optimize_sby_32x32(cm, x, bsize);
+          vp9_optimize_sby(cm, x, bsize);
           if (bsize == BLOCK_SIZE_SB64X64)
-            vp9_optimize_sbuv_32x32(cm, x, bsize);
+            vp9_optimize_sbuv(cm, x, bsize);
           else
-            vp9_optimize_sbuv_16x16(cm, x, bsize);
+            vp9_optimize_sbuv(cm, x, bsize);
         }
         vp9_inverse_transform_sby_32x32(xd, bsize);
         if (bsize == BLOCK_SIZE_SB64X64)
@@ -2251,11 +2528,11 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
           vp9_quantize_sbuv_8x8(x, bsize);
         }
         if (x->optimize) {
-          vp9_optimize_sby_16x16(cm, x, bsize);
+          vp9_optimize_sby(cm, x, bsize);
           if (bsize >= BLOCK_SIZE_SB32X32)
-            vp9_optimize_sbuv_16x16(cm, x, bsize);
+            vp9_optimize_sbuv(cm, x, bsize);
           else
-            vp9_optimize_sbuv_8x8(cm, x, bsize);
+            vp9_optimize_sbuv(cm, x, bsize);
         }
         vp9_inverse_transform_sby_16x16(xd, bsize);
         if (bsize >= BLOCK_SIZE_SB32X32)
@@ -2265,15 +2542,23 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
         break;
       case TX_8X8:
         vp9_transform_sby_8x8(x, bsize);
-        vp9_transform_sbuv_8x8(x, bsize);
         vp9_quantize_sby_8x8(x, bsize);
-        vp9_quantize_sbuv_8x8(x, bsize);
-        if (x->optimize) {
-          vp9_optimize_sby_8x8(cm, x, bsize);
-          vp9_optimize_sbuv_8x8(cm, x, bsize);
-        }
+        if (x->optimize)
+          vp9_optimize_sby(cm, x, bsize);
         vp9_inverse_transform_sby_8x8(xd, bsize);
-        vp9_inverse_transform_sbuv_8x8(xd, bsize);
+        if (bsize >= BLOCK_SIZE_MB16X16) {
+          vp9_transform_sbuv_8x8(x, bsize);
+          vp9_quantize_sbuv_8x8(x, bsize);
+          if (x->optimize)
+            vp9_optimize_sbuv(cm, x, bsize);
+          vp9_inverse_transform_sbuv_8x8(xd, bsize);
+        } else {
+          vp9_transform_sbuv_4x4(x, bsize);
+          vp9_quantize_sbuv_4x4(x, bsize);
+          if (x->optimize)
+            vp9_optimize_sbuv(cm, x, bsize);
+          vp9_inverse_transform_sbuv_4x4(xd, bsize);
+        }
         break;
       case TX_4X4:
         vp9_transform_sby_4x4(x, bsize);
@@ -2281,8 +2566,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
         vp9_quantize_sby_4x4(x, bsize);
         vp9_quantize_sbuv_4x4(x, bsize);
         if (x->optimize) {
-          vp9_optimize_sby_4x4(cm, x, bsize);
-          vp9_optimize_sbuv_4x4(cm, x, bsize);
+          vp9_optimize_sby(cm, x, bsize);
+          vp9_optimize_sbuv(cm, x, bsize);
         }
         vp9_inverse_transform_sby_4x4(xd, bsize);
         vp9_inverse_transform_sbuv_4x4(xd, bsize);
@@ -2316,8 +2601,10 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
           vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))) {
       if (bsize >= BLOCK_SIZE_SB32X32) {
         cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
-      } else {
+      } else if (bsize >= BLOCK_SIZE_MB16X16) {
         cpi->txfm_count_16x16p[mi->mbmi.txfm_size]++;
+      } else {
+        cpi->txfm_count_8x8p[mi->mbmi.txfm_size]++;
       }
     } else {
       int x, y;
@@ -2325,6 +2612,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
 
       if (sz == TX_32X32 && bsize < BLOCK_SIZE_SB32X32)
         sz = TX_16X16;
+      if (sz == TX_16X16 && bsize < BLOCK_SIZE_MB16X16)
+        sz = TX_8X8;
 
       for (y = 0; y < bh; y++) {
         for (x = 0; x < bw; x++) {
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index f6ddca8f4..c5f29fe7e 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -22,12 +22,15 @@ int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   (void) cpi;
 
+#if !CONFIG_SB8X8
   if (use_16x16_pred) {
+#endif
     mbmi->mode = DC_PRED;
     mbmi->uv_mode = DC_PRED;
     mbmi->ref_frame = INTRA_FRAME;
 
     vp9_encode_intra16x16mby(&cpi->common, x);
+#if !CONFIG_SB8X8
   } else {
     int i;
 
@@ -36,6 +39,7 @@ int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
       encode_intra4x4block(x, i, BLOCK_SIZE_MB16X16);
     }
   }
+#endif
 
   return vp9_get_mb_ss(x->plane[0].src_diff);
 }
@@ -58,7 +62,7 @@ static void encode_intra4x4block(MACROBLOCK *x, int ib,
                                 xd->plane[0].diff);
   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
 
-  assert(ib < 16);
+  assert(ib < (16 >> (2 * CONFIG_SB8X8)));
 
 #if CONFIG_NEWBINTRAMODES
   xd->mode_info_context->bmi[ib].as_mode.context =
@@ -68,22 +72,22 @@ static void encode_intra4x4block(MACROBLOCK *x, int ib,
   vp9_intra4x4_predict(&x->e_mbd, ib,
                        xd->mode_info_context->bmi[ib].as_mode.first,
                        dst, xd->plane[0].dst.stride);
-  vp9_subtract_block(4, 4, src_diff, 16,
+  vp9_subtract_block(4, 4, src_diff, 16 >> CONFIG_SB8X8,
                      src, x->plane[0].src.stride,
                      dst, xd->plane[0].dst.stride);
 
   tx_type = get_tx_type_4x4(&x->e_mbd, ib);
   if (tx_type != DCT_DCT) {
-    vp9_short_fht4x4(src_diff, coeff, 16, tx_type);
+    vp9_short_fht4x4(src_diff, coeff, 16 >> CONFIG_SB8X8, tx_type);
     x->quantize_b_4x4(x, ib, tx_type, 16);
     vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                     diff, 16, tx_type);
+                     diff, 16 >> CONFIG_SB8X8, tx_type);
   } else {
-    x->fwd_txm4x4(src_diff, coeff, 32);
+    x->fwd_txm4x4(src_diff, coeff, 32 >> CONFIG_SB8X8);
     x->quantize_b_4x4(x, ib, tx_type, 16);
     vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[0].eobs[ib],
                                 BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                                diff, 32);
+                                diff, 32 >> CONFIG_SB8X8);
   }
 
   vp9_recon_b(dst, diff, dst, xd->plane[0].dst.stride);
@@ -110,21 +114,21 @@ void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x) {
       vp9_transform_sby_16x16(x, BLOCK_SIZE_MB16X16);
       vp9_quantize_sby_16x16(x, BLOCK_SIZE_MB16X16);
       if (x->optimize)
-        vp9_optimize_sby_16x16(cm, x, BLOCK_SIZE_MB16X16);
+        vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
       vp9_inverse_transform_sby_16x16(xd, BLOCK_SIZE_MB16X16);
       break;
     case TX_8X8:
       vp9_transform_sby_8x8(x, BLOCK_SIZE_MB16X16);
       vp9_quantize_sby_8x8(x, BLOCK_SIZE_MB16X16);
       if (x->optimize)
-        vp9_optimize_sby_8x8(cm, x, BLOCK_SIZE_MB16X16);
+        vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
       vp9_inverse_transform_sby_8x8(xd, BLOCK_SIZE_MB16X16);
       break;
     default:
       vp9_transform_sby_4x4(x, BLOCK_SIZE_MB16X16);
       vp9_quantize_sby_4x4(x, BLOCK_SIZE_MB16X16);
       if (x->optimize)
-        vp9_optimize_sby_4x4(cm, x, BLOCK_SIZE_MB16X16);
+        vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
       vp9_inverse_transform_sby_4x4(xd, BLOCK_SIZE_MB16X16);
       break;
   }
@@ -144,14 +148,14 @@ void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x) {
       vp9_transform_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
       vp9_quantize_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
       if (x->optimize)
-        vp9_optimize_sbuv_4x4(cm, x, BLOCK_SIZE_MB16X16);
+        vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
       vp9_inverse_transform_sbuv_4x4(xd, BLOCK_SIZE_MB16X16);
       break;
     default:  // 16x16 or 8x8
       vp9_transform_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
       vp9_quantize_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
       if (x->optimize)
-        vp9_optimize_sbuv_8x8(cm, x, BLOCK_SIZE_MB16X16);
+        vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
       vp9_inverse_transform_sbuv_8x8(xd, BLOCK_SIZE_MB16X16);
       break;
     }
@@ -159,6 +163,7 @@ void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x) {
   vp9_recon_sbuv(xd, BLOCK_SIZE_MB16X16);
 }
 
+#if !CONFIG_SB8X8
 void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
   MACROBLOCKD *xd = &x->e_mbd;
   uint8_t* const src =
@@ -304,3 +309,4 @@ void vp9_encode_intra8x8mbuv(MACROBLOCK *x) {
     encode_intra_uv4x4(x, i + 20, mode);  // v
   }
 }
+#endif
diff --git a/vp9/encoder/vp9_encodeintra.h b/vp9/encoder/vp9_encodeintra.h
index 7ec2f11d4..a4f4c184b 100644
--- a/vp9/encoder/vp9_encodeintra.h
+++ b/vp9/encoder/vp9_encodeintra.h
@@ -17,8 +17,10 @@ int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
 void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bs);
+#if !CONFIG_SB8X8
 void vp9_encode_intra8x8mby(MACROBLOCK *x);
 void vp9_encode_intra8x8mbuv(MACROBLOCK *x);
 void vp9_encode_intra8x8(MACROBLOCK *x, int ib);
+#endif
 
 #endif  // VP9_ENCODER_VP9_ENCODEINTRA_H_
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 0cb1ae958..15fd4f1b6 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -249,57 +249,53 @@ static int trellis_get_coeff_context(const int *scan,
   return pt;
 }
 
-static void optimize_b(VP9_COMMON *const cm,
-                       MACROBLOCK *mb, int ib, PLANE_TYPE type,
-                       const int16_t *dequant_ptr,
+static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
+                       int plane, int block, BLOCK_SIZE_TYPE bsize,
                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                       int tx_size, int y_blocks) {
+                       TX_SIZE tx_size) {
   const int ref = mb->e_mbd.mode_info_context->mbmi.ref_frame != INTRA_FRAME;
   MACROBLOCKD *const xd = &mb->e_mbd;
   vp9_token_state tokens[1025][2];
   unsigned best_index[1025][2];
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, ib);
-  const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff,
-                                          pb_idx.block, 16);
+  const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff,
+                                          block, 16);
   int16_t *qcoeff_ptr;
   int16_t *dqcoeff_ptr;
-  int eob = xd->plane[pb_idx.plane].eobs[pb_idx.block], final_eob, sz = 0;
+  int eob = xd->plane[plane].eobs[block], final_eob, sz = 0;
   const int i0 = 0;
   int rc, x, next, i;
   int64_t rdmult, rddiv, rd_cost0, rd_cost1;
   int rate0, rate1, error0, error1, t0, t1;
   int best, band, pt;
+  PLANE_TYPE type = xd->plane[plane].plane_type;
   int err_mult = plane_rd_mult[type];
   int default_eob, pad;
   int const *scan, *nb;
   const int mul = 1 + (tx_size == TX_32X32);
   uint8_t token_cache[1024];
+  const int ib = txfrm_block_to_raster_block(xd, bsize, plane,
+                                             block, 2 * tx_size);
+  const int16_t *dequant_ptr = xd->plane[plane].dequant;
 
-  assert((!type && !pb_idx.plane) || (type && pb_idx.plane));
-  dqcoeff_ptr = BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16);
-  qcoeff_ptr = BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16);
+  assert((!type && !plane) || (type && plane));
+  dqcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16);
+  qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
   switch (tx_size) {
     default:
     case TX_4X4: {
-      const TX_TYPE tx_type = get_tx_type_4x4(xd, ib);
+      const TX_TYPE tx_type = plane == 0 ? get_tx_type_4x4(xd, ib) : DCT_DCT;
       default_eob = 16;
       scan = get_scan_4x4(tx_type);
       break;
     }
     case TX_8X8: {
-      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-      const int sz = 1 + b_width_log2(sb_type);
-      const int x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = get_tx_type_8x8(xd, y + (x >> 1));
+      const TX_TYPE tx_type = plane == 0 ? get_tx_type_8x8(xd, ib) : DCT_DCT;
       scan = get_scan_8x8(tx_type);
       default_eob = 64;
       break;
     }
     case TX_16X16: {
-      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-      const int sz = 2 + b_width_log2(sb_type);
-      const int x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = get_tx_type_16x16(xd, y + (x >> 2));
+      const TX_TYPE tx_type = plane == 0 ? get_tx_type_16x16(xd, ib) : DCT_DCT;
       scan = get_scan_16x16(tx_type);
       default_eob = 256;
       break;
@@ -480,203 +476,84 @@ static void optimize_b(VP9_COMMON *const cm,
   }
   final_eob++;
 
-  xd->plane[pb_idx.plane].eobs[pb_idx.block] = final_eob;
+  xd->plane[plane].eobs[block] = final_eob;
   *a = *l = (final_eob > 0);
 }
 
-void vp9_optimize_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT *a = xd->plane[0].above_context;
-  ENTROPY_CONTEXT *l = xd->plane[0].left_context;
-  const int bwl = b_width_log2(bsize) - 3, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 3);
-  ENTROPY_CONTEXT ta[2], tl[2];
-  int n;
-
-  for (n = 0; n < bw; n++, a += 8)
-    ta[n] = (a[0] + a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7]) != 0;
-  for (n = 0; n < bh; n++, l += 8)
-    tl[n] = (l[0] + l[1] + l[2] + l[3] + l[4] + l[5] + l[6] + l[7]) != 0;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-
-    optimize_b(cm, x, n * 64, PLANE_TYPE_Y_WITH_DC, x->e_mbd.plane[0].dequant,
-               ta + x_idx, tl + y_idx, TX_32X32, 64 * bw * bh);
-  }
-}
-
-void vp9_optimize_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT *a = xd->plane[0].above_context;
-  ENTROPY_CONTEXT *l = xd->plane[0].left_context;
-  const int bwl = b_width_log2(bsize) - 2, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 2);
-  ENTROPY_CONTEXT ta[4], tl[4];
-  int n;
-
-  for (n = 0; n < bw; n++, a += 4)
-    ta[n] = (a[0] + a[1] + a[2] + a[3]) != 0;
-  for (n = 0; n < bh; n++, l += 4)
-    tl[n] = (l[0] + l[1] + l[2] + l[3]) != 0;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-
-    optimize_b(cm, x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.plane[0].dequant,
-               ta + x_idx, tl + y_idx, TX_16X16, 16 * bw * bh);
-  }
-}
-
-void vp9_optimize_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                          BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT *a = xd->plane[0].above_context;
-  ENTROPY_CONTEXT *l = xd->plane[0].left_context;
-  const int bwl = b_width_log2(bsize) - 1, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 1);
-  ENTROPY_CONTEXT ta[8], tl[8];
-  int n;
-
-  for (n = 0; n < bw; n++, a += 2)
-    ta[n] = (a[0] + a[1]) != 0;
-  for (n = 0; n < bh; n++, l += 2)
-    tl[n] = (l[0] + l[1]) != 0;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-
-    optimize_b(cm, x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.plane[0].dequant,
-               ta + x_idx, tl + y_idx, TX_8X8, 4 * bw * bh);
-  }
-}
-
-void vp9_optimize_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                          BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int bwl = b_width_log2(bsize), bw = 1 << bwl;
-  int bh = 1 << b_height_log2(bsize);
-  ENTROPY_CONTEXT ta[16], tl[16];
-  int n;
-
-  vpx_memcpy(ta, xd->plane[0].above_context, sizeof(ENTROPY_CONTEXT) * bw);
-  vpx_memcpy(tl, xd->plane[0].left_context, sizeof(ENTROPY_CONTEXT) * bh);
+struct optimize_ctx {
+  ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
+  ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
+};
 
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
+struct optimize_block_args {
+  VP9_COMMON *cm;
+  MACROBLOCK *x;
+  struct optimize_ctx *ctx;
+};
 
-    optimize_b(cm, x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.plane[0].dequant,
-               ta + x_idx, tl + y_idx, TX_4X4, bh * bw);
-  }
-}
+static void optimize_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                           int ss_txfrm_size, void *arg) {
+  const struct optimize_block_args* const args = arg;
+  MACROBLOCKD* const xd = &args->x->e_mbd;
+  int x, y;
 
-void vp9_optimize_sbuv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                             BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int b;
+  // find current entropy context
+  txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y);
 
-  assert(bsize == BLOCK_SIZE_SB64X64);
-  for (b = 256; b < 384; b += 64) {
-    const int plane = 1 + (b >= 320);
-    ENTROPY_CONTEXT *a = xd->plane[plane].above_context;
-    ENTROPY_CONTEXT *l = xd->plane[plane].left_context;
-    ENTROPY_CONTEXT a_ec, l_ec;
-
-    a_ec = (a[0] + a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7]) != 0;
-    l_ec = (l[0] + l[1] + l[2] + l[3] + l[4] + l[5] + l[6] + l[7]) != 0;
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.plane[plane].dequant,
-               &a_ec, &l_ec, TX_32X32, 256);
-  }
+  optimize_b(args->cm, args->x, plane, block, bsize,
+             &args->ctx->ta[plane][x], &args->ctx->tl[plane][y],
+             ss_txfrm_size / 2);
 }
 
-void vp9_optimize_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                             BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int bwl = b_width_log2(bsize) - 2, bhl = b_height_log2(bsize) - 2;
-  const int bw = 1 << (bwl - 1);
-  const int bh = 1 << (bhl - 1);
-  int uvoff = 16 << (bwl + bhl);
-  int plane, n;
-
-  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
-    ENTROPY_CONTEXT ta[2], *a = xd->plane[plane].above_context;
-    ENTROPY_CONTEXT tl[2], *l = xd->plane[plane].left_context;
-
-    for (n = 0; n < bw; n++, a += 4)
-      ta[n] = (a[0] + a[1] + a[2] + a[3]) != 0;
-    for (n = 0; n < bh; n++, l += 4)
-      tl[n] = (l[0] + l[1] + l[2] + l[3]) != 0;
-
-    for (n = 0; n < bw * bh; n++) {
-      const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-      optimize_b(cm, x, uvoff + n * 16, PLANE_TYPE_UV,
-                 x->e_mbd.plane[plane].dequant,
-                 &ta[x_idx], &tl[y_idx],
-                 TX_16X16, bh * bw * 64);
+void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+                       struct optimize_ctx *ctx) {
+  int p;
+
+  for (p = 0; p < MAX_MB_PLANE; p++) {
+    const struct macroblockd_plane* const plane = &xd->plane[p];
+    const int bwl = b_width_log2(bsize) - plane->subsampling_x;
+    const int bhl = b_height_log2(bsize) - plane->subsampling_y;
+    const TX_SIZE tx_size = tx_size_for_plane(xd, bsize, p);
+    int i, j;
+
+    for (i = 0; i < 1 << bwl; i += 1 << tx_size) {
+      int c = 0;
+      ctx->ta[p][i] = 0;
+      for (j = 0; j < 1 << tx_size && !c; j++) {
+        c = ctx->ta[p][i] |= plane->above_context[i + j];
+      }
+    }
+    for (i = 0; i < 1 << bhl; i += 1 << tx_size) {
+      int c = 0;
+      ctx->tl[p][i] = 0;
+      for (j = 0; j < 1 << tx_size && !c; j++) {
+        c = ctx->tl[p][i] |= plane->left_context[i + j];
+      }
     }
-    uvoff = (uvoff * 5) >> 2;  // switch u -> v
   }
 }
 
-void vp9_optimize_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                           BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int bwl = b_width_log2(bsize) - 1, bhl = b_height_log2(bsize) - 1;
-  const int bw = 1 << (bwl - 1);
-  const int bh = 1 << (bhl - 1);
-  int uvoff = 4 << (bwl + bhl);
-  int plane, n;
-
-  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
-    ENTROPY_CONTEXT ta[4], *a = xd->plane[plane].above_context;
-    ENTROPY_CONTEXT tl[4], *l = xd->plane[plane].left_context;
-
-    for (n = 0; n < bw; n++, a += 2)
-      ta[n] = (a[0] + a[1]) != 0;
-    for (n = 0; n < bh; n++, l += 2)
-      tl[n] = (l[0] + l[1]) != 0;
-
-    for (n = 0; n < bw * bh; n++) {
-      const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-      optimize_b(cm, x, uvoff + n * 4, PLANE_TYPE_UV,
-                 x->e_mbd.plane[plane].dequant,
-                 &ta[x_idx], &tl[y_idx],
-                 TX_8X8, bh * bw * 16);
-    }
-    uvoff = (uvoff * 5) >> 2;  // switch u -> v
-  }
+void vp9_optimize_sby(VP9_COMMON *const cm, MACROBLOCK *x,
+                      BLOCK_SIZE_TYPE bsize) {
+  struct optimize_ctx ctx;
+  struct optimize_block_args arg = {cm, x, &ctx};
+  vp9_optimize_init(&x->e_mbd, bsize, &ctx);
+  foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
+#if !CONFIG_SB8X8
+  0,
+#endif
+                                     optimize_block, &arg);
 }
 
-void vp9_optimize_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                           BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-  const int bw = 1 << (bwl - 1);
-  const int bh = 1 << (bhl - 1);
-  int uvoff = 1 << (bwl + bhl);
-  int plane, n;
-
-  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
-    ENTROPY_CONTEXT ta[8], tl[8];
-
-    vpx_memcpy(ta, xd->plane[plane].above_context,
-               sizeof(ENTROPY_CONTEXT) * bw);
-    vpx_memcpy(tl, xd->plane[plane].left_context,
-               sizeof(ENTROPY_CONTEXT) * bh);
-
-    for (n = 0; n < bw * bh; n++) {
-      const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-      optimize_b(cm, x, uvoff + n, PLANE_TYPE_UV,
-                 x->e_mbd.plane[plane].dequant,
-                 &ta[x_idx], &tl[y_idx],
-                 TX_4X4, bh * bw * 4);
-    }
-    uvoff = (uvoff * 5) >> 2;  // switch u -> v
-  }
+void vp9_optimize_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
+                       BLOCK_SIZE_TYPE bsize) {
+  struct optimize_ctx ctx;
+  struct optimize_block_args arg = {cm, x, &ctx};
+  vp9_optimize_init(&x->e_mbd, bsize, &ctx);
+  foreach_transformed_block_uv(&x->e_mbd, bsize, optimize_block, &arg);
 }
 
+#if !CONFIG_SB8X8
 void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
@@ -687,8 +564,8 @@ void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x) {
     vp9_quantize_sby_16x16(x, BLOCK_SIZE_MB16X16);
     vp9_quantize_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
     if (x->optimize) {
-      vp9_optimize_sby_16x16(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_optimize_sbuv_8x8(cm, x, BLOCK_SIZE_MB16X16);
+      vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
+      vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
     }
     vp9_inverse_transform_sby_16x16(xd, BLOCK_SIZE_MB16X16);
     vp9_inverse_transform_sbuv_8x8(xd, BLOCK_SIZE_MB16X16);
@@ -696,20 +573,20 @@ void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x) {
     vp9_transform_sby_8x8(x, BLOCK_SIZE_MB16X16);
     vp9_quantize_sby_8x8(x, BLOCK_SIZE_MB16X16);
     if (x->optimize)
-      vp9_optimize_sby_8x8(cm, x, BLOCK_SIZE_MB16X16);
+      vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
     vp9_inverse_transform_sby_8x8(xd, BLOCK_SIZE_MB16X16);
     if (xd->mode_info_context->mbmi.mode == SPLITMV) {
       assert(xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4);
       vp9_transform_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
       vp9_quantize_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
       if (x->optimize)
-        vp9_optimize_sbuv_4x4(cm, x, BLOCK_SIZE_MB16X16);
+        vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
       vp9_inverse_transform_sbuv_4x4(xd, BLOCK_SIZE_MB16X16);
     } else {
       vp9_transform_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
       vp9_quantize_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
       if (x->optimize)
-        vp9_optimize_sbuv_8x8(cm, x, BLOCK_SIZE_MB16X16);
+        vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
       vp9_inverse_transform_sbuv_8x8(xd, BLOCK_SIZE_MB16X16);
     }
   } else {
@@ -718,8 +595,8 @@ void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x) {
     vp9_quantize_sby_4x4(x, BLOCK_SIZE_MB16X16);
     vp9_quantize_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
     if (x->optimize) {
-      vp9_optimize_sby_4x4(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_optimize_sbuv_4x4(cm, x, BLOCK_SIZE_MB16X16);
+      vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
+      vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
     }
     vp9_inverse_transform_sby_4x4(xd, BLOCK_SIZE_MB16X16);
     vp9_inverse_transform_sbuv_4x4(xd, BLOCK_SIZE_MB16X16);
@@ -735,6 +612,7 @@ void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,
   vp9_fidct_mb(cm, x);
   vp9_recon_sb(xd, BLOCK_SIZE_MB16X16);
 }
+#endif
 
 /* this function is used by first pass only */
 void vp9_encode_inter16x16y(MACROBLOCK *x, int mi_row, int mi_col) {
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index da134a86b..b1d8771e0 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -24,37 +24,30 @@ typedef struct {
 
 
 struct VP9_ENCODER_RTCD;
+#if !CONFIG_SB8X8
 void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,
                            int mb_row, int mb_col);
+#endif
 
 void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col);
 
 void vp9_transform_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_optimize_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize);
 void vp9_transform_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_optimize_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize);
 void vp9_transform_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_optimize_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                          BLOCK_SIZE_TYPE bsize);
 void vp9_transform_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_optimize_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                          BLOCK_SIZE_TYPE bsize);
 void vp9_transform_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_optimize_sbuv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                             BLOCK_SIZE_TYPE bsize);
 void vp9_transform_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_optimize_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                             BLOCK_SIZE_TYPE bsize);
 void vp9_transform_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_optimize_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                           BLOCK_SIZE_TYPE bsize);
 void vp9_transform_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_optimize_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                           BLOCK_SIZE_TYPE bsize);
 
+void vp9_optimize_sby(VP9_COMMON *const cm, MACROBLOCK *x,
+                      BLOCK_SIZE_TYPE bsize);
+void vp9_optimize_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
+                       BLOCK_SIZE_TYPE bsize);
+
+#if !CONFIG_SB8X8
 void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x);
+#endif
 
 void vp9_subtract_block(int rows, int cols,
                         int16_t *diff_ptr, int diff_stride,
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index fe5d114ba..af62ec394 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -419,10 +419,10 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
       cpi->static_mb_pct = 0;
 
     cpi->seg0_cnt = ncnt[0];
-    vp9_enable_segmentation((VP9_PTR) cpi);
+    vp9_enable_segmentation((VP9_PTR)cpi);
   } else {
     cpi->static_mb_pct = 0;
-    vp9_disable_segmentation((VP9_PTR) cpi);
+    vp9_disable_segmentation((VP9_PTR)cpi);
   }
 
   // Free localy allocated storage
diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c
index 7d9462f94..88cd1f41b 100644
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -41,8 +41,10 @@ void vp9_init_mode_costs(VP9_COMP *c) {
                   x->fc.uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
   vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
                   x->kf_uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
+#if !CONFIG_SB8X8
   vp9_cost_tokens(c->mb.i8x8_mode_costs,
                   x->fc.i8x8_mode_prob, vp9_i8x8_mode_tree);
+#endif
 
   for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i)
     vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 0af232eed..ceca60d70 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -280,8 +280,7 @@ static void setup_features(VP9_COMP *cpi) {
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
 
   // Set up default state for MB feature flags
-
-  xd->segmentation_enabled = 0;   // Default segmentation disabled
+  xd->segmentation_enabled = 0;
 
   xd->update_mb_segmentation_map = 0;
   xd->update_mb_segmentation_data = 0;
@@ -383,7 +382,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
     xd->update_mb_segmentation_map = 0;
     xd->update_mb_segmentation_data = 0;
 #if CONFIG_IMPLICIT_SEGMENTATION
-  xd->allow_implicit_segment_update = 0;
+    xd->allow_implicit_segment_update = 0;
 #endif
     cpi->static_mb_pct = 0;
 
@@ -399,7 +398,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
     xd->update_mb_segmentation_map = 0;
     xd->update_mb_segmentation_data = 0;
 #if CONFIG_IMPLICIT_SEGMENTATION
-  xd->allow_implicit_segment_update = 0;
+    xd->allow_implicit_segment_update = 0;
 #endif
     cpi->static_mb_pct = 0;
 
@@ -428,9 +427,9 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
       xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
 
     }
-  }
-  // All other frames if segmentation has been enabled
-  else if (xd->segmentation_enabled) {
+  } else if (xd->segmentation_enabled) {
+    // All other frames if segmentation has been enabled
+
     // First normal frame in a valid gf or alt ref group
     if (cpi->common.frames_since_golden == 0) {
       // Set up segment features for normal frames in an arf group
@@ -454,10 +453,10 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
           vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
           vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP);
         }
-      }
-      // Disable segmentation and clear down features if alt ref
-      // is not active for this group
-      else {
+      } else {
+        // Disable segmentation and clear down features if alt ref
+        // is not active for this group
+
         vp9_disable_segmentation((VP9_PTR)cpi);
 
         vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
@@ -467,12 +466,11 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
 
         vp9_clearall_segfeatures(xd);
       }
-    }
+    } else if (cpi->is_src_frame_alt_ref) {
+      // Special case where we are coding over the top of a previous
+      // alt ref frame.
+      // Segment coding disabled for compred testing
 
-    // Special case where we are coding over the top of a previous
-    // alt ref frame.
-    // Segment coding disabled for compred testing
-    else if (cpi->is_src_frame_alt_ref) {
       // Enable ref frame features for segment 0 as well
       vp9_enable_segfeature(xd, 0, SEG_LVL_REF_FRAME);
       vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
@@ -490,9 +488,9 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
       }
       // Enable data udpate
       xd->update_mb_segmentation_data = 1;
-    }
-    // All other frames.
-    else {
+    } else {
+      // All other frames.
+
       // No updates.. leave things as they are.
       xd->update_mb_segmentation_map = 0;
       xd->update_mb_segmentation_data = 0;
@@ -628,7 +626,9 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) {
   sf->thresh_mult[THR_D63_PRED ] += speed_multiplier * 1500;
 
   sf->thresh_mult[THR_B_PRED   ] += speed_multiplier * 2500;
+#if !CONFIG_SB8X8
   sf->thresh_mult[THR_I8X8_PRED] += speed_multiplier * 2500;
+#endif
 
   sf->thresh_mult[THR_NEWMV    ] += speed_multiplier * 1000;
   sf->thresh_mult[THR_NEWG     ] += speed_multiplier * 1000;
@@ -3326,9 +3326,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
     vp9_copy(cpi->common.fc.uv_mode_counts, cpi->y_uv_mode_count);
     vp9_copy(cpi->common.fc.bmode_counts, cpi->bmode_count);
+#if !CONFIG_SB8X8
     vp9_copy(cpi->common.fc.i8x8_mode_counts, cpi->i8x8_mode_count);
+#endif
     vp9_copy(cpi->common.fc.sub_mv_ref_counts, cpi->sub_mv_ref_count);
+#if !CONFIG_SB8X8
     vp9_copy(cpi->common.fc.mbsplit_counts, cpi->mbsplit_count);
+#endif
     vp9_copy(cpi->common.fc.partition_counts, cpi->partition_count);
 #if CONFIG_COMP_INTERINTRA_PRED
     vp9_copy(cpi->common.fc.interintra_counts, cpi->interintra_count);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index aeaf1bda3..541127e51 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -48,9 +48,9 @@
 #define KEY_FRAME_CONTEXT 5
 
 #if CONFIG_COMP_INTERINTRA_PRED
-#define MAX_MODES 54
+#define MAX_MODES (54 - CONFIG_SB8X8)  /* no I8X8_PRED entry under SB8X8 */
 #else
-#define MAX_MODES 42
+#define MAX_MODES (42 - CONFIG_SB8X8)  /* no I8X8_PRED entry under SB8X8 */
 #endif
 
 #define MIN_THRESHMULT  32
@@ -72,7 +72,9 @@ typedef struct {
   // Stats
   int y_modes[VP9_YMODES];
   int uv_modes[VP9_UV_MODES];
+#if !CONFIG_SB8X8
   int i8x8_modes[VP9_I8X8_MODES];
+#endif
   int b_modes[B_MODE_COUNT];
   int inter_y_modes[MB_MODE_COUNT];
   int inter_uv_modes[VP9_UV_MODES];
@@ -100,9 +102,13 @@ typedef struct {
   vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
   vp9_prob uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
   vp9_prob bmode_prob[VP9_NKF_BINTRAMODES - 1];
+#if !CONFIG_SB8X8
   vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
+#endif
   vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
+#if !CONFIG_SB8X8
   vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
+#endif
   vp9_prob partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
 
   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
@@ -207,7 +213,9 @@ typedef enum {
   THR_SPLITA,
 
   THR_B_PRED,
+#if !CONFIG_SB8X8
   THR_I8X8_PRED,
+#endif
 
   THR_COMP_ZEROLG,
   THR_COMP_NEARESTLG,
@@ -273,10 +281,17 @@ typedef struct {
 } SPEED_FEATURES;
 
 enum BlockSize {
+#if CONFIG_SB8X8
+  BLOCK_4X4,
+  BLOCK_8X8,
+  BLOCK_8X16,
+  BLOCK_16X8,
+#else
   BLOCK_16X8 = PARTITIONING_16X8,
   BLOCK_8X16 = PARTITIONING_8X16,
   BLOCK_8X8 = PARTITIONING_8X8,
   BLOCK_4X4 = PARTITIONING_4X4,
+#endif
   BLOCK_16X16,
   BLOCK_MAX_SEGMENTS,
   BLOCK_32X32 = BLOCK_MAX_SEGMENTS,
@@ -451,9 +466,13 @@ typedef struct VP9_COMP {
   int sb_ymode_count [VP9_I32X32_MODES];
   int ymode_count[VP9_YMODES];        /* intra MB type cts this frame */
   int bmode_count[VP9_NKF_BINTRAMODES];
+#if !CONFIG_SB8X8
   int i8x8_mode_count[VP9_I8X8_MODES];
+#endif
   int sub_mv_ref_count[SUBMVREF_COUNT][VP9_SUBMVREFS];
+#if !CONFIG_SB8X8
   int mbsplit_count[VP9_NUMMBSPLITS];
+#endif
   int y_uv_mode_count[VP9_YMODES][VP9_UV_MODES];
   unsigned int partition_count[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 #if CONFIG_COMP_INTERINTRA_PRED
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 6c8474c0e..fe8ba4b64 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -223,9 +223,9 @@ void vp9_quantize_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
 }
 
 void vp9_quantize_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2;
-  const int bhl = b_height_log2(bsize) - 2;
-  const int uoff = 16 << (bhl + bwl);
+  const int bwl = b_width_log2(bsize) - 1;
+  const int bhl = b_height_log2(bsize) - 1;
+  const int uoff = 4 << (bhl + bwl);
   int i;
 
   for (i = uoff; i < ((uoff * 3) >> 1); i += 4)
@@ -233,9 +233,9 @@ void vp9_quantize_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
 }
 
 void vp9_quantize_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2;
-  const int bhl = b_height_log2(bsize) - 2;
-  const int uoff = 16 << (bhl + bwl);
+  const int bwl = b_width_log2(bsize);
+  const int bhl = b_height_log2(bsize);
+  const int uoff = 1 << (bhl + bwl);
   int i;
 
   for (i = uoff; i < ((uoff * 3) >> 1); i++)
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 47252253d..42d339dfb 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -138,9 +138,13 @@ void vp9_save_coding_context(VP9_COMP *cpi) {
   vp9_copy(cc->sb_ymode_prob, cm->fc.sb_ymode_prob);
   vp9_copy(cc->bmode_prob, cm->fc.bmode_prob);
   vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
+#if !CONFIG_SB8X8
   vp9_copy(cc->i8x8_mode_prob, cm->fc.i8x8_mode_prob);
+#endif
   vp9_copy(cc->sub_mv_ref_prob, cm->fc.sub_mv_ref_prob);
+#if !CONFIG_SB8X8
   vp9_copy(cc->mbsplit_prob, cm->fc.mbsplit_prob);
+#endif
   vp9_copy(cc->partition_prob, cm->fc.partition_prob);
 
   // Stats
@@ -198,10 +202,14 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
   vp9_copy(cm->fc.ymode_prob, cc->ymode_prob);
   vp9_copy(cm->fc.sb_ymode_prob, cc->sb_ymode_prob);
   vp9_copy(cm->fc.bmode_prob, cc->bmode_prob);
+#if !CONFIG_SB8X8
   vp9_copy(cm->fc.i8x8_mode_prob, cc->i8x8_mode_prob);
+#endif
   vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
   vp9_copy(cm->fc.sub_mv_ref_prob, cc->sub_mv_ref_prob);
+#if !CONFIG_SB8X8
   vp9_copy(cm->fc.mbsplit_prob, cc->mbsplit_prob);
+#endif
   vp9_copy(cm->fc.partition_prob, cc->partition_prob);
 
   // Stats
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 0e85a0c71..da78be14a 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -105,7 +105,9 @@ const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   {SPLITMV,   ALTREF_FRAME, NONE},
 
   {I4X4_PRED,    INTRA_FRAME,  NONE},
+#if !CONFIG_SB8X8
   {I8X8_PRED, INTRA_FRAME,  NONE},
+#endif
 
   /* compound prediction modes */
   {ZEROMV,    LAST_FRAME,   GOLDEN_FRAME},
@@ -563,17 +565,19 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
         rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
         rd[TX_32X32][1] < rd[TX_4X4][1]))) {
     mbmi->txfm_size = TX_32X32;
-  } else if ( cm->txfm_mode == ALLOW_16X16 ||
-             (max_txfm_size == TX_16X16 && cm->txfm_mode == ALLOW_32X32) ||
-             (cm->txfm_mode == TX_MODE_SELECT &&
-              rd[TX_16X16][1] < rd[TX_8X8][1] &&
-              rd[TX_16X16][1] < rd[TX_4X4][1])) {
+  } else if (max_txfm_size >= TX_16X16 &&
+             (cm->txfm_mode == ALLOW_16X16 ||
+              cm->txfm_mode == ALLOW_32X32 ||
+              (cm->txfm_mode == TX_MODE_SELECT &&
+               rd[TX_16X16][1] < rd[TX_8X8][1] &&
+               rd[TX_16X16][1] < rd[TX_4X4][1]))) {
     mbmi->txfm_size = TX_16X16;
   } else if (cm->txfm_mode == ALLOW_8X8 ||
+             cm->txfm_mode == ALLOW_16X16 ||
+             cm->txfm_mode == ALLOW_32X32 ||
            (cm->txfm_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
     mbmi->txfm_size = TX_8X8;
   } else {
-    assert(cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT);
     mbmi->txfm_size = TX_4X4;
   }
 
@@ -583,13 +587,14 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
 
   txfm_cache[ONLY_4X4] = rd[TX_4X4][0];
   txfm_cache[ALLOW_8X8] = rd[TX_8X8][0];
-  txfm_cache[ALLOW_16X16] = rd[TX_16X16][0];
-  txfm_cache[ALLOW_32X32] = rd[max_txfm_size][0];
+  txfm_cache[ALLOW_16X16] = rd[MIN(max_txfm_size, TX_16X16)][0];
+  txfm_cache[ALLOW_32X32] = rd[MIN(max_txfm_size, TX_32X32)][0];
   if (max_txfm_size == TX_32X32 &&
       rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
       rd[TX_32X32][1] < rd[TX_4X4][1])
     txfm_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
-  else if (rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
+  else if (max_txfm_size >= TX_16X16 &&
+           rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
     txfm_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
   else
     txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
@@ -794,12 +799,18 @@ static void super_block_yrd(VP9_COMP *cpi,
   if (bs >= BLOCK_SIZE_SB32X32)
     super_block_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
                           bs);
-  super_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], bs);
+  if (bs >= BLOCK_SIZE_MB16X16)
+    super_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
+                          bs);
   super_block_yrd_8x8(cm, x,   &r[TX_8X8][0],   &d[TX_8X8],   &s[TX_8X8],   bs);
   super_block_yrd_4x4(cm, x,   &r[TX_4X4][0],   &d[TX_4X4],   &s[TX_4X4],   bs);
 
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
-                           TX_32X32 - (bs < BLOCK_SIZE_SB32X32));
+                           TX_32X32 - (bs < BLOCK_SIZE_SB32X32)
+#if CONFIG_SB8X8
+                           - (bs < BLOCK_SIZE_MB16X16)
+#endif
+                           );
 }
 
 static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
@@ -816,17 +827,41 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   VP9_COMMON *const cm = &cpi->common;
   const int src_stride = x->plane[0].src.stride;
   uint8_t* const src =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
+      raster_block_offset_uint8(xd,
+#if CONFIG_SB8X8
+                                BLOCK_SIZE_SB8X8,
+#else
+                                BLOCK_SIZE_MB16X16,
+#endif
+                                0, ib,
                                 x->plane[0].src.buf, src_stride);
   int16_t* const src_diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
+      raster_block_offset_int16(xd,
+#if CONFIG_SB8X8
+                                BLOCK_SIZE_SB8X8,
+#else
+                                BLOCK_SIZE_MB16X16,
+#endif
+                                0, ib,
                                 x->plane[0].src_diff);
   int16_t* const diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
+      raster_block_offset_int16(xd,
+#if CONFIG_SB8X8
+                                BLOCK_SIZE_SB8X8,
+#else
+                                BLOCK_SIZE_MB16X16,
+#endif
+                                0, ib,
                                 xd->plane[0].diff);
   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
   uint8_t* const dst =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
+      raster_block_offset_uint8(xd,
+#if CONFIG_SB8X8
+                                BLOCK_SIZE_SB8X8,
+#else
+                                BLOCK_SIZE_MB16X16,
+#endif
+                                0, ib,
                                 xd->plane[0].dst.buf, xd->plane[0].dst.stride);
   ENTROPY_CONTEXT ta = *a, tempa = *a;
   ENTROPY_CONTEXT tl = *l, templ = *l;
@@ -839,7 +874,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
    * */
   DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16);
 
-  assert(ib < 16);
+  assert(ib < (16 >> (2 * CONFIG_SB8X8)));
 #if CONFIG_NEWBINTRAMODES
   xd->mode_info_context->bmi[ib].as_mode.context =
     vp9_find_bpred_context(xd, ib, dst, xd->plane[0].dst.stride);
@@ -868,17 +903,17 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
 #endif
 
     vp9_intra4x4_predict(xd, ib, mode, dst, xd->plane[0].dst.stride);
-    vp9_subtract_block(4, 4, src_diff, 16,
+    vp9_subtract_block(4, 4, src_diff, 16 >> CONFIG_SB8X8,
                        src, src_stride,
                        dst, xd->plane[0].dst.stride);
 
     xd->mode_info_context->bmi[ib].as_mode.first = mode;
     tx_type = get_tx_type_4x4(xd, ib);
     if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(src_diff, coeff, 16, tx_type);
+      vp9_short_fht4x4(src_diff, coeff, 16 >> CONFIG_SB8X8, tx_type);
       x->quantize_b_4x4(x, ib, tx_type, 16);
     } else {
-      x->fwd_txm4x4(src_diff, coeff, 32);
+      x->fwd_txm4x4(src_diff, coeff, 32 >> CONFIG_SB8X8);
       x->quantize_b_4x4(x, ib, tx_type, 16);
     }
 
@@ -911,9 +946,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
 
   // inverse transform
   if (best_tx_type != DCT_DCT)
-    vp9_short_iht4x4(best_dqcoeff, diff, 16, best_tx_type);
+    vp9_short_iht4x4(best_dqcoeff, diff, 16 >> CONFIG_SB8X8, best_tx_type);
   else
-    xd->inv_txm4x4(best_dqcoeff, diff, 32);
+    xd->inv_txm4x4(best_dqcoeff, diff, 32 >> CONFIG_SB8X8);
 
   vp9_intra4x4_predict(xd, ib, *best_mode,
                        dst, xd->plane[0].dst.stride);
@@ -932,7 +967,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
   int distortion = 0;
   int tot_rate_y = 0;
   int64_t total_rd = 0;
-  ENTROPY_CONTEXT t_above[4], t_left[4];
+  ENTROPY_CONTEXT t_above[4 >> CONFIG_SB8X8], t_left[4 >> CONFIG_SB8X8];
   int *bmode_costs;
 
   vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
@@ -941,15 +976,21 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
   xd->mode_info_context->mbmi.mode = I4X4_PRED;
   bmode_costs = mb->inter_bmode_costs;
 
-  for (i = 0; i < 16; i++) {
-    const int x_idx = i & 3, y_idx = i >> 2;
+  for (i = 0; i < (16 >> (2 * CONFIG_SB8X8)); i++) {
+    const int x_idx = i & (3 >> CONFIG_SB8X8), y_idx = i >> (2 >> CONFIG_SB8X8);
     MODE_INFO *const mic = xd->mode_info_context;
     const int mis = xd->mode_info_stride;
     B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
     int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
 #if CONFIG_NEWBINTRAMODES
     uint8_t* const dst =
-        raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, i,
+        raster_block_offset_uint8(xd,
+#if CONFIG_SB8X8
+                                  BLOCK_SIZE_SB8X8,
+#else
+                                  BLOCK_SIZE_MB16X16,
+#endif
+                                  0, i,
                                   xd->plane[0].dst.buf,
                                   xd->plane[0].dst.stride);
 #endif
@@ -1046,6 +1087,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
   return best_rd;
 }
 
+#if !CONFIG_SB8X8
 static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
                                      B_PREDICTION_MODE *best_mode,
                                      int *mode_costs,
@@ -1283,6 +1325,7 @@ static int64_t rd_pick_intra8x8mby_modes_and_txsz(VP9_COMP *cpi, MACROBLOCK *x,
 
   return tmp_rd;
 }
+#endif  // !CONFIG_SB8X8
 
 static int rd_cost_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
                             BLOCK_SIZE_TYPE bsize) {
@@ -1457,10 +1500,9 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
     super_block_uvrd_32x32(cm, x, rate, distortion, skippable, bsize);
   } else if (mbmi->txfm_size >= TX_16X16 && bsize >= BLOCK_SIZE_SB32X32) {
     super_block_uvrd_16x16(cm, x, rate, distortion, skippable, bsize);
-  } else if (mbmi->txfm_size >= TX_8X8) {
+  } else if (mbmi->txfm_size >= TX_8X8 && bsize >= BLOCK_SIZE_MB16X16) {
     super_block_uvrd_8x8(cm, x, rate, distortion, skippable, bsize);
   } else {
-    assert(mbmi->txfm_size == TX_4X4);
     super_block_uvrd_4x4(cm, x, rate, distortion, skippable, bsize);
   }
 }
@@ -1524,6 +1566,514 @@ void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
   x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int;
 }
 
+#if CONFIG_SB8X8
+static int labels2mode(MACROBLOCK *x,
+                       int const *labelings, int which_label,
+                       B_PREDICTION_MODE this_mode,
+                       int_mv *this_mv, int_mv *this_second_mv,
+                       int_mv seg_mvs[MAX_REF_FRAMES - 1],
+                       int_mv *best_ref_mv,
+                       int_mv *second_best_ref_mv,
+                       int *mvjcost, int *mvcost[2], VP9_COMP *cpi) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mic = xd->mode_info_context;
+  MB_MODE_INFO * mbmi = &mic->mbmi;
+  const int mis = xd->mode_info_stride;
+  int i, cost = 0, thismvcost = 0;
+  /* For each of the four 4x4 blocks labelled which_label, resolve the mode
+   * and mv(s), store them in mic->bmi[] and x->partition_info->bmi[], and
+   * return the mode cost plus, for NEW4X4, the mv bit cost. Neighbours in
+   * the same 2x2 grid are read from mic->bmi[]; others via *_block_mv(). */
+  for (i = 0; i < 4; ++i) {
+    const int row = i >> 1, col = i & 1;  // position in the 2x2 grid
+    B_PREDICTION_MODE m;
+
+    if (labelings[i] != which_label)
+      continue;
+
+    if (col  &&  labelings[i] == labelings[i - 1])
+      m = LEFT4X4;  // same label as the block to the left
+    else if (row  &&  labelings[i] == labelings[i - 2])
+      m = ABOVE4X4;  // same label as the block above
+    else {
+      // the only time we should do costing for new motion vector or mode
+      // is when we are on a new label  (jbb May 08, 2007)
+      switch (m = this_mode) {
+        case NEW4X4 :
+          if (mbmi->second_ref_frame > 0) {
+            this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
+            this_second_mv->as_int =
+            seg_mvs[mbmi->second_ref_frame - 1].as_int;
+          }
+
+          thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
+                                        102, xd->allow_high_precision_mv);
+          if (mbmi->second_ref_frame > 0) {
+            thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
+                                          mvjcost, mvcost, 102,
+                                          xd->allow_high_precision_mv);
+          }
+          break;
+        case LEFT4X4:
+          this_mv->as_int = col ? mic->bmi[i - 1].as_mv[0].as_int :
+          left_block_mv(xd, mic, i);
+          if (mbmi->second_ref_frame > 0)
+            this_second_mv->as_int = col ? mic->bmi[i - 1].as_mv[1].as_int :
+            left_block_second_mv(xd, mic, i);
+          break;
+        case ABOVE4X4:
+          this_mv->as_int = row ? mic->bmi[i - 2].as_mv[0].as_int :
+          above_block_mv(mic, i, mis);
+          if (mbmi->second_ref_frame > 0)
+            this_second_mv->as_int = row ? mic->bmi[i - 2].as_mv[1].as_int :
+            above_block_second_mv(mic, i, mis);
+          break;
+        case ZERO4X4:
+          this_mv->as_int = 0;
+          if (mbmi->second_ref_frame > 0)
+            this_second_mv->as_int = 0;
+          break;
+        default:
+          break;
+      }
+
+      if (m == ABOVE4X4) {  // replace above with left if same
+        int_mv left_mv, left_second_mv;
+
+        left_second_mv.as_int = 0;
+        left_mv.as_int = col ? mic->bmi[i - 1].as_mv[0].as_int :
+        left_block_mv(xd, mic, i);
+        if (mbmi->second_ref_frame > 0)
+          left_second_mv.as_int = col ? mic->bmi[i - 1].as_mv[1].as_int :
+          left_block_second_mv(xd, mic, i);
+
+        if (left_mv.as_int == this_mv->as_int &&
+            (mbmi->second_ref_frame <= 0 ||
+             left_second_mv.as_int == this_second_mv->as_int))
+          m = LEFT4X4;
+      }
+
+#if CONFIG_NEWBINTRAMODES
+      cost = x->inter_bmode_costs[m == B_CONTEXT_PRED ?
+                                  m - CONTEXT_PRED_REPLACEMENTS : m];
+#else
+      cost = x->inter_bmode_costs[m];
+#endif
+    }
+
+    mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
+    if (mbmi->second_ref_frame > 0)
+      mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
+
+    x->partition_info->bmi[i].mode = m;
+    x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
+    if (mbmi->second_ref_frame > 0)
+      x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
+  }
+
+  cost += thismvcost;  // thismvcost stays 0 unless NEW4X4 was costed above
+  return cost;
+}
+
+static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
+                                       MACROBLOCK *x,
+                                       int const *labels,
+                                       int which_label,
+                                       int *labelyrate,
+                                       int *distortion,
+                                       ENTROPY_CONTEXT *ta,
+                                       ENTROPY_CONTEXT *tl) {
+  int i;
+  MACROBLOCKD *xd = &x->e_mbd;
+  // Predict, transform, quantize and cost each 4x4 block of which_label.
+  *labelyrate = 0;
+  *distortion = 0;
+  for (i = 0; i < 4; i++) {
+    if (labels[i] == which_label) {
+      const int src_stride = x->plane[0].src.stride;
+      uint8_t* const src =
+          raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                    x->plane[0].src.buf, src_stride);
+      int16_t* const src_diff =
+          raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                    x->plane[0].src_diff);
+      int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, i, 16);
+      uint8_t* const pre =
+          raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                    xd->plane[0].pre[0].buf,
+                                    xd->plane[0].pre[0].stride);
+      uint8_t* const dst =
+          raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                    xd->plane[0].dst.buf,
+                                    xd->plane[0].dst.stride);
+      int thisdistortion;
+
+      vp9_build_inter_predictor(pre,
+                                xd->plane[0].pre[0].stride,
+                                dst,
+                                xd->plane[0].dst.stride,
+                                &xd->mode_info_context->bmi[i].as_mv[0],
+                                &xd->scale_factor[0],
+                                4, 4, 0 /* no avg */, &xd->subpix);
+
+      // TODO(debargha): Make this work properly with the
+      // implicit-compoundinter-weight experiment when implicit
+      // weighting for splitmv modes is turned on.
+      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
+        uint8_t* const second_pre =
+            raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                      xd->plane[0].pre[1].buf,
+                                      xd->plane[0].pre[1].stride);
+        vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride,
+                                  dst, xd->plane[0].dst.stride,
+                                  &xd->mode_info_context->bmi[i].as_mv[1],
+                                  &xd->scale_factor[1], 4, 4, 1,
+                                  &xd->subpix);
+      }
+      // Residual -> forward 4x4 transform -> quantize, then add error + rate.
+      vp9_subtract_block(4, 4, src_diff, 8,
+                         src, src_stride,
+                         dst, xd->plane[0].dst.stride);
+      x->fwd_txm4x4(src_diff, coeff, 16);
+      x->quantize_b_4x4(x, i, DCT_DCT, 16);
+      thisdistortion = vp9_block_error(coeff,
+                                       BLOCK_OFFSET(xd->plane[0].dqcoeff,
+                                                    i, 16), 16);
+      *distortion += thisdistortion;
+      *labelyrate += cost_coeffs(cm, x, i, PLANE_TYPE_Y_WITH_DC,
+                                 ta + (i & 1),  // indexed by block column
+                                 tl + (i >> 1), TX_4X4, 16);
+    }
+  }
+  *distortion >>= 2;  // summed block error is scaled down by 4 before RDCOST
+  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
+}
+
+typedef struct {  // search state passed to rd_check_segment_txsize() as bsi
+  int_mv *ref_mv, *second_ref_mv;  // ref mvs for mv search and labels2mode()
+  int_mv mvp;  // start point (after >> 3) of the NEW4X4 full-pixel search
+
+  int64_t segment_rd;  // label loop continues only while its rd is below this
+  int r;
+  int d;
+  int segment_yrate;
+  B_PREDICTION_MODE modes[4];  // presumably per-4x4 results; writers not shown
+  int_mv mvs[4], second_mvs[4];
+  int eobs[4];
+
+  int mvthresh;  // label_mv_thresh is computed as mvthresh / label_count
+  int *mdcounts;
+} BEST_SEG_INFO;
+#endif  // CONFIG_SB8X8
+
+static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
+  // Returns 1 when the vector, shifted right by 3, is outside x's mv limits.
+  const int row = mv->as_mv.row >> 3;
+  const int col = mv->as_mv.col >> 3;
+
+  return row < x->mv_row_min || row > x->mv_row_max ||
+         col < x->mv_col_min || col > x->mv_col_max;
+}
+
+#if CONFIG_SB8X8
+static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
+                                    BEST_SEG_INFO *bsi,
+                                    int_mv seg_mvs[4][MAX_REF_FRAMES - 1]) {
+  int i, j;
+  static const int labels[4] = { 0, 1, 2, 3 };
+  int br = 0, bd = 0;
+  B_PREDICTION_MODE this_mode;
+  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+  const int label_count = 4;
+  int64_t this_segment_rd = 0, other_segment_rd;
+  int label_mv_thresh;
+  int rate = 0;
+  int sbr = 0, sbd = 0;
+  int segmentyrate = 0;
+  int best_eobs[4] = { 0 };
+
+  vp9_variance_fn_ptr_t *v_fn_ptr;
+
+  ENTROPY_CONTEXT t_above[2], t_left[2];
+  ENTROPY_CONTEXT t_above_b[2], t_left_b[2];
+
+  vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
+  vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
+
+  v_fn_ptr = &cpi->fn_ptr[BLOCK_4X4];
+
+  // The leading multiplier scales this threshold: 64 would make it so
+  // large that mvs are very rarely checked on segments, while the 1
+  // used here keeps the mv thresh roughly equal to what it is for
+  // macroblocks.
+  label_mv_thresh = 1 * bsi->mvthresh / label_count;
+
+  // Segmentation method overheads
+  rate += vp9_cost_mv_ref(cpi, SPLITMV,
+                          mbmi->mb_mode_context[mbmi->ref_frame]);
+  this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
+  br += rate;
+  other_segment_rd = this_segment_rd;
+
+  for (i = 0; i < label_count && this_segment_rd < bsi->segment_rd; i++) {
+    int_mv mode_mv[B_MODE_COUNT], second_mode_mv[B_MODE_COUNT];
+    int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
+    B_PREDICTION_MODE mode_selected = ZERO4X4;
+    int bestlabelyrate = 0;
+
+    // search for the best motion vector on this segment
+    for (this_mode = LEFT4X4; this_mode <= NEW4X4; this_mode ++) {
+      int64_t this_rd;
+      int distortion;
+      int labelyrate;
+      ENTROPY_CONTEXT t_above_s[2], t_left_s[2];
+
+      vpx_memcpy(t_above_s, t_above, sizeof(t_above_s));
+      vpx_memcpy(t_left_s, t_left, sizeof(t_left_s));
+
+      // motion search for newmv (single predictor case only)
+      if (mbmi->second_ref_frame <= 0 && this_mode == NEW4X4) {
+        int sseshift, n;
+        int step_param = 0;
+        int further_steps;
+        int thissme, bestsme = INT_MAX;
+        const struct buf_2d orig_src = x->plane[0].src;
+        const struct buf_2d orig_pre = x->e_mbd.plane[0].pre[0];
+
+        /* Is the best so far sufficiently good that we can't justify doing
+         * a new motion search? */
+        if (best_label_rd < label_mv_thresh)
+          break;
+
+        if (cpi->compressor_speed) {
+          // use previous block's result as next block's MV predictor.
+          if (i > 0) {
+            bsi->mvp.as_int =
+            x->e_mbd.mode_info_context->bmi[i - 1].as_mv[0].as_int;
+            if (i == 2)
+              bsi->mvp.as_int =
+              x->e_mbd.mode_info_context->bmi[i - 2].as_mv[0].as_int;
+            step_param = 2;
+          }
+        }
+
+        further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+
+        {
+          int sadpb = x->sadperbit4;
+          int_mv mvp_full;
+
+          mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
+          mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
+
+          // find first label
+          n = i;
+
+          // adjust src pointer for this segment
+          x->plane[0].src.buf =
+          raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
+                                    x->plane[0].src.buf,
+                                    x->plane[0].src.stride);
+          assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0xf) == 0);
+          x->e_mbd.plane[0].pre[0].buf =
+          raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
+                                    x->e_mbd.plane[0].pre[0].buf,
+                                    x->e_mbd.plane[0].pre[0].stride);
+
+          bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
+                                           sadpb, further_steps, 0, v_fn_ptr,
+                                           bsi->ref_mv, &mode_mv[NEW4X4]);
+
+          sseshift = 0;
+
+          // Should we do a full search (best quality only)
+          if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
+            /* Check if mvp_full is within the range. */
+            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
+                     x->mv_row_min, x->mv_row_max);
+
+            thissme = cpi->full_search_sad(x, &mvp_full,
+                                           sadpb, 16, v_fn_ptr,
+                                           x->nmvjointcost, x->mvcost,
+                                           bsi->ref_mv,
+                                           n);
+
+            if (thissme < bestsme) {
+              bestsme = thissme;
+              mode_mv[NEW4X4].as_int =
+              x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int;
+            } else {
+              /* The full search result is actually worse so re-instate the
+               * previous best vector */
+              x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int =
+              mode_mv[NEW4X4].as_int;
+            }
+          }
+        }
+
+        if (bestsme < INT_MAX) {
+          int distortion;
+          unsigned int sse;
+          cpi->find_fractional_mv_step(x, &mode_mv[NEW4X4],
+                                       bsi->ref_mv, x->errorperbit, v_fn_ptr,
+                                       x->nmvjointcost, x->mvcost,
+                                       &distortion, &sse);
+
+          // save motion search result for use in compound prediction
+          seg_mvs[i][mbmi->ref_frame - 1].as_int = mode_mv[NEW4X4].as_int;
+        }
+
+        // restore src pointers
+        x->plane[0].src = orig_src;
+        x->e_mbd.plane[0].pre[0] = orig_pre;
+      } else if (mbmi->second_ref_frame > 0 && this_mode == NEW4X4) {
+        /* NEW4X4 */
+        /* motion search not completed? Then skip newmv for this block with
+         * comppred */
+        if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
+            seg_mvs[i][mbmi->ref_frame        - 1].as_int == INVALID_MV) {
+          continue;
+        }
+      }
+
+      rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
+                         &second_mode_mv[this_mode], seg_mvs[i],
+                         bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
+                         x->mvcost, cpi);
+
+      // Trap vectors that reach beyond the UMV borders
+      if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
+          ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+          ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
+          ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
+        continue;
+      }
+      if (mbmi->second_ref_frame > 0 &&
+          mv_check_bounds(x, &second_mode_mv[this_mode]))
+        continue;
+
+      this_rd = encode_inter_mb_segment(&cpi->common,
+                                        x, labels, i, &labelyrate,
+                                        &distortion, t_above_s, t_left_s);
+      this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
+      rate += labelyrate;
+
+      if (this_rd < best_label_rd) {
+        sbr = rate;
+        sbd = distortion;
+        bestlabelyrate = labelyrate;
+        mode_selected = this_mode;
+        best_label_rd = this_rd;
+        for (j = 0; j < 4; j++)
+          if (labels[j] == i)
+            best_eobs[j] = x->e_mbd.plane[0].eobs[j];
+
+        vpx_memcpy(t_above_b, t_above_s, sizeof(t_above_s));
+        vpx_memcpy(t_left_b, t_left_s, sizeof(t_left_s));
+      }
+    } /*for each 4x4 mode*/
+
+    vpx_memcpy(t_above, t_above_b, sizeof(t_above));
+    vpx_memcpy(t_left, t_left_b, sizeof(t_left));
+
+    labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
+                &second_mode_mv[mode_selected], seg_mvs[i],
+                bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
+                x->mvcost, cpi);
+
+    br += sbr;
+    bd += sbd;
+    segmentyrate += bestlabelyrate;
+    this_segment_rd += best_label_rd;
+    other_segment_rd += best_other_rd;
+  } /* for each label */
+
+  if (this_segment_rd < bsi->segment_rd) {
+    bsi->r = br;
+    bsi->d = bd;
+    bsi->segment_yrate = segmentyrate;
+    bsi->segment_rd = this_segment_rd;
+
+    // store everything needed to come back to this!!
+    for (i = 0; i < 4; i++) {
+      bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
+      if (mbmi->second_ref_frame > 0)
+        bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
+      bsi->modes[i] = x->partition_info->bmi[i].mode;
+      bsi->eobs[i] = best_eobs[i];
+    }
+  }
+}
+
+static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x,
+                             BEST_SEG_INFO *bsi,
+                             int_mv seg_mvs[4][MAX_REF_FRAMES - 1]) {
+  rd_check_segment_txsize(cpi, x, bsi, seg_mvs);  // forwards all args as-is
+}
+
+static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
+                                       int_mv *best_ref_mv,
+                                       int_mv *second_best_ref_mv,
+                                       int64_t best_rd,
+                                       int *mdcounts,
+                                       int *returntotrate,
+                                       int *returnyrate,
+                                       int *returndistortion,
+                                       int *skippable, int mvthresh,
+                                       int_mv seg_mvs[4][MAX_REF_FRAMES - 1]) {
+  int i;
+  BEST_SEG_INFO bsi;  // search state: seeded here, filled by rd_check_segment
+  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+
+  vpx_memset(&bsi, 0, sizeof(bsi));
+
+  bsi.segment_rd = best_rd;  // a result is stored only if its rd beats this
+  bsi.ref_mv = best_ref_mv;
+  bsi.second_ref_mv = second_best_ref_mv;
+  bsi.mvp.as_int = best_ref_mv->as_int;
+  bsi.mvthresh = mvthresh;
+  bsi.mdcounts = mdcounts;
+
+  for (i = 0; i < 4; i++)
+    bsi.modes[i] = ZERO4X4;  // default until the search stores a result
+
+  rd_check_segment(cpi, x, &bsi, seg_mvs);
+
+  /* copy the winning per-block MVs and eobs back into the mode info */
+  for (i = 0; i < 4; i++) {
+    x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int = bsi.mvs[i].as_int;
+    if (mbmi->second_ref_frame > 0)
+      x->e_mbd.mode_info_context->bmi[i].as_mv[1].as_int =
+      bsi.second_mvs[i].as_int;
+    x->e_mbd.plane[0].eobs[i] = bsi.eobs[i];
+  }
+
+  /* mirror the winning modes and MVs into partition_info (4 blocks) */
+  x->partition_info->count = 4;
+
+  for (i = 0; i < x->partition_info->count; i++) {
+    x->partition_info->bmi[i].mode = bsi.modes[i];
+    x->partition_info->bmi[i].mv.as_mv = bsi.mvs[i].as_mv;
+    if (mbmi->second_ref_frame > 0)
+      x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[i].as_mv;
+  }
+  /*
+   * used to set mbmi->mv.as_int (block 3 was already written by the loop)
+   */
+  x->partition_info->bmi[3].mv.as_int = bsi.mvs[3].as_int;
+  if (mbmi->second_ref_frame > 0)
+    x->partition_info->bmi[3].second_mv.as_int = bsi.second_mvs[3].as_int;
+
+  *returntotrate = bsi.r;
+  *returndistortion = bsi.d;
+  *returnyrate = bsi.segment_yrate;
+  *skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_SB8X8);
+
+  return (int)(bsi.segment_rd);  // NOTE(review): best_rd is int64_t; may narrow
+}
+
+#else  // !CONFIG_SB8X8
+
 static int labels2mode(
   MACROBLOCK *x,
   int const *labelings, int which_label,
@@ -1887,15 +2437,6 @@ typedef struct {
 
 } BEST_SEG_INFO;
 
-static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
-  int r = 0;
-  r |= (mv->as_mv.row >> 3) < x->mv_row_min;
-  r |= (mv->as_mv.row >> 3) > x->mv_row_max;
-  r |= (mv->as_mv.col >> 3) < x->mv_col_min;
-  r |= (mv->as_mv.col >> 3) > x->mv_col_max;
-  return r;
-}
-
 static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                     BEST_SEG_INFO *bsi,
                                     SPLITMV_PARTITIONING_TYPE segmentation,
@@ -2428,6 +2969,7 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
 
   return (int)(bsi.segment_rd);
 }
+#endif  // !CONFIG_SB8X8
 
 static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
                     uint8_t *ref_y_buffer, int ref_y_stride,
@@ -2474,6 +3016,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
   x->mv_best_ref_index[ref_frame] = best_index;
 }
 
+#if !CONFIG_SB8X8
 static void set_i8x8_block_modes(MACROBLOCK *x, int modes[4]) {
   int i;
   MACROBLOCKD *xd = &x->e_mbd;
@@ -2487,6 +3030,7 @@ static void set_i8x8_block_modes(MACROBLOCK *x, int modes[4]) {
     //       modes[0], modes[1], modes[2], modes[3]);
   }
 }
+#endif
 
 extern void vp9_calc_ref_probs(int *count, vp9_prob *probs);
 static void estimate_curframe_refprobs(VP9_COMP *cpi, vp9_prob mod_refprobs[3], int pred_ref) {
@@ -3193,6 +3737,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   return this_rd;  // if 0, this will be re-calculated by caller
 }
 
+#if !CONFIG_SB8X8
 static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                int mi_row, int mi_col,
                                int *returnrate, int *returndistortion,
@@ -4053,6 +4598,7 @@ end:
                                       mbmi->second_ref_frame][0],
                        best_pred_diff, best_txfm_diff);
 }
+#endif  // !CONFIG_SB8X8
 
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                int *returnrate, int *returndist,
@@ -4065,14 +4611,30 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int dist_y = 0, dist_uv;
   int y_skip = 0, uv_skip;
   int64_t txfm_cache[NB_TXFM_MODES], err;
+#if CONFIG_SB8X8
+  MB_PREDICTION_MODE mode;
+  TX_SIZE txfm_size;
+  int rate4x4_y, rate4x4_y_tokenonly, dist4x4_y;
+  int64_t err4x4 = INT64_MAX;
+#endif
   int i;
 
   ctx->skip = 0;
   xd->mode_info_context->mbmi.mode = DC_PRED;
   err = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
                                &dist_y, &y_skip, bsize, txfm_cache);
+#if CONFIG_SB8X8
+  mode = xd->mode_info_context->mbmi.mode;
+  txfm_size = xd->mode_info_context->mbmi.txfm_size;
+#endif
   rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
                           &dist_uv, &uv_skip, bsize);
+#if CONFIG_SB8X8
+  if (bsize == BLOCK_SIZE_SB8X8)
+    err4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4_y,
+                                       &rate4x4_y_tokenonly,
+                                       &dist4x4_y, err);
+#endif
 
   if (y_skip && uv_skip) {
     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
@@ -4080,18 +4642,39 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     *returndist = dist_y + (dist_uv >> 2);
     memset(ctx->txfm_rd_diff, 0,
            sizeof(x->sb32_context[xd->sb_index].txfm_rd_diff));
+#if CONFIG_SB8X8
+    xd->mode_info_context->mbmi.mode = mode;
+    xd->mode_info_context->mbmi.txfm_size = txfm_size;
+  } else if (bsize == BLOCK_SIZE_SB8X8 && err4x4 < err) {
+    *returnrate = rate4x4_y + rate_uv +
+        vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+    *returndist = dist4x4_y + (dist_uv >> 2);
+    for (i = 0; i < NB_TXFM_MODES; i++) {
+      ctx->txfm_rd_diff[i] = MIN(err4x4, err - txfm_cache[i]);
+    }
+    xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+#endif
   } else {
     *returnrate = rate_y + rate_uv +
         vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
     *returndist = dist_y + (dist_uv >> 2);
     for (i = 0; i < NB_TXFM_MODES; i++) {
+#if CONFIG_SB8X8
+      ctx->txfm_rd_diff[i] = MIN(err4x4, err - txfm_cache[i]);
+#else
       ctx->txfm_rd_diff[i] = err - txfm_cache[i];
+#endif
     }
+#if CONFIG_SB8X8
+    xd->mode_info_context->mbmi.txfm_size = txfm_size;
+    xd->mode_info_context->mbmi.mode = mode;
+#endif
   }
 
   vpx_memcpy(&ctx->mic, xd->mode_info_context, sizeof(MODE_INFO));
 }
 
+#if !CONFIG_SB8X8
 void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
                             int *returnrate, int *returndist) {
   VP9_COMMON *cm = &cpi->common;
@@ -4218,6 +4801,7 @@ void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
   *returnrate = rate;
   *returndist = dist;
 }
+#endif
 
 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   int mi_row, int mi_col,
@@ -4272,7 +4856,20 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   unsigned int mode_mask = 0;
   int64_t mode_distortions[MB_MODE_COUNT] = {-1};
   int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
+  int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
+                                             cpi->common.y_dc_delta_q);
+#if CONFIG_SB8X8
+  int_mv seg_mvs[4][MAX_REF_FRAMES - 1];
+#endif
 
+#if CONFIG_SB8X8
+  for (i = 0; i < 4; i++) {
+    int j;
+
+    for (j = 0; j < MAX_REF_FRAMES - 1; j++)
+      seg_mvs[i][j].as_int = INVALID_MV;
+  }
+#endif
   // Everywhere the flag is set the error is much higher than its neighbors.
   ctx->frames_with_high_error = 0;
   ctx->modes_with_high_error = 0;
@@ -4400,9 +4997,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     // if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
     //  continue;
 
-    if (this_mode == I8X8_PRED ||
+    if (
+#if CONFIG_SB8X8
+        bsize != BLOCK_SIZE_SB8X8 &&
+        (this_mode == I4X4_PRED || this_mode == SPLITMV)
+#else
         this_mode == I4X4_PRED ||
-        this_mode == SPLITMV)
+        this_mode == I8X8_PRED ||
+        this_mode == SPLITMV
+#endif
+        )
       continue;
     //  if (vp9_mode_order[mode_index].second_ref_frame == INTRA_FRAME)
     //  continue;
@@ -4465,6 +5069,27 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       }
     }
 
+#if CONFIG_SB8X8
+    if (this_mode == I4X4_PRED) {
+      int rate;
+
+      // Note the rate value returned here includes the cost of coding
+      // the I4X4_PRED mode : x->mbmode_cost[xd->frame_type][I4X4_PRED];
+      assert(bsize == BLOCK_SIZE_SB8X8);
+      mbmi->txfm_size = TX_4X4;
+      rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
+                                &distortion_y, INT64_MAX);
+      rate2 += rate;
+      rate2 += intra_cost_penalty;
+      distortion2 += distortion_y;
+
+      rate2 += rate_uv_intra[TX_4X4];
+      rate_uv = rate_uv_intra[TX_4X4];
+      distortion2 += dist_uv[TX_4X4];
+      distortion_uv = dist_uv[TX_4X4];
+      mbmi->uv_mode = mode_uv[TX_4X4];
+    } else
+#endif
     if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
       vp9_build_intra_predictors_sby_s(xd, bsize);
@@ -4483,7 +5108,139 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       mbmi->uv_mode = mode_uv[uv_tx];
 
       rate2 = rate_y + x->mbmode_cost[cm->frame_type][mbmi->mode] + rate_uv;
+      if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
+        rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
+#if CONFIG_SB8X8
+    } else if (this_mode == SPLITMV) {
+      const int is_comp_pred = mbmi->second_ref_frame > 0;
+      int rate, distortion;
+      int64_t this_rd_thresh;
+      int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
+      int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
+      int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0;
+      int switchable_filter_index;
+      int_mv *second_ref = is_comp_pred ?
+          &mbmi->ref_mvs[mbmi->second_ref_frame][0] : NULL;
+      union b_mode_info tmp_best_bmodes[16];
+      MB_MODE_INFO tmp_best_mbmode;
+      PARTITION_INFO tmp_best_partition;
+      int pred_exists = 0;
+      int uv_skippable;
+
+      this_rd_thresh = (mbmi->ref_frame == LAST_FRAME) ?
+          cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA];
+      this_rd_thresh = (mbmi->ref_frame == GOLDEN_FRAME) ?
+          cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
+      xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+
+      for (switchable_filter_index = 0;
+           switchable_filter_index < VP9_SWITCHABLE_FILTERS;
+           ++switchable_filter_index) {
+        int newbest;
+        mbmi->interp_filter =
+        vp9_switchable_interp[switchable_filter_index];
+        vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+
+        tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
+                                             &mbmi->ref_mvs[mbmi->ref_frame][0],
+                                             second_ref, INT64_MAX, mdcounts,
+                                             &rate, &rate_y, &distortion,
+                                             &skippable,
+                                             (int)this_rd_thresh, seg_mvs);
+        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
+          [vp9_get_pred_context(&cpi->common, xd,
+                                PRED_SWITCHABLE_INTERP)]
+          [vp9_switchable_interp_map[mbmi->interp_filter]];
+          tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0);
+        }
+        newbest = (tmp_rd < tmp_best_rd);
+        if (newbest) {
+          tmp_best_filter = mbmi->interp_filter;
+          tmp_best_rd = tmp_rd;
+        }
+        if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
+            (mbmi->interp_filter == cm->mcomp_filter_type &&
+             cm->mcomp_filter_type != SWITCHABLE)) {
+              tmp_best_rdu = tmp_rd;
+              tmp_best_rate = rate;
+              tmp_best_ratey = rate_y;
+              tmp_best_distortion = distortion;
+              tmp_best_skippable = skippable;
+              vpx_memcpy(&tmp_best_mbmode, mbmi, sizeof(MB_MODE_INFO));
+              vpx_memcpy(&tmp_best_partition, x->partition_info,
+                         sizeof(PARTITION_INFO));
+              for (i = 0; i < 4; i++) {
+                tmp_best_bmodes[i] = xd->mode_info_context->bmi[i];
+              }
+              pred_exists = 1;
+            }
+      }  // switchable_filter_index loop
+
+      mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
+                             tmp_best_filter : cm->mcomp_filter_type);
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+      if (!pred_exists) {
+        // Handles the special case when a filter that is not in the
+        // switchable list (bilinear, 6-tap) is indicated at the frame level
+        tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
+                                             &mbmi->ref_mvs[mbmi->ref_frame][0],
+                                             second_ref, INT64_MAX, mdcounts,
+                                             &rate, &rate_y, &distortion,
+                                             &skippable,
+                                             (int)this_rd_thresh, seg_mvs);
+      } else {
+        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
+              [vp9_get_pred_context(&cpi->common, xd,
+                                    PRED_SWITCHABLE_INTERP)]
+              [vp9_switchable_interp_map[mbmi->interp_filter]];
+          tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
+        }
+        tmp_rd = tmp_best_rdu;
+        rate = tmp_best_rate;
+        rate_y = tmp_best_ratey;
+        distortion = tmp_best_distortion;
+        skippable = tmp_best_skippable;
+        vpx_memcpy(mbmi, &tmp_best_mbmode, sizeof(MB_MODE_INFO));
+        vpx_memcpy(x->partition_info, &tmp_best_partition,
+                   sizeof(PARTITION_INFO));
+        for (i = 0; i < 4; i++) {
+          xd->mode_info_context->bmi[i] = tmp_best_bmodes[i];
+        }
+      }
+
+      rate2 += rate;
+      distortion2 += distortion;
+
+      if (cpi->common.mcomp_filter_type == SWITCHABLE)
+        rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
+            [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
+            [vp9_switchable_interp_map[mbmi->interp_filter]];
+
+      // If even the 'Y' rd value of split is higher than best so far
+      // then don't bother looking at UV (TODO: early-out not implemented)
+      vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
+                                      bsize);
+      vp9_subtract_sbuv(x, bsize);
+      super_block_uvrd_4x4(cm, x, &rate_uv, &distortion_uv,
+                           &uv_skippable, bsize);
+      rate2 += rate_uv;
+      distortion2 += distortion_uv;
+      skippable = skippable && uv_skippable;
+
+      if (!mode_excluded) {
+        if (is_comp_pred)
+          mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
+        else
+          mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
+      }
+
+      compmode_cost =
+          vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
+      mbmi->mode = this_mode;
+#endif
     } else {
       YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
       int fb;
@@ -4693,6 +5450,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     /* keep record of best txfm size */
+    if (bsize < BLOCK_SIZE_SB32X32) {
+      if (bsize < BLOCK_SIZE_MB16X16) {
+        if (this_mode == SPLITMV || this_mode == I4X4_PRED)
+          txfm_cache[ALLOW_8X8] = txfm_cache[ONLY_4X4];
+        txfm_cache[ALLOW_16X16] = txfm_cache[ALLOW_8X8];
+      }
+      txfm_cache[ALLOW_32X32] = txfm_cache[ALLOW_16X16];
+    }
     if (!mode_excluded && this_rd != INT64_MAX) {
       for (i = 0; i < NB_TXFM_MODES; i++) {
         int64_t adj_rd;
@@ -4769,13 +5534,27 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
     mbmi->mode = ZEROMV;
     mbmi->ref_frame = ALTREF_FRAME;
-    mbmi->second_ref_frame = INTRA_FRAME;
+    mbmi->second_ref_frame = NONE;
     mbmi->mv[0].as_int = 0;
     mbmi->uv_mode = DC_PRED;
     mbmi->mb_skip_coeff = 1;
+#if !CONFIG_SB8X8
     mbmi->partitioning = 0;
-    mbmi->txfm_size = cm->txfm_mode == TX_MODE_SELECT ?
-                      TX_32X32 : cm->txfm_mode;
+#endif
+    if (cm->txfm_mode == TX_MODE_SELECT) {
+      if (bsize >= BLOCK_SIZE_SB32X32)
+        mbmi->txfm_size = TX_32X32;
+#if CONFIG_SB8X8
+      else if (bsize >= BLOCK_SIZE_MB16X16)
+#else
+      else
+#endif
+        mbmi->txfm_size = TX_16X16;
+#if CONFIG_SB8X8
+      else
+        mbmi->txfm_size = TX_8X8;
+#endif
+    }
 
     vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
     vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
@@ -4815,6 +5594,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   return best_rd;
 }
 
+#if !CONFIG_SB8X8
 void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
                                     int mi_row, int mi_col,
                                     int *totalrate, int *totaldist) {
@@ -4824,10 +5604,9 @@ void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t intra_error = 0;
   unsigned char *segment_id = &mbmi->segment_id;
 
-  if (xd->segmentation_enabled)
-    x->encode_breakout = cpi->segment_encode_breakout[*segment_id];
-  else
-    x->encode_breakout = cpi->oxcf.encode_breakout;
+  x->encode_breakout = xd->segmentation_enabled ?
+                         cpi->segment_encode_breakout[*segment_id] :
+                         cpi->oxcf.encode_breakout;
 
   // if (cpi->sf.RD)
   // For now this codebase is limited to a single rd encode path
@@ -4852,3 +5631,4 @@ void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
   *totalrate = rate;
   *totaldist = distortion;
 }
+#endif
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index eef2a4fe9..6533a82e0 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -19,16 +19,20 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex);
 
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
 
+#if !CONFIG_SB8X8
 void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
                             int *r, int *d);
+#endif
 
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                int *r, int *d, BLOCK_SIZE_TYPE bsize,
                                PICK_MODE_CONTEXT *ctx);
 
+#if !CONFIG_SB8X8
 void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
                                     int mi_row, int mi_col,
                                     int *r, int *d);
+#endif
 
 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   int mi_row, int mi_col,
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index 7f792ae2b..e04980ce1 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -16,18 +16,15 @@
 #include "vp9/common/vp9_tile_common.h"
 
 void vp9_enable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+  VP9_COMP *cpi = (VP9_COMP *)ptr;
 
-  // Set the appropriate feature bit
   cpi->mb.e_mbd.segmentation_enabled = 1;
   cpi->mb.e_mbd.update_mb_segmentation_map = 1;
   cpi->mb.e_mbd.update_mb_segmentation_data = 1;
 }
 
 void vp9_disable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  // Clear the appropriate feature bit
+  VP9_COMP *cpi = (VP9_COMP *)ptr;
   cpi->mb.e_mbd.segmentation_enabled = 0;
 }
 
@@ -199,9 +196,17 @@ static void count_segs_sb(VP9_COMP *cpi, MODE_INFO *mi,
     assert(bwl < bsl && bhl < bsl);
     if (bsize == BLOCK_SIZE_SB64X64) {
       subsize = BLOCK_SIZE_SB32X32;
+#if CONFIG_SB8X8
+    } else if (bsize == BLOCK_SIZE_SB32X32) {
+      subsize = BLOCK_SIZE_MB16X16;
+    } else {
+      assert(bsize == BLOCK_SIZE_MB16X16);
+      subsize = BLOCK_SIZE_SB8X8;
+#else
     } else {
       assert(bsize == BLOCK_SIZE_SB32X32);
       subsize = BLOCK_SIZE_MB16X16;
+#endif
     }
 
     for (n = 0; n < 4; n++) {
@@ -238,10 +243,8 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
 
   // Set default state for the segment tree probabilities and the
   // temporal coding probabilities
-  vpx_memset(xd->mb_segment_tree_probs, 255,
-             sizeof(xd->mb_segment_tree_probs));
-  vpx_memset(cm->segment_pred_probs, 255,
-             sizeof(cm->segment_pred_probs));
+  vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
+  vpx_memset(cm->segment_pred_probs, 255, sizeof(cm->segment_pred_probs));
 
   vpx_memset(no_pred_segcounts, 0, sizeof(no_pred_segcounts));
   vpx_memset(t_unpred_seg_counts, 0, sizeof(t_unpred_seg_counts));
@@ -249,7 +252,6 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
 
   // First of all generate stats regarding how well the last segment map
   // predicts this one
-
   for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
     vp9_get_tile_col_offsets(cm, tile_col);
     mi_ptr = cm->mi + cm->cur_tile_mi_col_start;
@@ -279,27 +281,24 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
 
     // Add in the cost of the signalling for each prediction context
     for (i = 0; i < PREDICTION_PROBS; i++) {
-      t_nopred_prob[i] = get_binary_prob(temporal_predictor_count[i][0],
-                                         temporal_predictor_count[i][1]);
+      const int count0 = temporal_predictor_count[i][0];
+      const int count1 = temporal_predictor_count[i][1];
+
+      t_nopred_prob[i] = get_binary_prob(count0, count1);
 
       // Add in the predictor signaling cost
-      t_pred_cost += (temporal_predictor_count[i][0] *
-                      vp9_cost_zero(t_nopred_prob[i])) +
-                     (temporal_predictor_count[i][1] *
-                      vp9_cost_one(t_nopred_prob[i]));
+      t_pred_cost += count0 * vp9_cost_zero(t_nopred_prob[i]) +
+                     count1 * vp9_cost_one(t_nopred_prob[i]);
     }
   }
 
   // Now choose which coding method to use.
   if (t_pred_cost < no_pred_cost) {
     cm->temporal_update = 1;
-    vpx_memcpy(xd->mb_segment_tree_probs,
-               t_pred_tree, sizeof(t_pred_tree));
-    vpx_memcpy(&cm->segment_pred_probs,
-               t_nopred_prob, sizeof(t_nopred_prob));
+    vpx_memcpy(xd->mb_segment_tree_probs, t_pred_tree, sizeof(t_pred_tree));
+    vpx_memcpy(cm->segment_pred_probs, t_nopred_prob, sizeof(t_nopred_prob));
   } else {
     cm->temporal_update = 0;
-    vpx_memcpy(xd->mb_segment_tree_probs,
-               no_pred_tree, sizeof(no_pred_tree));
+    vpx_memcpy(xd->mb_segment_tree_probs, no_pred_tree, sizeof(no_pred_tree));
   }
 }
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 3c3367071..9756e6e54 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -376,7 +376,11 @@ int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
 int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
   int result = 1;
   struct is_skippable_args args = {xd, &result};
-  foreach_transformed_block_in_plane(xd, bsize, 0, 0, is_skippable, &args);
+  foreach_transformed_block_in_plane(xd, bsize, 0,
+#if !CONFIG_SB8X8
+                                     0,
+#endif
+                                     is_skippable, &args);
   return result;
 }