path: root/vp8/encoder
Diffstat (limited to 'vp8/encoder')
-rw-r--r--  vp8/encoder/bitstream.c                  111
-rw-r--r--  vp8/encoder/block.h                       12
-rw-r--r--  vp8/encoder/encodeframe.c                733
-rw-r--r--  vp8/encoder/encodemb.c                    31
-rw-r--r--  vp8/encoder/generic/csystemdependent.c    30
-rw-r--r--  vp8/encoder/mcomp.c                        2
-rw-r--r--  vp8/encoder/onyx_if.c                     55
-rw-r--r--  vp8/encoder/onyx_int.h                    13
-rw-r--r--  vp8/encoder/rdopt.c                     1068
-rw-r--r--  vp8/encoder/rdopt.h                        3
-rw-r--r--  vp8/encoder/sad_c.c                       79
-rw-r--r--  vp8/encoder/segmentation.c                79
-rw-r--r--  vp8/encoder/variance.h                    78
-rw-r--r--  vp8/encoder/variance_c.c                  85
14 files changed, 2158 insertions, 221 deletions
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 76aed7e2d..90bc8e987 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -288,6 +288,12 @@ static void kfwrite_ymode(vp8_writer *bc, int m, const vp8_prob *p) {
vp8_write_token(bc, vp8_kf_ymode_tree, p, vp8_kf_ymode_encodings + m);
}
+#if CONFIG_SUPERBLOCKS
+static void sb_kfwrite_ymode(vp8_writer *bc, int m, const vp8_prob *p) {
+ vp8_write_token(bc, vp8_uv_mode_tree, p, vp8_sb_kf_ymode_encodings + m);
+}
+#endif
+
static void write_i8x8_mode(vp8_writer *bc, int m, const vp8_prob *p) {
vp8_write_token(bc, vp8_i8x8_mode_tree, p, vp8_i8x8_mode_encodings + m);
}
@@ -533,6 +539,16 @@ static void write_mv_ref
vp8_mv_ref_encoding_array - NEARESTMV + m);
}
+#if CONFIG_SUPERBLOCKS
+static void write_sb_mv_ref(vp8_writer *w, MB_PREDICTION_MODE m, const vp8_prob *p) {
+#if CONFIG_DEBUG
+ assert(NEARESTMV <= m && m < SPLITMV);
+#endif
+ vp8_write_token(w, vp8_sb_mv_ref_tree, p,
+ vp8_sb_mv_ref_encoding_array - NEARESTMV + m);
+}
+#endif
+
static void write_sub_mv_ref
(
vp8_writer *w, B_PREDICTION_MODE m, const vp8_prob *p
@@ -810,6 +826,9 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
// Process the 4 MBs in the order:
// top-left, top-right, bottom-left, bottom-right
+#if CONFIG_SUPERBLOCKS
+ vp8_write(w, m->mbmi.encoded_as_sb, pc->sb_coded);
+#endif
for (i = 0; i < 4; i++) {
MB_MODE_INFO *mi;
MV_REFERENCE_FRAME rf;
@@ -872,7 +891,15 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
if (pc->mb_no_coeff_skip &&
(!segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
(get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
- vp8_encode_bool(w, mi->mb_skip_coeff,
+ int skip_coeff = mi->mb_skip_coeff;
+#if CONFIG_SUPERBLOCKS
+ if (mi->encoded_as_sb) {
+ skip_coeff &= m[1].mbmi.mb_skip_coeff;
+ skip_coeff &= m[mis].mbmi.mb_skip_coeff;
+ skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
+ }
+#endif
+ vp8_encode_bool(w, skip_coeff,
get_pred_prob(pc, xd, PRED_MBSKIP));
}
@@ -884,6 +911,8 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
active_section = 6;
#endif
+ // TODO(rbultje) write using SB tree structure
+
if (!segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
write_ymode(w, mode, pc->fc.ymode_prob);
}
@@ -949,7 +978,14 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
// Is the segment coding of mode enabled
if (!segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
- write_mv_ref(w, mode, mv_ref_p);
+#if CONFIG_SUPERBLOCKS
+ if (mi->encoded_as_sb) {
+ write_sb_mv_ref(w, mode, mv_ref_p);
+ } else
+#endif
+ {
+ write_mv_ref(w, mode, mv_ref_p);
+ }
vp8_accum_mv_refs(&cpi->common, mode, ct);
}
@@ -1085,6 +1121,17 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
}
}
+#if CONFIG_SUPERBLOCKS
+ if (m->mbmi.encoded_as_sb) {
+ assert(!i);
+ mb_col += 2;
+ m += 2;
+ cpi->mb.partition_info += 2;
+ prev_m += 2;
+ break;
+ }
+#endif
+
// Next MB
mb_row += dy;
mb_col += dx;
@@ -1151,6 +1198,9 @@ static void write_kfmodes(VP8_COMP *cpi) {
mb_col = 0;
for (col = 0; col < c->mb_cols; col += 2) {
+#if CONFIG_SUPERBLOCKS
+ vp8_write(bc, m->mbmi.encoded_as_sb, c->sb_coded);
+#endif
// Process the 4 MBs in the order:
// top-left, top-right, bottom-left, bottom-right
for (i = 0; i < 4; i++) {
@@ -1181,11 +1231,27 @@ static void write_kfmodes(VP8_COMP *cpi) {
if (c->mb_no_coeff_skip &&
(!segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
(get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
- vp8_encode_bool(bc, m->mbmi.mb_skip_coeff,
+ int skip_coeff = m->mbmi.mb_skip_coeff;
+#if CONFIG_SUPERBLOCKS
+ if (m->mbmi.encoded_as_sb) {
+ skip_coeff &= m[1].mbmi.mb_skip_coeff;
+ skip_coeff &= m[mis].mbmi.mb_skip_coeff;
+ skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
+ }
+#endif
+ vp8_encode_bool(bc, skip_coeff,
get_pred_prob(c, xd, PRED_MBSKIP));
}
- kfwrite_ymode(bc, ym,
- c->kf_ymode_prob[c->kf_ymode_probs_index]);
+#if CONFIG_SUPERBLOCKS
+ if (m->mbmi.encoded_as_sb) {
+ sb_kfwrite_ymode(bc, ym,
+ c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
+ } else
+#endif
+ {
+ kfwrite_ymode(bc, ym,
+ c->kf_ymode_prob[c->kf_ymode_probs_index]);
+ }
if (ym == B_PRED) {
const int mis = c->mode_info_stride;
@@ -1233,6 +1299,14 @@ static void write_kfmodes(VP8_COMP *cpi) {
} else
write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
+#if CONFIG_SUPERBLOCKS
+ if (m->mbmi.encoded_as_sb) {
+ assert(!i);
+ mb_col += 2;
+ m += 2;
+ break;
+ }
+#endif
// Next MB
mb_row += dy;
mb_col += dx;
@@ -1793,7 +1867,7 @@ static void put_delta_q(vp8_writer *bc, int delta_q) {
} else
vp8_write_bit(bc, 0);
}
-extern const unsigned int kf_y_mode_cts[8][VP8_YMODES];
+
static void decide_kf_ymode_entropy(VP8_COMP *cpi) {
int mode_cost[MB_MODE_COUNT];
@@ -1808,6 +1882,13 @@ static void decide_kf_ymode_entropy(VP8_COMP *cpi) {
for (j = 0; j < VP8_YMODES; j++) {
cost += mode_cost[j] * cpi->ymode_count[j];
}
+#if CONFIG_SUPERBLOCKS
+ vp8_cost_tokens(mode_cost, cpi->common.sb_kf_ymode_prob[i],
+ vp8_sb_ymode_tree);
+ for (j = 0; j < VP8_I32X32_MODES; j++) {
+ cost += mode_cost[j] * cpi->sb_ymode_count[j];
+ }
+#endif
if (cost < bestcost) {
bestindex = i;
bestcost = cost;
@@ -1906,11 +1987,6 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
// Select the coding strategy (temporal or spatial)
choose_segmap_coding_method(cpi);
- // Take a copy of the segment map if it changed for
- // future comparison
- vpx_memcpy(pc->last_frame_seg_map,
- cpi->segmentation_map, pc->MBs);
-
// Write out the chosen coding method.
vp8_write_bit(bc, (pc->temporal_update) ? 1 : 0);
}
@@ -2048,6 +2124,19 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
}
}
+#if CONFIG_SUPERBLOCKS
+ {
+ /* sb mode probability */
+ int sb_coded = 256 - (cpi->sb_count << 8) / (((pc->mb_rows + 1) >> 1) * ((pc->mb_cols + 1) >> 1));
+ if (sb_coded <= 0)
+ sb_coded = 1;
+ else if (sb_coded >= 256)
+ sb_coded = 255;
+ pc->sb_coded = sb_coded;
+ vp8_write_literal(bc, pc->sb_coded, 8);
+ }
+#endif
+
vp8_write_bit(bc, pc->txfm_mode);
// Encode the loop filter level and type
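
A minimal standalone sketch of the sb_coded probability derivation performed in the hunk above, using the same Q8 convention (the helper name and its isolation from VP8_COMMON are illustrative, not part of the patch):

    static int compute_sb_coded_prob(int sb_count, int mb_rows, int mb_cols) {
      /* Number of 32x32 superblock positions: ceil(mb_rows/2) * ceil(mb_cols/2). */
      int sb_total = ((mb_rows + 1) >> 1) * ((mb_cols + 1) >> 1);
      /* Probability (out of 256) of the 0 branch, i.e. that an SB position is
       * coded as four independent MBs rather than one 32x32 superblock. */
      int sb_coded = 256 - (sb_count << 8) / sb_total;
      /* Clamp to the valid bool-coder probability range [1, 255]. */
      if (sb_coded <= 0)
        sb_coded = 1;
      else if (sb_coded >= 256)
        sb_coded = 255;
      return sb_coded;
    }
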
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index dfc1d743e..d73af4faa 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -82,7 +82,9 @@ typedef struct {
int best_mode_index;
int rddiv;
int rdmult;
-
+ int hybrid_pred_diff;
+ int comp_pred_diff;
+ int single_pred_diff;
} PICK_MODE_CONTEXT;
typedef struct {
@@ -139,12 +141,6 @@ typedef struct {
int mv_col_max;
int mv_row_min;
int mv_row_max;
-#if CONFIG_SUPERBLOCKS
- int mv_col_min_sb;
- int mv_col_max_sb;
- int mv_row_min_sb;
- int mv_row_max_sb;
-#endif
int skip;
@@ -163,8 +159,6 @@ typedef struct {
int optimize;
int q_index;
- int encode_as_sb;
-
// Structure to hold context for each of the 4 MBs within a SB:
// when encoded as 4 independent MBs:
PICK_MODE_CONTEXT mb_context[4];
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index e58c852a7..4472497e0 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -57,16 +57,24 @@ extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
MB_ROW_COMP *mbr_ei,
int mb_row,
int count);
-extern int vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x,
+ int recon_yoffset, int recon_uvoffset,
+ int *returnrate, int *returndistortion);
+extern void vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
int recon_yoffset,
- int recon_uvoffset);
+ int recon_uvoffset, int *r, int *d);
void vp8_build_block_offsets(MACROBLOCK *x);
void vp8_setup_block_ptrs(MACROBLOCK *x);
void vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
int recon_yoffset, int recon_uvoffset,
int output_enabled);
+void vp8cx_encode_inter_superblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+ int recon_yoffset, int recon_uvoffset, int mb_col, int mb_row);
void vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x,
TOKENEXTRA **t, int output_enabled);
+void vp8cx_encode_intra_super_block(VP8_COMP *cpi,
+ MACROBLOCK *x,
+ TOKENEXTRA **t, int mb_col);
static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x);
@@ -378,6 +386,13 @@ static void update_state(VP8_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
// Restore the coding context of the MB to that that was in place
// when the mode was picked for it
vpx_memcpy(xd->mode_info_context, mi, sizeof(MODE_INFO));
+#if CONFIG_SUPERBLOCKS
+ if (mi->mbmi.encoded_as_sb) {
+ vpx_memcpy(xd->mode_info_context + 1, mi, sizeof(MODE_INFO));
+ vpx_memcpy(xd->mode_info_context + cpi->common.mode_info_stride, mi, sizeof(MODE_INFO));
+ vpx_memcpy(xd->mode_info_context + cpi->common.mode_info_stride + 1, mi, sizeof(MODE_INFO));
+ }
+#endif
if (mb_mode == B_PRED) {
for (i = 0; i < 16; i++) {
@@ -448,6 +463,10 @@ static void update_state(VP8_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
cpi->prediction_error += ctx->distortion;
cpi->intra_error += ctx->intra_error;
+
+ cpi->rd_comp_pred_diff[0] += ctx->single_pred_diff;
+ cpi->rd_comp_pred_diff[1] += ctx->comp_pred_diff;
+ cpi->rd_comp_pred_diff[2] += ctx->hybrid_pred_diff;
}
}
@@ -458,7 +477,8 @@ static void pick_mb_modes(VP8_COMP *cpi,
MACROBLOCK *x,
MACROBLOCKD *xd,
TOKENEXTRA **tp,
- int *totalrate) {
+ int *totalrate,
+ int *totaldist) {
int i;
int map_index;
int recon_yoffset, recon_uvoffset;
@@ -477,7 +497,7 @@ static void pick_mb_modes(VP8_COMP *cpi,
/* Function should not modify L & A contexts; save and restore on exit */
vpx_memcpy(left_context,
- cpi->left_context,
+ cm->left_context,
sizeof(left_context));
vpx_memcpy(above_context,
initial_above_context_ptr,
@@ -525,9 +545,7 @@ static void pick_mb_modes(VP8_COMP *cpi,
// Restore the appropriate left context depending on which
// row in the SB the MB is situated
- vpx_memcpy(&cm->left_context,
- &cpi->left_context[i >> 1],
- sizeof(ENTROPY_CONTEXT_PLANES));
+ xd->left_context = cm->left_context + (i >> 1);
// Set up distance of MB to edge of frame in 1/8th pel units
xd->mb_to_top_edge = -((mb_row * 16) << 3);
@@ -568,9 +586,11 @@ static void pick_mb_modes(VP8_COMP *cpi,
// Is segmentation enabled
if (xd->segmentation_enabled) {
// Code to set segment id in xd->mbmi.segment_id
- if (cpi->segmentation_map[map_index] <= 3)
+ if (xd->update_mb_segmentation_map)
mbmi->segment_id = cpi->segmentation_map[map_index];
else
+ mbmi->segment_id = cm->last_frame_seg_map[map_index];
+ if (mbmi->segment_id > 3)
mbmi->segment_id = 0;
vp8cx_mb_init_quantizer(cpi, x);
@@ -583,22 +603,29 @@ static void pick_mb_modes(VP8_COMP *cpi,
/* force 4x4 transform for mode selection */
mbmi->txfm_size = TX_4X4; // TODO IS this right??
+#if CONFIG_SUPERBLOCKS
+ xd->mode_info_context->mbmi.encoded_as_sb = 0;
+#endif
+
cpi->update_context = 0; // TODO Do we need this now??
// Find best coding mode & reconstruct the MB so it is available
// as a predictor for MBs that follow in the SB
if (cm->frame_type == KEY_FRAME) {
- *totalrate += vp8_rd_pick_intra_mode(cpi, x);
-
- // Save the coding context
- vpx_memcpy(&x->mb_context[i].mic, xd->mode_info_context,
- sizeof(MODE_INFO));
+ int r, d;
+ vp8_rd_pick_intra_mode(cpi, x, &r, &d);
+ *totalrate += r;
+ *totaldist += d;
// Dummy encode, do not do the tokenization
vp8cx_encode_intra_macro_block(cpi, x, tp, 0);
// Note the encoder may have changed the segment_id
+
+ // Save the coding context
+ vpx_memcpy(&x->mb_context[i].mic, xd->mode_info_context,
+ sizeof(MODE_INFO));
} else {
- int seg_id;
+ int seg_id, r, d;
if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
!segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
@@ -612,9 +639,10 @@ static void pick_mb_modes(VP8_COMP *cpi,
cpi->seg0_progress = (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols + i) << 16) / cm->MBs;
}
- *totalrate += vp8cx_pick_mode_inter_macroblock(cpi, x,
- recon_yoffset,
- recon_uvoffset);
+ vp8cx_pick_mode_inter_macroblock(cpi, x, recon_yoffset,
+ recon_uvoffset, &r, &d);
+ *totalrate += r;
+ *totaldist += d;
// Dummy encode, do not do the tokenization
vp8cx_encode_inter_macroblock(cpi, x, tp,
@@ -639,11 +667,6 @@ static void pick_mb_modes(VP8_COMP *cpi,
}
}
- // Keep a copy of the updated left context
- vpx_memcpy(&cpi->left_context[i >> 1],
- &cm->left_context,
- sizeof(ENTROPY_CONTEXT_PLANES));
-
// Next MB
mb_row += dy;
mb_col += dx;
@@ -664,7 +687,7 @@ static void pick_mb_modes(VP8_COMP *cpi,
}
/* Restore L & A coding context to those in place on entry */
- vpx_memcpy(cpi->left_context,
+ vpx_memcpy(cm->left_context,
left_context,
sizeof(left_context));
vpx_memcpy(initial_above_context_ptr,
@@ -672,6 +695,156 @@ static void pick_mb_modes(VP8_COMP *cpi,
sizeof(above_context));
}
+#if CONFIG_SUPERBLOCKS
+static void pick_sb_modes (VP8_COMP *cpi,
+ VP8_COMMON *cm,
+ int mb_row,
+ int mb_col,
+ MACROBLOCK *x,
+ MACROBLOCKD *xd,
+ TOKENEXTRA **tp,
+ int *totalrate,
+ int *totaldist)
+{
+ int map_index;
+ int recon_yoffset, recon_uvoffset;
+ int ref_fb_idx = cm->lst_fb_idx;
+ int dst_fb_idx = cm->new_fb_idx;
+ int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+ int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+ ENTROPY_CONTEXT_PLANES left_context[2];
+ ENTROPY_CONTEXT_PLANES above_context[2];
+ ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
+ + mb_col;
+
+ /* Function should not modify L & A contexts; save and restore on exit */
+ vpx_memcpy (left_context,
+ cm->left_context,
+ sizeof(left_context));
+ vpx_memcpy (above_context,
+ initial_above_context_ptr,
+ sizeof(above_context));
+
+ map_index = (mb_row * cpi->common.mb_cols) + mb_col;
+ x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+ /* set above context pointer */
+ xd->above_context = cm->above_context + mb_col;
+
+ /* Restore the appropriate left context depending on which
+ * row in the SB the MB is situated */
+ xd->left_context = cm->left_context;
+
+ // Set up distance of MB to edge of frame in 1/8th pel units
+ xd->mb_to_top_edge = -((mb_row * 16) << 3);
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+ xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+ /* Set up limit values for MV components to prevent them from
+ * extending beyond the UMV borders assuming 16x16 block size */
+ x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+ x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+ x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+ (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+ x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+ (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+
+ xd->up_available = (mb_row != 0);
+ xd->left_available = (mb_col != 0);
+
+ recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
+
+ xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+ xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+ xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+#if 0 // FIXME
+ /* Copy current MB to a work buffer */
+ RECON_INVOKE(&xd->rtcd->recon, copy16x16)(x->src.y_buffer,
+ x->src.y_stride,
+ x->thismb, 16);
+#endif
+ x->rddiv = cpi->RDDIV;
+ x->rdmult = cpi->RDMULT;
+ if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+ vp8_activity_masking(cpi, x);
+ /* Is segmentation enabled */
+ if (xd->segmentation_enabled)
+ {
+ /* Code to set segment id in xd->mbmi.segment_id */
+ if (xd->update_mb_segmentation_map)
+ xd->mode_info_context->mbmi.segment_id =
+ cpi->segmentation_map[map_index] &&
+ cpi->segmentation_map[map_index + 1] &&
+ cpi->segmentation_map[map_index + cm->mb_cols] &&
+ cpi->segmentation_map[map_index + cm->mb_cols + 1];
+ else
+ xd->mode_info_context->mbmi.segment_id =
+ cm->last_frame_seg_map[map_index] &&
+ cm->last_frame_seg_map[map_index + 1] &&
+ cm->last_frame_seg_map[map_index + cm->mb_cols] &&
+ cm->last_frame_seg_map[map_index + cm->mb_cols + 1];
+ if (xd->mode_info_context->mbmi.segment_id > 3)
+ xd->mode_info_context->mbmi.segment_id = 0;
+
+ vp8cx_mb_init_quantizer(cpi, x);
+ }
+ else
+ /* Set to Segment 0 by default */
+ xd->mode_info_context->mbmi.segment_id = 0;
+
+ x->active_ptr = cpi->active_map + map_index;
+
+ cpi->update_context = 0; // TODO Do we need this now??
+
+ /* Find best coding mode & reconstruct the MB so it is available
+ * as a predictor for MBs that follow in the SB */
+ if (cm->frame_type == KEY_FRAME)
+ {
+ vp8_rd_pick_intra_mode_sb(cpi, x,
+ totalrate,
+ totaldist);
+
+ /* Save the coding context */
+ vpx_memcpy(&x->sb_context[0].mic, xd->mode_info_context,
+ sizeof(MODE_INFO));
+ }
+ else
+ {
+ if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
+ !segfeature_active( xd, 0, SEG_LVL_REF_FRAME ) &&
+ segfeature_active( xd, 1, SEG_LVL_REF_FRAME ) &&
+ check_segref(xd, 1, INTRA_FRAME) +
+ check_segref(xd, 1, LAST_FRAME) +
+ check_segref(xd, 1, GOLDEN_FRAME) +
+ check_segref(xd, 1, ALTREF_FRAME) == 1)
+ {
+ cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
+ }
+ else
+ {
+ cpi->seg0_progress =
+ (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols) << 16) / cm->MBs;
+ }
+
+ vp8_rd_pick_inter_mode_sb(cpi, x,
+ recon_yoffset,
+ recon_uvoffset,
+ totalrate,
+ totaldist);
+ }
+
+ /* Restore L & A coding context to those in place on entry */
+ vpx_memcpy (cm->left_context,
+ left_context,
+ sizeof(left_context));
+ vpx_memcpy (initial_above_context_ptr,
+ above_context,
+ sizeof(above_context));
+}
+#endif
+
static void encode_sb(VP8_COMP *cpi,
VP8_COMMON *cm,
int mbrow,
@@ -679,6 +852,7 @@ static void encode_sb(VP8_COMP *cpi,
MACROBLOCK *x,
MACROBLOCKD *xd,
TOKENEXTRA **tp) {
+ VP8_COMMON *pc = cm;
int i;
int map_index;
int mb_row, mb_col;
@@ -733,22 +907,19 @@ static void encode_sb(VP8_COMP *cpi,
// Restore MB state to that when it was picked
#if CONFIG_SUPERBLOCKS
- if (x->encode_as_sb)
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
update_state(cpi, x, &x->sb_context[i]);
- else
+ cpi->sb_count++;
+ } else
#endif
update_state(cpi, x, &x->mb_context[i]);
- // Copy in the appropriate left context
- vpx_memcpy(&cm->left_context,
- &cpi->left_context[i >> 1],
- sizeof(ENTROPY_CONTEXT_PLANES));
-
map_index = (mb_row * cpi->common.mb_cols) + mb_col;
x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
// reset above block coeffs
xd->above_context = cm->above_context + mb_col;
+ xd->left_context = cm->left_context + (i >> 1);
// Set up distance of MB to edge of the frame in 1/8th pel units
xd->mb_to_top_edge = -((mb_row * 16) << 3);
@@ -756,24 +927,28 @@ static void encode_sb(VP8_COMP *cpi,
xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
- // Set up limit values for MV components to prevent them from
- // extending beyond the UMV borders assuming 16x16 block size
- x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
- x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
- x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
- (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
- x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
- (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
-
#if CONFIG_SUPERBLOCKS
- // Set up limit values for MV components to prevent them from
- // extending beyond the UMV borders assuming 32x32 block size
- x->mv_row_min_sb = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
- x->mv_col_min_sb = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
- x->mv_row_max_sb = ((cm->mb_rows - mb_row) * 16 +
- (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
- x->mv_col_max_sb = ((cm->mb_cols - mb_col) * 16 +
- (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ // Set up limit values for MV components to prevent them from
+ // extending beyond the UMV borders assuming 32x32 block size
+ x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+ x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+ x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+ (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+ x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+ (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+ } else {
+#endif
+ // Set up limit values for MV components to prevent them from
+ // extending beyond the UMV borders assuming 16x16 block size
+ x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+ x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+ x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+ (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+ x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+ (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+#if CONFIG_SUPERBLOCKS
+ }
#endif
xd->up_available = (mb_row != 0);
@@ -796,24 +971,21 @@ static void encode_sb(VP8_COMP *cpi,
// Is segmentation enabled
if (xd->segmentation_enabled) {
- // Code to set segment id in xd->mbmi.segment_id
- if (cpi->segmentation_map[map_index] <= 3)
- mbmi->segment_id = cpi->segmentation_map[map_index];
- else
- mbmi->segment_id = 0;
-
vp8cx_mb_init_quantizer(cpi, x);
- } else
- // Set to Segment 0 by default
- mbmi->segment_id = 0;
+ }
x->active_ptr = cpi->active_map + map_index;
cpi->update_context = 0;
if (cm->frame_type == KEY_FRAME) {
- vp8cx_encode_intra_macro_block(cpi, x, tp, 1);
- // Note the encoder may have changed the segment_id
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb)
+ vp8cx_encode_intra_super_block(cpi, x, tp, mb_col);
+ else
+#endif
+ vp8cx_encode_intra_macro_block(cpi, x, tp, 1);
+ // Note the encoder may have changed the segment_id
#ifdef MODE_STATS
y_modes[mbmi->mode]++;
@@ -822,9 +994,25 @@ static void encode_sb(VP8_COMP *cpi,
unsigned char *segment_id;
int seg_ref_active;
- vp8cx_encode_inter_macroblock(cpi, x, tp,
- recon_yoffset, recon_uvoffset, 1);
- // Note the encoder may have changed the segment_id
+ if (xd->mode_info_context->mbmi.ref_frame) {
+ unsigned char pred_context;
+
+ pred_context = get_pred_context(cm, xd, PRED_COMP);
+
+ if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME)
+ cpi->single_pred_count[pred_context]++;
+ else
+ cpi->comp_pred_count[pred_context]++;
+ }
+
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb)
+ vp8cx_encode_inter_superblock(cpi, x, tp, recon_yoffset, recon_uvoffset, mb_col, mb_row);
+ else
+#endif
+ vp8cx_encode_inter_macroblock(cpi, x, tp,
+ recon_yoffset, recon_uvoffset, 1);
+ // Note the encoder may have changed the segment_id
#ifdef MODE_STATS
inter_y_modes[mbmi->mode]++;
@@ -864,10 +1052,20 @@ static void encode_sb(VP8_COMP *cpi,
// TODO Partitioning is broken!
cpi->tplist[mb_row].stop = *tp;
- // Copy back updated left context
- vpx_memcpy(&cpi->left_context[i >> 1],
- &cm->left_context,
- sizeof(ENTROPY_CONTEXT_PLANES));
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ x->src.y_buffer += 32;
+ x->src.u_buffer += 16;
+ x->src.v_buffer += 16;
+
+ x->gf_active_ptr += 2;
+ x->partition_info += 2;
+ xd->mode_info_context += 2;
+ xd->prev_mode_info_context += 2;
+
+ break;
+ }
+#endif
// Next MB
mb_row += dy;
@@ -911,14 +1109,13 @@ void encode_sb_row(VP8_COMP *cpi,
int mb_cols = cm->mb_cols;
// Initialize the left context for the new SB row
- vpx_memset(cpi->left_context, 0, sizeof(cpi->left_context));
- vpx_memset(&cm->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memset(cm->left_context, 0, sizeof(cm->left_context));
// Code each SB in the row
for (mb_col = 0; mb_col < mb_cols; mb_col += 2) {
- int mb_rate = 0;
+ int mb_rate = 0, mb_dist = 0;
#if CONFIG_SUPERBLOCKS
- int sb_rate = INT_MAX;
+ int sb_rate = INT_MAX, sb_dist;
#endif
#if CONFIG_DEBUG
@@ -930,8 +1127,14 @@ void encode_sb_row(VP8_COMP *cpi,
unsigned char *vb = x->src.v_buffer;
#endif
+#if CONFIG_SUPERBLOCKS
// Pick modes assuming the SB is coded as 4 independent MBs
- pick_mb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &mb_rate);
+ xd->mode_info_context->mbmi.encoded_as_sb = 0;
+#endif
+ pick_mb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &mb_rate, &mb_dist);
+#if CONFIG_SUPERBLOCKS
+ mb_rate += vp8_cost_bit(cm->sb_coded, 0);
+#endif
x->src.y_buffer -= 32;
x->src.u_buffer -= 16;
@@ -952,21 +1155,40 @@ void encode_sb_row(VP8_COMP *cpi,
#endif
#if CONFIG_SUPERBLOCKS
- // Pick a mode assuming that it applies all 4 of the MBs in the SB
- pick_sb_modes(cpi, cm, mb_row, mb_col, x, xd, &sb_rate);
+ if (!((( mb_cols & 1) && mb_col == mb_cols - 1) ||
+ ((cm->mb_rows & 1) && mb_row == cm->mb_rows - 1))) {
+ /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
+ xd->mode_info_context->mbmi.encoded_as_sb = 1;
+ pick_sb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &sb_rate, &sb_dist);
+ sb_rate += vp8_cost_bit(cm->sb_coded, 1);
+ }
- // Decide whether to encode as a SB or 4xMBs
- if (sb_rate < mb_rate) {
- x->encode_as_sb = 1;
+ /* Decide whether to encode as a SB or 4xMBs */
+ if (sb_rate < INT_MAX &&
+ RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) <
+ RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist)) {
+ xd->mode_info_context->mbmi.encoded_as_sb = 1;
+ xd->mode_info_context[1].mbmi.encoded_as_sb = 1;
+ xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 1;
+ xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 1;
*totalrate += sb_rate;
} else
#endif
{
- x->encode_as_sb = 0;
+#if CONFIG_SUPERBLOCKS
+ xd->mode_info_context->mbmi.encoded_as_sb = 0;
+ if (cm->mb_cols - 1 > mb_col)
+ xd->mode_info_context[1].mbmi.encoded_as_sb = 0;
+ if (cm->mb_rows - 1 > mb_row) {
+ xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 0;
+ if (cm->mb_cols - 1 > mb_col)
+ xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 0;
+ }
+#endif
*totalrate += mb_rate;
}
- // Encode SB using best computed mode(s)
+ /* Encode SB using best computed mode(s) */
encode_sb(cpi, cm, mb_row, mb_col, x, xd, tp);
#if CONFIG_DEBUG
@@ -1038,8 +1260,6 @@ void init_encode_frame_mb_context(VP8_COMP *cpi) {
xd->mode_info_context->mbmi.mode = DC_PRED;
xd->mode_info_context->mbmi.uv_mode = DC_PRED;
- xd->left_context = &cm->left_context;
-
vp8_zero(cpi->count_mb_ref_frame_usage)
vp8_zero(cpi->bmode_count)
vp8_zero(cpi->ymode_count)
@@ -1049,6 +1269,10 @@ void init_encode_frame_mb_context(VP8_COMP *cpi) {
vp8_zero(cpi->mbsplit_count)
vp8_zero(cpi->common.fc.mv_ref_ct)
vp8_zero(cpi->common.fc.mv_ref_ct_a)
+#if CONFIG_SUPERBLOCKS
+ vp8_zero(cpi->sb_ymode_count)
+ cpi->sb_count = 0;
+#endif
// vp8_zero(cpi->uv_mode_count)
x->mvc = cm->fc.mvc;
@@ -1380,7 +1604,12 @@ static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x) {
}
#endif
- ++cpi->ymode_count[m];
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ ++cpi->sb_ymode_count[m];
+ } else
+#endif
+ ++cpi->ymode_count[m];
if (m != I8X8_PRED)
++cpi->y_uv_mode_count[m][uvm];
else {
@@ -1418,6 +1647,160 @@ static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x) {
#endif
}
+#if CONFIG_SUPERBLOCKS
+static void update_sb_skip_coeff_state(VP8_COMP *cpi,
+ MACROBLOCK *x,
+ ENTROPY_CONTEXT_PLANES ta[4],
+ ENTROPY_CONTEXT_PLANES tl[4],
+ TOKENEXTRA *t[4],
+ TOKENEXTRA **tp,
+ int skip[4])
+{
+ TOKENEXTRA tokens[4][16 * 24];
+ int n_tokens[4], n;
+
+ // if there were no skips, we don't need to do anything
+ if (!skip[0] && !skip[1] && !skip[2] && !skip[3])
+ return;
+
+ // if we don't do coeff skipping for this frame, we don't
+ // need to do anything here
+ if (!cpi->common.mb_no_coeff_skip)
+ return;
+
+ // if all 4 MBs skipped coeff coding, nothing to be done
+ if (skip[0] && skip[1] && skip[2] && skip[3])
+ return;
+
+ // so the situation now is that we want to skip coeffs
+ // for some MBs, but not all, and we didn't code EOB
+ // coefficients for them. However, the skip flag for this
+ // SB will be 0 overall, so we need to insert EOBs in the
+ // middle of the token tree. Do so here.
+ n_tokens[0] = t[1] - t[0];
+ n_tokens[1] = t[2] - t[1];
+ n_tokens[2] = t[3] - t[2];
+ n_tokens[3] = *tp - t[3];
+ if (n_tokens[0])
+ memcpy(tokens[0], t[0], n_tokens[0] * sizeof(*t[0]));
+ if (n_tokens[1])
+ memcpy(tokens[1], t[1], n_tokens[1] * sizeof(*t[0]));
+ if (n_tokens[2])
+ memcpy(tokens[2], t[2], n_tokens[2] * sizeof(*t[0]));
+ if (n_tokens[3])
+ memcpy(tokens[3], t[3], n_tokens[3] * sizeof(*t[0]));
+
+ // reset pointer, stuff EOBs where necessary
+ *tp = t[0];
+ for (n = 0; n < 4; n++) {
+ TOKENEXTRA *tbak = *tp;
+ if (skip[n]) {
+ x->e_mbd.above_context = &ta[n];
+ x->e_mbd.left_context = &tl[n];
+ vp8_stuff_mb_8x8(cpi, &x->e_mbd, tp, 0);
+ } else {
+ if (n_tokens[n]) {
+ memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
+ }
+ (*tp) += n_tokens[n];
+ }
+ }
+}
+
+void vp8cx_encode_intra_super_block(VP8_COMP *cpi,
+ MACROBLOCK *x,
+ TOKENEXTRA **t,
+ int mb_col) {
+ const int output_enabled = 1;
+ int n;
+ MACROBLOCKD *xd = &x->e_mbd;
+ VP8_COMMON *cm = &cpi->common;
+ const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
+ const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+ const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+ int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+ int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+ const VP8_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
+ TOKENEXTRA *tp[4];
+ int skip[4];
+ MODE_INFO *mi = x->e_mbd.mode_info_context;
+ ENTROPY_CONTEXT_PLANES ta[4], tl[4];
+
+ if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) {
+ adjust_act_zbin(cpi, x);
+ vp8_update_zbin_extra(cpi, x);
+ }
+
+ /* test code: set transform size based on mode selection */
+ if (cpi->common.txfm_mode == ALLOW_8X8) {
+ x->e_mbd.mode_info_context->mbmi.txfm_size = TX_8X8;
+ x->e_mbd.mode_info_context[1].mbmi.txfm_size = TX_8X8;
+ x->e_mbd.mode_info_context[cm->mode_info_stride].mbmi.txfm_size = TX_8X8;
+ x->e_mbd.mode_info_context[cm->mode_info_stride+1].mbmi.txfm_size = TX_8X8;
+ cpi->t8x8_count++;
+ } else {
+ x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4;
+ cpi->t4x4_count++;
+ }
+
+ RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sby_s)(&x->e_mbd);
+ RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sbuv_s)(&x->e_mbd);
+
+ assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
+ for (n = 0; n < 4; n++)
+ {
+ int x_idx = n & 1, y_idx = n >> 1;
+
+ xd->above_context = cm->above_context + mb_col + (n & 1);
+ xd->left_context = cm->left_context + (n >> 1);
+
+ vp8_subtract_mby_s_c(x->src_diff,
+ src + x_idx * 16 + y_idx * 16 * src_y_stride,
+ src_y_stride,
+ dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+ dst_y_stride);
+ vp8_subtract_mbuv_s_c(x->src_diff,
+ usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+ vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+ src_uv_stride,
+ udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ dst_uv_stride);
+ vp8_transform_intra_mby_8x8(x);
+ vp8_transform_mbuv_8x8(x);
+ vp8_quantize_mby_8x8(x);
+ vp8_quantize_mbuv_8x8(x);
+ if (x->optimize) {
+ vp8_optimize_mby_8x8(x, rtcd);
+ vp8_optimize_mbuv_8x8(x, rtcd);
+ }
+ vp8_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+ vp8_inverse_transform_mbuv_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+ vp8_recon_mby_s_c(IF_RTCD(&rtcd->common->recon), &x->e_mbd,
+ dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
+ vp8_recon_mbuv_s_c(IF_RTCD(&rtcd->common->recon), &x->e_mbd,
+ udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
+
+ if (output_enabled) {
+ memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+ memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+ tp[n] = *t;
+ xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
+ vp8_tokenize_mb(cpi, &x->e_mbd, t, 0);
+ skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+ }
+ }
+
+ if (output_enabled) {
+ // Tokenize
+ xd->mode_info_context = mi;
+ sum_intra_stats(cpi, x);
+ update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+ }
+}
+#endif
+
void vp8cx_encode_intra_macro_block(VP8_COMP *cpi,
MACROBLOCK *x,
TOKENEXTRA **t,
@@ -1484,6 +1867,9 @@ void vp8cx_encode_inter_macroblock (VP8_COMP *cpi, MACROBLOCK *x,
unsigned char ref_pred_flag;
x->skip = 0;
+#if CONFIG_SUPERBLOCKS
+ assert(!xd->mode_info_context->mbmi.encoded_as_sb);
+#endif
#if CONFIG_SWITCHABLE_INTERP
vp8_setup_interp_filters(xd, mbmi->interp_filter, cm);
@@ -1648,3 +2034,190 @@ void vp8cx_encode_inter_macroblock (VP8_COMP *cpi, MACROBLOCK *x,
}
}
}
+
+#if CONFIG_SUPERBLOCKS
+void vp8cx_encode_inter_superblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+ int recon_yoffset, int recon_uvoffset, int mb_col, int mb_row) {
+ const int output_enabled = 1;
+ VP8_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
+ const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+ const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+ int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+ int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+ const VP8_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
+ int mis = xd->mode_info_stride;
+ unsigned int segment_id = xd->mode_info_context->mbmi.segment_id;
+ int seg_ref_active;
+ unsigned char ref_pred_flag;
+ int n;
+ TOKENEXTRA *tp[4];
+ int skip[4];
+ MODE_INFO *mi = x->e_mbd.mode_info_context;
+ ENTROPY_CONTEXT_PLANES ta[4], tl[4];
+
+ x->skip = 0;
+
+ if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+ // Adjust the zbin based on this MB rate.
+ adjust_act_zbin(cpi, x);
+ }
+
+ {
+ // Experimental code. Special case for gf and arf zeromv modes.
+ // Increase zbin size to suppress noise
+ cpi->zbin_mode_boost = 0;
+ if (cpi->zbin_mode_boost_enabled) {
+ if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
+ if (xd->mode_info_context->mbmi.mode == ZEROMV) {
+ if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
+ cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+ else
+ cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+ } else if (xd->mode_info_context->mbmi.mode == SPLITMV)
+ cpi->zbin_mode_boost = 0;
+ else
+ cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+ }
+ }
+
+ vp8_update_zbin_extra(cpi, x);
+ }
+
+ seg_ref_active = segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
+
+ // SET VARIOUS PREDICTION FLAGS
+
+ // Did the chosen reference frame match its predicted value.
+ ref_pred_flag = ((xd->mode_info_context->mbmi.ref_frame ==
+ get_pred_ref(cm, xd)));
+ set_pred_flag(xd, PRED_REF, ref_pred_flag);
+
+ /* test code: set transform size based on mode selection */
+ if (cpi->common.txfm_mode == ALLOW_8X8
+ && x->e_mbd.mode_info_context->mbmi.mode != I8X8_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
+ x->e_mbd.mode_info_context->mbmi.txfm_size = TX_8X8;
+ cpi->t8x8_count++;
+ } else {
+ x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4;
+ cpi->t4x4_count++;
+ }
+
+ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+ RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sby_s)(&x->e_mbd);
+ RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sbuv_s)(&x->e_mbd);
+ } else {
+ int ref_fb_idx;
+
+ if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
+ ref_fb_idx = cpi->common.lst_fb_idx;
+ else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
+ ref_fb_idx = cpi->common.gld_fb_idx;
+ else
+ ref_fb_idx = cpi->common.alt_fb_idx;
+
+ xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+ xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+ xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+ if (xd->mode_info_context->mbmi.second_ref_frame) {
+ int second_ref_fb_idx;
+
+ if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
+ second_ref_fb_idx = cpi->common.lst_fb_idx;
+ else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
+ second_ref_fb_idx = cpi->common.gld_fb_idx;
+ else
+ second_ref_fb_idx = cpi->common.alt_fb_idx;
+
+ xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +
+ recon_yoffset;
+ xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +
+ recon_uvoffset;
+ xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +
+ recon_uvoffset;
+ }
+
+ vp8_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.y_stride, xd->dst.uv_stride);
+ }
+
+ assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
+ for (n = 0; n < 4; n++)
+ {
+ int x_idx = n & 1, y_idx = n >> 1;
+
+ vp8_subtract_mby_s_c(x->src_diff,
+ src + x_idx * 16 + y_idx * 16 * src_y_stride,
+ src_y_stride,
+ dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+ dst_y_stride);
+ vp8_subtract_mbuv_s_c(x->src_diff,
+ usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+ vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+ src_uv_stride,
+ udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ dst_uv_stride);
+ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+ vp8_transform_intra_mby_8x8(x);
+ } else {
+ vp8_transform_mby_8x8(x);
+ }
+ vp8_transform_mbuv_8x8(x);
+ vp8_quantize_mby_8x8(x);
+ vp8_quantize_mbuv_8x8(x);
+ if (x->optimize) {
+ vp8_optimize_mby_8x8(x, rtcd);
+ vp8_optimize_mbuv_8x8(x, rtcd);
+ }
+ vp8_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+ vp8_inverse_transform_mbuv_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+ vp8_recon_mby_s_c(IF_RTCD(&rtcd->common->recon), &x->e_mbd,
+ dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
+ vp8_recon_mbuv_s_c(IF_RTCD(&rtcd->common->recon), &x->e_mbd,
+ udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
+
+ if (!x->skip) {
+ if (output_enabled) {
+ xd->left_context = cm->left_context + (n >> 1);
+ xd->above_context = cm->above_context + mb_col + (n >> 1);
+ memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+ memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+ tp[n] = *t;
+ xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
+ vp8_tokenize_mb(cpi, &x->e_mbd, t, 0);
+ skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+ }
+ } else {
+ int mb_skip_context =
+ cpi->common.mb_no_coeff_skip ?
+ (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
+ (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
+ 0;
+ if (cpi->common.mb_no_coeff_skip) {
+ skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+ xd->left_context = cm->left_context + (n >> 1);
+ xd->above_context = cm->above_context + mb_col + (n >> 1);
+ memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+ memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+ tp[n] = *t;
+ cpi->skip_true_count[mb_skip_context]++;
+ vp8_fix_contexts(xd);
+ } else {
+ vp8_stuff_mb(cpi, xd, t, 0);
+ xd->mode_info_context->mbmi.mb_skip_coeff = 0;
+ cpi->skip_false_count[mb_skip_context]++;
+ }
+ }
+ }
+
+ xd->mode_info_context = mi;
+ update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+}
+#endif
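
Conceptually, the skip handling added above works as follows: a superblock signals "skip" only when all four of its MBs skipped coefficients; when only some of them did, their (empty) token runs must be replaced by explicit end-of-block tokens so the decoder still sees per-MB coefficient data. A sketch of that rule, with a hypothetical helper standing in for the vp8_stuff_mb_8x8() call made by update_sb_skip_coeff_state():

    int sb_skip = skip[0] && skip[1] && skip[2] && skip[3];
    if (cpi->common.mb_no_coeff_skip && !sb_skip) {
      int n;
      for (n = 0; n < 4; n++) {
        if (skip[n]) {
          /* This MB produced no tokens, but the SB-level skip flag will be 0,
           * so stuff an explicit EOB run for it in the token stream. */
          stuff_eob_tokens_for_mb(cpi, x, n);  /* hypothetical helper */
        }
      }
    }
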
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index 454244457..8c48b0d83 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -67,11 +67,10 @@ void vp8_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) {
}
}
-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) {
+void vp8_subtract_mbuv_s_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride,
+ unsigned char *upred, unsigned char *vpred, int dst_stride) {
short *udiff = diff + 256;
short *vdiff = diff + 320;
- unsigned char *upred = pred + 256;
- unsigned char *vpred = pred + 320;
int r, c;
@@ -81,8 +80,8 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
}
udiff += 8;
- upred += 8;
- usrc += stride;
+ upred += dst_stride;
+ usrc += src_stride;
}
for (r = 0; r < 8; r++) {
@@ -91,12 +90,19 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
}
vdiff += 8;
- vpred += 8;
- vsrc += stride;
+ vpred += dst_stride;
+ vsrc += src_stride;
}
}
-void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride) {
+void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) {
+ unsigned char *upred = pred + 256;
+ unsigned char *vpred = pred + 320;
+
+ vp8_subtract_mbuv_s_c(diff, usrc, vsrc, stride, upred, vpred, 8);
+}
+
+void vp8_subtract_mby_s_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int dst_stride) {
int r, c;
for (r = 0; r < 16; r++) {
@@ -105,11 +111,16 @@ void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, in
}
diff += 16;
- pred += 16;
- src += stride;
+ pred += dst_stride;
+ src += src_stride;
}
}
+void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride)
+{
+ vp8_subtract_mby_s_c(diff, src, stride, pred, 16);
+}
+
static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
BLOCK *b = &x->block[0];
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index 6390f3fe4..6a5bf59d5 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -23,24 +23,36 @@ extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER
void vp8_cmachine_specific_config(VP8_COMP *cpi) {
#if CONFIG_RUNTIME_CPU_DETECT
cpi->rtcd.common = &cpi->common.rtcd;
+#if CONFIG_SUPERBLOCKS
+ cpi->rtcd.variance.sad32x32 = vp8_sad32x32_c;
+#endif
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_c;
cpi->rtcd.variance.sad16x8 = vp8_sad16x8_c;
cpi->rtcd.variance.sad8x16 = vp8_sad8x16_c;
cpi->rtcd.variance.sad8x8 = vp8_sad8x8_c;
cpi->rtcd.variance.sad4x4 = vp8_sad4x4_c;
+#if CONFIG_SUPERBLOCKS
+ cpi->rtcd.variance.sad32x32x3 = vp8_sad32x32x3_c;
+#endif
cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_c;
cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_c;
cpi->rtcd.variance.sad8x16x3 = vp8_sad8x16x3_c;
cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_c;
cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_c;
+#if CONFIG_SUPERBLOCKS
+ cpi->rtcd.variance.sad32x32x8 = vp8_sad32x32x8_c;
+#endif
cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_c;
cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_c;
cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_c;
cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_c;
cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_c;
+#if CONFIG_SUPERBLOCKS
+ cpi->rtcd.variance.sad32x32x4d = vp8_sad32x32x4d_c;
+#endif
cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_c;
cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_c;
cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_c;
@@ -54,16 +66,34 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi) {
cpi->rtcd.variance.var8x16 = vp8_variance8x16_c;
cpi->rtcd.variance.var16x8 = vp8_variance16x8_c;
cpi->rtcd.variance.var16x16 = vp8_variance16x16_c;
+#if CONFIG_SUPERBLOCKS
+ cpi->rtcd.variance.var32x32 = vp8_variance32x32_c;
+#endif
cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c;
cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c;
cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c;
cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;
cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_c;
+#if CONFIG_SUPERBLOCKS
+ cpi->rtcd.variance.subpixvar32x32 = vp8_sub_pixel_variance32x32_c;
+#endif
cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_c;
+#if CONFIG_SUPERBLOCKS
+ cpi->rtcd.variance.halfpixvar32x32_h = vp8_variance_halfpixvar32x32_h_c;
+#endif
cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_c;
+#if CONFIG_SUPERBLOCKS
+ cpi->rtcd.variance.halfpixvar32x32_v = vp8_variance_halfpixvar32x32_v_c;
+#endif
cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_c;
+#if CONFIG_SUPERBLOCKS
+ cpi->rtcd.variance.halfpixvar32x32_hv = vp8_variance_halfpixvar32x32_hv_c;
+#endif
cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_c;
+#if CONFIG_SUPERBLOCKS
+ cpi->rtcd.variance.subpixmse32x32 = vp8_sub_pixel_mse32x32_c;
+#endif
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c;
cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index ba4cd897d..a0621b649 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -243,7 +243,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
int y_stride;
int offset;
-#if ARCH_X86 || ARCH_X86_64
+#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
unsigned char *y;
int buf_r1, buf_r2, buf_c1, buf_c2;
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index d7a9456d1..256c70386 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -620,6 +620,42 @@ static void print_seg_map(VP8_COMP *cpi) {
fclose(statsfile);
}
+static void update_reference_segmentation_map(VP8_COMP *cpi) {
+ VP8_COMMON *cm = &cpi->common;
+ int row, col, sb_rows = (cm->mb_rows + 1) >> 1, sb_cols = (cm->mb_cols + 1) >> 1;
+ MODE_INFO *mi = cm->mi;
+ uint8_t *segmap = cpi->segmentation_map;
+ uint8_t *segcache = cm->last_frame_seg_map;
+
+ for (row = 0; row < sb_rows; row++) {
+ for (col = 0; col < sb_cols; col++) {
+ MODE_INFO *miptr = mi + col * 2;
+ uint8_t *seg = segmap + col * 2;
+ uint8_t *cache = segcache + col * 2;
+#if CONFIG_SUPERBLOCKS
+ if (miptr->mbmi.encoded_as_sb) {
+ cache[0] = cache[1] = cache[cm->mb_cols] = cache[cm->mb_cols + 1] =
+ miptr->mbmi.segment_id;
+ } else
+#endif
+ {
+ cache[0] = miptr[0].mbmi.segment_id;
+ if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+ cache[1] = miptr[1].mbmi.segment_id;
+ if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
+ cache[cm->mb_cols] = miptr[cm->mode_info_stride].mbmi.segment_id;
+ if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+ cache[1] = miptr[1].mbmi.segment_id;
+ cache[cm->mb_cols + 1] = miptr[cm->mode_info_stride + 1].mbmi.segment_id;
+ }
+ }
+ }
+ segmap += 2 * cm->mb_cols;
+ segcache += 2 * cm->mb_cols;
+ mi += 2 * cm->mode_info_stride;
+ }
+}
+
static void set_default_lf_deltas(VP8_COMP *cpi) {
cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
@@ -1734,6 +1770,9 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) {
cm->prob_last_coded = 128;
cm->prob_gf_coded = 128;
cm->prob_intra_coded = 63;
+#if CONFIG_SUPERBLOCKS
+ cm->sb_coded = 200;
+#endif
for (i = 0; i < COMP_PRED_CONTEXTS; i++)
cm->prob_comppred[i] = 128;
@@ -1917,6 +1956,18 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) {
init_mv_ref_counts();
#endif
+#if CONFIG_SUPERBLOCKS
+ cpi->fn_ptr[BLOCK_32X32].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32);
+ cpi->fn_ptr[BLOCK_32X32].vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var32x32);
+ cpi->fn_ptr[BLOCK_32X32].svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar32x32);
+ cpi->fn_ptr[BLOCK_32X32].svf_halfpix_h = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar32x32_h);
+ cpi->fn_ptr[BLOCK_32X32].svf_halfpix_v = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar32x32_v);
+ cpi->fn_ptr[BLOCK_32X32].svf_halfpix_hv = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar32x32_hv);
+ cpi->fn_ptr[BLOCK_32X32].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32x3);
+ cpi->fn_ptr[BLOCK_32X32].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32x8);
+ cpi->fn_ptr[BLOCK_32X32].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32x4d);
+#endif
+
cpi->fn_ptr[BLOCK_16X16].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16);
cpi->fn_ptr[BLOCK_16X16].vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16);
cpi->fn_ptr[BLOCK_16X16].svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x16);
@@ -3616,6 +3667,10 @@ static void encode_frame_to_data_rate
cpi->dummy_packing = 0;
vp8_pack_bitstream(cpi, dest, size);
+ if (cpi->mb.e_mbd.update_mb_segmentation_map) {
+ update_reference_segmentation_map(cpi);
+ }
+
#if CONFIG_PRED_FILTER
// Select the prediction filtering mode to use for the
// next frame based on the current frame selections
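
The guards in update_reference_segmentation_map() above account for frames whose MB dimensions are odd: the last superblock row/column then only partially covers the frame, so the right and bottom MB positions exist only when the loop is not in that last SB. A sketch of the two conditions, using the variable names from the function above:

    /* Inside the double loop over superblock rows/columns: */
    int have_right  = !(cm->mb_cols & 1) || col < sb_cols - 1;  /* right MB exists  */
    int have_bottom = !(cm->mb_rows & 1) || row < sb_rows - 1;  /* bottom MB exists */
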
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index ff3a21107..7fb7dd2ff 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -359,7 +359,9 @@ enum {
BLOCK_8X8,
BLOCK_4X4,
BLOCK_16X16,
- BLOCK_MAX_SEGMENTS
+ BLOCK_MAX_SEGMENTS,
+ BLOCK_32X32 = BLOCK_MAX_SEGMENTS,
+ BLOCK_MAX_SB_SEGMENTS,
};
typedef struct VP8_COMP {
@@ -528,6 +530,10 @@ typedef struct VP8_COMP {
int cq_target_quality;
+#if CONFIG_SUPERBLOCKS
+ int sb_count;
+ int sb_ymode_count [VP8_I32X32_MODES];
+#endif
int ymode_count [VP8_YMODES]; /* intra MB type cts this frame */
int bmode_count [VP8_BINTRAMODES];
int i8x8_mode_count [VP8_I8X8_MODES];
@@ -628,7 +634,7 @@ typedef struct VP8_COMP {
vp8_full_search_fn_t full_search_sad;
vp8_refining_search_fn_t refining_search_sad;
vp8_diamond_search_fn_t diamond_search_sad;
- vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS];
+ vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SB_SEGMENTS];
uint64_t time_receive_data;
uint64_t time_compress_data;
uint64_t time_pick_lpf;
@@ -732,9 +738,6 @@ typedef struct VP8_COMP {
int droppable;
- // Global store for SB left contexts, one for each MB row in the SB
- ENTROPY_CONTEXT_PLANES left_context[2];
-
// TODO Do we still need this??
int update_context;
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index ed5b5c96d..97b02f033 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -718,7 +718,7 @@ static void macro_block_yrd(MACROBLOCK *mb,
*Rate = vp8_rdcost_mby(mb);
}
-static int vp8_rdcost_mby_8x8(MACROBLOCK *mb) {
+static int vp8_rdcost_mby_8x8(MACROBLOCK *mb, int backup) {
int cost = 0;
int b;
MACROBLOCKD *xd = &mb->e_mbd;
@@ -726,11 +726,16 @@ static int vp8_rdcost_mby_8x8(MACROBLOCK *mb) {
ENTROPY_CONTEXT *ta;
ENTROPY_CONTEXT *tl;
- vpx_memcpy(&t_above,xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ if (backup) {
+ vpx_memcpy(&t_above,xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
- ta = (ENTROPY_CONTEXT *)&t_above;
- tl = (ENTROPY_CONTEXT *)&t_left;
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+ } else {
+ ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
+ tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
+ }
for (b = 0; b < 16; b += 4)
cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC,
@@ -775,7 +780,7 @@ static void macro_block_yrd_8x8(MACROBLOCK *mb,
*Distortion = (d >> 2);
// rate
- *Rate = vp8_rdcost_mby_8x8(mb);
+ *Rate = vp8_rdcost_mby_8x8(mb, 1);
}
#if CONFIG_TX16X16
@@ -823,6 +828,66 @@ static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
d[12] = p[12];
}
+#if CONFIG_SUPERBLOCKS
+static void super_block_yrd_8x8(MACROBLOCK *x,
+ int *rate,
+ int *distortion,
+ const VP8_ENCODER_RTCD *rtcd, int *skip)
+{
+ MACROBLOCKD *const xd = &x->e_mbd;
+ BLOCK *const by2 = x->block + 24;
+ BLOCKD *const bdy2 = xd->block + 24;
+ int d = 0, r = 0, n;
+ const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
+ int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+ ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+ ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+ ENTROPY_CONTEXT_PLANES t_above[2];
+ ENTROPY_CONTEXT_PLANES t_left[2];
+ int skippable = 1;
+
+ vpx_memcpy(t_above, xd->above_context, sizeof(t_above));
+ vpx_memcpy(t_left, xd->left_context, sizeof(t_left));
+
+ for (n = 0; n < 4; n++) {
+ int x_idx = n & 1, y_idx = n >> 1;
+
+ vp8_subtract_mby_s_c(x->src_diff,
+ src + x_idx * 16 + y_idx * 16 * src_y_stride,
+ src_y_stride,
+ dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+ dst_y_stride);
+ vp8_transform_mby_8x8(x);
+ vp8_quantize_mby_8x8(x);
+
+ /* remove 1st order dc to properly combine 1st/2nd order distortion */
+ x->coeff[ 0] = 0;
+ x->coeff[ 64] = 0;
+ x->coeff[128] = 0;
+ x->coeff[192] = 0;
+ xd->dqcoeff[ 0] = 0;
+ xd->dqcoeff[ 64] = 0;
+ xd->dqcoeff[128] = 0;
+ xd->dqcoeff[192] = 0;
+
+ d += ENCODEMB_INVOKE(&rtcd->encodemb, mberr)(x, 0);
+ d += ENCODEMB_INVOKE(&rtcd->encodemb, berr)(by2->coeff, bdy2->dqcoeff, 16);
+ xd->above_context = ta + x_idx;
+ xd->left_context = tl + y_idx;
+ r += vp8_rdcost_mby_8x8(x, 0);
+ skippable = skippable && mby_is_skippable_8x8(xd);
+ }
+
+ *distortion = (d >> 2);
+ *rate = r;
+ if (skip) *skip = skippable;
+ xd->above_context = ta;
+ xd->left_context = tl;
+ vpx_memcpy(xd->above_context, &t_above, sizeof(t_above));
+ vpx_memcpy(xd->left_context, &t_left, sizeof(t_left));
+}
+#endif
+
static void copy_predictor_8x8(unsigned char *dst, const unsigned char *predictor) {
const unsigned int *p = (const unsigned int *)predictor;
unsigned int *d = (unsigned int *)dst;
@@ -1062,6 +1127,45 @@ static int64_t rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rat
return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
}
+#if CONFIG_SUPERBLOCKS
+static int64_t rd_pick_intra_sby_mode(VP8_COMP *cpi,
+ MACROBLOCK *x,
+ int *rate,
+ int *rate_tokenonly,
+ int *distortion) {
+ MB_PREDICTION_MODE mode;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+ int this_rate, this_rate_tokenonly;
+ int this_distortion;
+ int64_t best_rd = INT64_MAX, this_rd;
+
+ /* Y Search for 32x32 intra prediction mode */
+ for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+ x->e_mbd.mode_info_context->mbmi.mode = mode;
+ RECON_INVOKE(&cpi->common.rtcd.recon,
+ build_intra_predictors_sby_s)(&x->e_mbd);
+
+ super_block_yrd_8x8(x, &this_rate_tokenonly,
+ &this_distortion, IF_RTCD(&cpi->rtcd), NULL);
+ this_rate = this_rate_tokenonly +
+ x->mbmode_cost[x->e_mbd.frame_type]
+ [x->e_mbd.mode_info_context->mbmi.mode];
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+ if (this_rd < best_rd) {
+ mode_selected = mode;
+ best_rd = this_rd;
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = this_distortion;
+ }
+ }
+
+ x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
+
+ return best_rd;
+}
+#endif
static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
MACROBLOCK *x,
@@ -1372,18 +1476,23 @@ static int64_t rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
}
-static int rd_cost_mbuv_8x8(MACROBLOCK *mb) {
+static int rd_cost_mbuv_8x8(MACROBLOCK *mb, int backup) {
int b;
int cost = 0;
MACROBLOCKD *xd = &mb->e_mbd;
ENTROPY_CONTEXT_PLANES t_above, t_left;
ENTROPY_CONTEXT *ta, *tl;
- vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ if (backup) {
+ vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
- ta = (ENTROPY_CONTEXT *)&t_above;
- tl = (ENTROPY_CONTEXT *)&t_left;
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+ } else {
+ ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
+ tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
+ }
for (b = 16; b < 24; b += 4)
cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
@@ -1393,6 +1502,54 @@ static int rd_cost_mbuv_8x8(MACROBLOCK *mb) {
return cost;
}
+#if CONFIG_SUPERBLOCKS
+static int64_t rd_inter32x32_uv_8x8(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
+ int *distortion, int fullpixel, int *skip) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ int n, r = 0, d = 0;
+ const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+ const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+ int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+ int skippable = 1;
+ ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+ ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+ ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+
+ memcpy(t_above, xd->above_context, sizeof(t_above));
+ memcpy(t_left, xd->left_context, sizeof(t_left));
+
+ for (n = 0; n < 4; n++) {
+ int x_idx = n & 1, y_idx = n >> 1;
+
+ vp8_subtract_mbuv_s_c(x->src_diff,
+ usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+ vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+ src_uv_stride,
+ udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ dst_uv_stride);
+
+ vp8_transform_mbuv_8x8(x);
+ vp8_quantize_mbuv_8x8(x);
+
+ xd->above_context = ta + x_idx;
+ xd->left_context = tl + y_idx;
+ r += rd_cost_mbuv_8x8(x, 0);
+ d += ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
+ skippable = skippable && mbuv_is_skippable_8x8(xd);
+ }
+
+ *rate = r;
+ *distortion = d;
+ if (skip) *skip = skippable;
+ xd->left_context = tl;
+ xd->above_context = ta;
+ memcpy(xd->above_context, t_above, sizeof(t_above));
+ memcpy(xd->left_context, t_left, sizeof(t_left));
+
+ return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+#endif
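
rd_inter32x32_uv_8x8() visits the four 8x8 sub-blocks of the superblock's 16x16 chroma plane; n & 1 and n >> 1 decode the loop counter into the sub-block column and row, which also pick the matching above/left entropy-context slots. A self-contained sketch of that indexing follows (the stride value is illustrative):

#include <stdio.h>

/* Illustrative only: decoding a 0..3 counter into the 2x2 sub-block grid
 * walked by rd_inter32x32_uv_8x8() and super_block_uvrd_8x8(). */
int main(void) {
  const int stride = 16;  /* chroma plane stride (illustrative) */
  for (int n = 0; n < 4; n++) {
    int x_idx = n & 1;   /* 0,1,0,1: sub-block column */
    int y_idx = n >> 1;  /* 0,0,1,1: sub-block row    */
    int offset = x_idx * 8 + y_idx * 8 * stride;  /* top-left pixel of this 8x8 sub-block */
    printf("n=%d -> col %d, row %d, pixel offset %d\n", n, x_idx, y_idx, offset);
  }
  return 0;
}
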
static int64_t rd_inter16x16_uv_8x8(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
int *distortion, int fullpixel) {
@@ -1403,7 +1560,7 @@ static int64_t rd_inter16x16_uv_8x8(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
vp8_quantize_mbuv_8x8(x);
- *rate = rd_cost_mbuv_8x8(x);
+ *rate = rd_cost_mbuv_8x8(x, 1);
*distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
@@ -1527,7 +1684,7 @@ static void rd_pick_intra_mbuv_mode_8x8(VP8_COMP *cpi,
vp8_quantize_mbuv_8x8(x);
- rate_to = rd_cost_mbuv_8x8(x);
+ rate_to = rd_cost_mbuv_8x8(x, 1);
rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
@@ -1546,6 +1703,91 @@ static void rd_pick_intra_mbuv_mode_8x8(VP8_COMP *cpi,
mbmi->uv_mode = mode_selected;
}
+#if CONFIG_SUPERBLOCKS
+static void super_block_uvrd_8x8(MACROBLOCK *x,
+ int *rate,
+ int *distortion,
+ const VP8_ENCODER_RTCD *rtcd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int d = 0, r = 0, n;
+ const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+ const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+ int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+ ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+ ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+ ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+
+ memcpy(t_above, xd->above_context, sizeof(t_above));
+ memcpy(t_left, xd->left_context, sizeof(t_left));
+
+ for (n = 0; n < 4; n++) {
+ int x_idx = n & 1, y_idx = n >> 1;
+
+ vp8_subtract_mbuv_s_c(x->src_diff,
+ usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+ vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+ src_uv_stride,
+ udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ dst_uv_stride);
+ vp8_transform_mbuv_8x8(x);
+ vp8_quantize_mbuv_8x8(x);
+
+ d += ENCODEMB_INVOKE(&rtcd->encodemb, mbuverr)(x) >> 2;
+ xd->above_context = ta + x_idx;
+ xd->left_context = tl + y_idx;
+ r += rd_cost_mbuv_8x8(x, 0);
+ }
+
+  xd->above_context = ta;
+  xd->left_context = tl;
+  *distortion = (d >> 2);
+  *rate = r;
+
+ memcpy(xd->above_context, t_above, sizeof(t_above));
+ memcpy(xd->left_context, t_left, sizeof(t_left));
+}
+
+static int64_t rd_pick_intra_sbuv_mode(VP8_COMP *cpi,
+ MACROBLOCK *x,
+ int *rate,
+ int *rate_tokenonly,
+ int *distortion) {
+ MB_PREDICTION_MODE mode;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+ int64_t best_rd = INT64_MAX, this_rd;
+ int this_rate_tokenonly, this_rate;
+ int this_distortion;
+
+ for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+ x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
+ RECON_INVOKE(&cpi->rtcd.common->recon,
+ build_intra_predictors_sbuv_s)(&x->e_mbd);
+
+ super_block_uvrd_8x8(x, &this_rate_tokenonly,
+ &this_distortion, IF_RTCD(&cpi->rtcd));
+ this_rate = this_rate_tokenonly +
+                x->intra_uv_mode_cost[x->e_mbd.frame_type]
+                                     [x->e_mbd.mode_info_context->mbmi.uv_mode];
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+ if (this_rd < best_rd) {
+ mode_selected = mode;
+ best_rd = this_rd;
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = this_distortion;
+ }
+ }
+
+ x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
+
+ return best_rd;
+}
+#endif
+
int vp8_cost_mv_ref(VP8_COMP *cpi,
MB_PREDICTION_MODE m,
const int near_mv_ref_ct[4]) {
@@ -2568,25 +2810,33 @@ static void vp8_estimate_ref_frame_costs(VP8_COMP *cpi, int segment_id, unsigned
}
}
-static void store_coding_context(MACROBLOCK *x, int mb_index,
+static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int mode_index,
PARTITION_INFO *partition,
int_mv *ref_mv,
- int_mv *second_ref_mv) {
+ int_mv *second_ref_mv,
+ int single_pred_diff,
+ int comp_pred_diff,
+ int hybrid_pred_diff) {
MACROBLOCKD *xd = &x->e_mbd;
// Take a snapshot of the coding context so it can be
// restored if we decide to encode this way
- x->mb_context[mb_index].best_mode_index = mode_index;
- vpx_memcpy(&x->mb_context[mb_index].mic, xd->mode_info_context,
+ ctx->best_mode_index = mode_index;
+ vpx_memcpy(&ctx->mic, xd->mode_info_context,
sizeof(MODE_INFO));
- vpx_memcpy(&x->mb_context[mb_index].partition_info, partition,
- sizeof(PARTITION_INFO));
- x->mb_context[mb_index].best_ref_mv.as_int = ref_mv->as_int;
- x->mb_context[mb_index].second_best_ref_mv.as_int = second_ref_mv->as_int;
-
- // x->mb_context[mb_index].rddiv = x->rddiv;
- // x->mb_context[mb_index].rdmult = x->rdmult;
+ if (partition)
+ vpx_memcpy(&ctx->partition_info, partition,
+ sizeof(PARTITION_INFO));
+ ctx->best_ref_mv.as_int = ref_mv->as_int;
+ ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
+
+ // ctx[mb_index].rddiv = x->rddiv;
+ // ctx[mb_index].rdmult = x->rdmult;
+
+ ctx->single_pred_diff = single_pred_diff;
+ ctx->comp_pred_diff = comp_pred_diff;
+ ctx->hybrid_pred_diff = hybrid_pred_diff;
}
static void inter_mode_cost(VP8_COMP *cpi, MACROBLOCK *x, int this_mode,
@@ -3464,7 +3714,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
}
#endif
- if (x->skip)
+ if (x->skip && !mode_excluded)
break;
}
@@ -3557,16 +3807,36 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
}
end:
- // TODO Save these to add in only if MB coding mode is selected?
- for (i = 0; i < NB_PREDICTION_TYPES; ++i)
- cpi->rd_comp_pred_diff[i] += best_pred_diff[i];
+ store_coding_context(x, &x->mb_context[xd->mb_index], best_mode_index, &best_partition,
+ &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
+ &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
+ best_pred_diff[0], best_pred_diff[1], best_pred_diff[2]);
+}
- store_coding_context(x, xd->mb_index, best_mode_index, &best_partition,
- &frame_best_ref_mv[mbmi->ref_frame],
- &frame_best_ref_mv[mbmi->second_ref_frame]);
+#if CONFIG_SUPERBLOCKS
+void vp8_rd_pick_intra_mode_sb(VP8_COMP *cpi, MACROBLOCK *x,
+ int *returnrate,
+ int *returndist) {
+ int rate_y, rate_uv;
+ int rate_y_tokenonly, rate_uv_tokenonly;
+ int error_y, error_uv;
+ int dist_y, dist_uv;
+
+ x->e_mbd.mode_info_context->mbmi.txfm_size = TX_8X8;
+
+ error_uv = rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+ &dist_uv);
+ error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+ &dist_y);
+
+ // TODO(rbultje): add rate_uv
+ *returnrate = rate_y;
+ *returndist = dist_y + (dist_uv >> 2);
}
+#endif
-int vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x) {
+void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x,
+ int *returnrate, int *returndist) {
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
int64_t error4x4, error16x16;
@@ -3585,6 +3855,8 @@ int vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x) {
int rate8x8, dist8x8;
int mode16x16;
int mode8x8[2][4];
+ int dist;
+ int rateuv8, rateuv_tokenonly8, distuv8;
mbmi->ref_frame = INTRA_FRAME;
rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
@@ -3646,9 +3918,11 @@ int vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x) {
rate += rate4x4;
#endif
mbmi->mode = B_PRED;
+ dist = dist4x4;
} else {
mbmi->mode = mode16x16;
rate += rate16x16;
+ dist = dist16x16;
}
} else {
if (error4x4 < error8x8) {
@@ -3663,17 +3937,727 @@ int vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x) {
rate += rate4x4;
#endif
mbmi->mode = B_PRED;
+ dist = dist4x4;
} else {
mbmi->mode = I8X8_PRED;
set_i8x8_block_modes(x, mode8x8);
rate += rate8x8;
+ dist = dist8x8;
}
}
- return rate;
+
+ // TODO(rbultje): should add rateuv here also
+ *returnrate = rate - rateuv;
+ *returndist = dist + (distuv >> 2);
}
-int vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
- int recon_yoffset, int recon_uvoffset) {
+#if CONFIG_SUPERBLOCKS
+int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x,
+ int recon_yoffset, int recon_uvoffset,
+ int *returnrate, int *returndistortion) {
+ VP8_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &xd->block[0];
+ MB_PREDICTION_MODE this_mode;
+ MV_REFERENCE_FRAME ref_frame;
+ int mis = xd->mode_info_stride;
+ unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
+ int comp_pred;
+ int_mv best_ref_mv, second_best_ref_mv;
+ int_mv mode_mv[MB_MODE_COUNT];
+ int_mv frame_nearest_mv[4];
+ int_mv frame_near_mv[4];
+ int_mv frame_best_ref_mv[4];
+ int_mv mc_search_result[4];
+ int frame_mdcounts[4][4];
+ unsigned char *y_buffer[4];
+ unsigned char *u_buffer[4];
+ unsigned char *v_buffer[4];
+ static const int flag_list[4] = { 0, VP8_LAST_FLAG, VP8_GOLD_FLAG, VP8_ALT_FLAG };
+ int idx_list[4] = { 0, cpi->common.lst_fb_idx, cpi->common.gld_fb_idx, cpi->common.alt_fb_idx };
+ int mdcounts[4];
+ int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+ int saddone = 0;
+  int sr = 0;  // search range returned by mv_pred(); expressed in step_param levels (0-7)
+ int64_t best_rd = INT64_MAX;
+ int64_t best_comp_rd = INT64_MAX;
+ int64_t best_single_rd = INT64_MAX;
+ int64_t best_hybrid_rd = INT64_MAX;
+ int64_t best_yrd = INT64_MAX;
+ MB_MODE_INFO best_mbmode;
+ int mode_index = 0;
+#if 0
+ PARTITION_INFO best_partition;
+ union b_mode_info best_bmodes[16];
+#endif
+ unsigned int ref_costs[MAX_REF_FRAMES];
+
+ xd->mode_info_context->mbmi.segment_id = segment_id;
+ vp8_estimate_ref_frame_costs(cpi, segment_id, ref_costs);
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+ YV12_BUFFER_CONFIG *ref_buf = &cpi->common.yv12_fb[idx_list[ref_frame]];
+
+ vp8_find_near_mvs(xd, xd->mode_info_context,
+ xd->prev_mode_info_context,
+ &frame_nearest_mv[ref_frame], &frame_near_mv[ref_frame],
+ &frame_best_ref_mv[ref_frame], frame_mdcounts[ref_frame],
+ ref_frame, cpi->common.ref_frame_sign_bias);
+
+ y_buffer[ref_frame] = ref_buf->y_buffer + recon_yoffset;
+ u_buffer[ref_frame] = ref_buf->u_buffer + recon_uvoffset;
+ v_buffer[ref_frame] = ref_buf->v_buffer + recon_uvoffset;
+ }
+ mc_search_result[ref_frame].as_int = INVALID_MV;
+ }
+
+ for (mode_index = 0; mode_index < MAX_MODES; mode_index++) {
+ int_mv mvp;
+ int mode_excluded;
+ int64_t this_rd = INT64_MAX;
+ int disable_skip = 0;
+ int other_cost = 0;
+ int compmode_cost = 0;
+ int rate2 = 0;
+ int distortion2 = 0;
+ int rate_y = 0;
+ int rate_uv = 0;
+ int distortion_uv;
+ int distortion;
+ int skippable_y, skippable_uv;
+
+ // Test best rd so far against threshold for trying this mode.
+ if (best_rd <= cpi->rd_threshes[mode_index]) {
+ continue;
+ }
+
+ this_mode = vp8_mode_order[mode_index].mode;
+ ref_frame = vp8_mode_order[mode_index].ref_frame;
+ xd->mode_info_context->mbmi.ref_frame = ref_frame;
+ comp_pred = vp8_mode_order[mode_index].second_ref_frame != INTRA_FRAME;
+ xd->mode_info_context->mbmi.mode = this_mode;
+ xd->mode_info_context->mbmi.uv_mode = DC_PRED;
+#if 0 && CONFIG_PRED_FILTER
+ xd->mode_info_context->mbmi.pred_filter_enabled = 0;
+#endif
+
+#if 0 && CONFIG_COMP_INTRA_PRED
+ xd->mode_info_context->mbmi.second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+ xd->mode_info_context->mbmi.second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+ if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
+ continue;
+
+    // Intra coding and SPLITMV are not yet supported for superblocks
+ // TODO(rbultje): support intra coding
+ if (ref_frame == INTRA_FRAME || this_mode == SPLITMV)
+ continue;
+
+ if (comp_pred) {
+ int second_ref;
+
+ if (ref_frame == ALTREF_FRAME) {
+ second_ref = LAST_FRAME;
+ } else {
+ second_ref = ref_frame + 1;
+ }
+ if (!(cpi->ref_frame_flags & flag_list[second_ref]))
+ continue;
+ xd->mode_info_context->mbmi.second_ref_frame = second_ref;
+
+ xd->second_pre.y_buffer = y_buffer[second_ref];
+ xd->second_pre.u_buffer = u_buffer[second_ref];
+ xd->second_pre.v_buffer = v_buffer[second_ref];
+ second_best_ref_mv = frame_best_ref_mv[second_ref];
+ mode_excluded = cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
+ } else {
+ xd->mode_info_context->mbmi.second_ref_frame = INTRA_FRAME;
+ mode_excluded = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
+ }
+
+ xd->pre.y_buffer = y_buffer[ref_frame];
+ xd->pre.u_buffer = u_buffer[ref_frame];
+ xd->pre.v_buffer = v_buffer[ref_frame];
+ mode_mv[ZEROMV].as_int = 0;
+ mode_mv[NEARESTMV] = frame_nearest_mv[ref_frame];
+ mode_mv[NEARMV] = frame_near_mv[ref_frame];
+ best_ref_mv = frame_best_ref_mv[ref_frame];
+ vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
+
+ // If the segment reference frame feature is enabled....
+ // then do nothing if the current ref frame is not allowed..
+ if (segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+ !check_segref(xd, segment_id, ref_frame)) {
+ continue;
+ }
+ // If the segment mode feature is enabled....
+ // then do nothing if the current mode is not allowed..
+ else if (segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
+ (this_mode != get_segdata(xd, segment_id, SEG_LVL_MODE))) {
+ continue;
+ }
+ // Disable this drop out case if either the mode or ref frame
+ // segment level feature is enabled for this segment. This is to
+ // prevent the possibility that we end up unable to pick any mode.
+ else if (!segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+ !segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+ // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+ // unless ARNR filtering is enabled in which case we want
+ // an unfiltered alternative
+ if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+ if (this_mode != ZEROMV || ref_frame != ALTREF_FRAME) {
+ continue;
+ }
+ }
+ }
+
+ if (!comp_pred) {
+ switch (this_mode) {
+ case NEWMV: {
+ int thissme;
+ int bestsme = INT_MAX;
+ int step_param = cpi->sf.first_step;
+ int further_steps;
+ int n;
+ int do_refine = 1; /* If last step (1-away) of n-step search doesn't pick the center point as the best match,
+ we will do a final 1-away diamond refining search */
+ int num00;
+
+ int sadpb = x->sadperbit16;
+ int_mv mvp_full;
+
+ int col_min = (best_ref_mv.as_mv.col >> 3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.col & 7) ? 1 : 0);
+ int row_min = (best_ref_mv.as_mv.row >> 3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.row & 7) ? 1 : 0);
+ int col_max = (best_ref_mv.as_mv.col >> 3) + MAX_FULL_PEL_VAL;
+ int row_max = (best_ref_mv.as_mv.row >> 3) + MAX_FULL_PEL_VAL;
+
+ int tmp_col_min = x->mv_col_min;
+ int tmp_col_max = x->mv_col_max;
+ int tmp_row_min = x->mv_row_min;
+ int tmp_row_max = x->mv_row_max;
+
+ if (!saddone) {
+ vp8_cal_sad(cpi, xd, x, recon_yoffset, &near_sadidx[0]);
+ saddone = 1;
+ }
+
+          vp8_mv_pred(cpi, xd, xd->mode_info_context, &mvp,
+ xd->mode_info_context->mbmi.ref_frame,
+ cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
+
+ mvp_full.as_mv.col = mvp.as_mv.col >> 3;
+ mvp_full.as_mv.row = mvp.as_mv.row >> 3;
+
+ // Get intersection of UMV window and valid MV window to reduce # of checks in diamond search.
+ if (x->mv_col_min < col_min)
+ x->mv_col_min = col_min;
+ if (x->mv_col_max > col_max)
+ x->mv_col_max = col_max;
+ if (x->mv_row_min < row_min)
+ x->mv_row_min = row_min;
+ if (x->mv_row_max > row_max)
+ x->mv_row_max = row_max;
+
+ // adjust search range according to sr from mv prediction
+ if (sr > step_param)
+ step_param = sr;
+
+ // Initial step/diamond search
+ {
+ bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full, &d->bmi.as_mv.first,
+ step_param, sadpb, &num00,
+ &cpi->fn_ptr[BLOCK_32X32],
+ XMVCOST, &best_ref_mv);
+ mode_mv[NEWMV].as_int = d->bmi.as_mv.first.as_int;
+
+ // Further step/diamond searches as necessary
+ n = 0;
+ further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+ n = num00;
+ num00 = 0;
+
+          /* If there won't be any more n-step searches, check whether a refining search is needed. */
+ if (n > further_steps)
+ do_refine = 0;
+
+ while (n < further_steps) {
+ n++;
+
+ if (num00)
+ num00--;
+ else {
+ thissme = cpi->diamond_search_sad(x, b, d, &mvp_full,
+ &d->bmi.as_mv.first, step_param + n, sadpb, &num00,
+ &cpi->fn_ptr[BLOCK_32X32],
+ XMVCOST, &best_ref_mv);
+
+ /* check to see if refining search is needed. */
+ if (num00 > (further_steps - n))
+ do_refine = 0;
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ mode_mv[NEWMV].as_int = d->bmi.as_mv.first.as_int;
+ } else {
+ d->bmi.as_mv.first.as_int = mode_mv[NEWMV].as_int;
+ }
+ }
+ }
+ }
+
+ /* final 1-away diamond refining search */
+ if (do_refine == 1) {
+ int search_range;
+
+          // This may not be a good way to set search_range; it needs further investigation.
+ // search_range = MAXF(abs((mvp.row>>3) - d->bmi.mv.as_mv.row), abs((mvp.col>>3) - d->bmi.mv.as_mv.col));
+ search_range = 8;
+
+ thissme = cpi->refining_search_sad(x, b, d, &d->bmi.as_mv.first, sadpb,
+ search_range, &cpi->fn_ptr[BLOCK_32X32],
+ XMVCOST, &best_ref_mv);
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ mode_mv[NEWMV].as_int = d->bmi.as_mv.first.as_int;
+ } else {
+ d->bmi.as_mv.first.as_int = mode_mv[NEWMV].as_int;
+ }
+ }
+
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+
+ if (bestsme < INT_MAX) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv.first, &best_ref_mv,
+ x->errorperbit,
+ &cpi->fn_ptr[BLOCK_32X32],
+ XMVCOST, &dis, &sse);
+ }
+ mc_search_result[xd->mode_info_context->mbmi.ref_frame].as_int =
+ d->bmi.as_mv.first.as_int;
+
+ mode_mv[NEWMV].as_int = d->bmi.as_mv.first.as_int;
+
+ // Add the new motion vector cost to our rolling cost variable
+ rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv,
+ XMVCOST, 96,
+ xd->allow_high_precision_mv);
+ }
+
+ case NEARESTMV:
+ case NEARMV:
+        // Clip "next_nearest" so that it does not extend too far out of the image
+ vp8_clamp_mv2(&mode_mv[this_mode], xd);
+
+        // Do not bother proceeding if the vector (from newmv, nearest or near) is 0,0, as it should then be coded using the zeromv mode.
+ if (((this_mode == NEARMV) || (this_mode == NEARESTMV)) && (mode_mv[this_mode].as_int == 0)) {
+ continue;
+ }
+
+ case ZEROMV:
+ // Trap vectors that reach beyond the UMV borders
+        // Note that ALL New MV, Nearest MV, Near MV and Zero MV code drops through to this point
+ // because of the lack of break statements in the previous two cases.
+ if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+ ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
+ continue;
+ }
+
+ vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]);
+
+#if CONFIG_PRED_FILTER
+ // Filtered prediction:
+ xd->mode_info_context->mbmi.pred_filter_enabled =
+ vp8_mode_order[mode_index].pred_filter_flag;
+ rate2 += vp8_cost_bit(cpi->common.prob_pred_filter_off,
+ xd->mode_info_context->mbmi.pred_filter_enabled);
+#endif
+
+ vp8_build_inter32x32_predictors_sb(xd,
+ xd->dst.y_buffer,
+ xd->dst.u_buffer,
+ xd->dst.v_buffer,
+ xd->dst.y_stride,
+ xd->dst.uv_stride);
+
+ compmode_cost =
+ vp8_cost_bit(get_pred_prob(cm, xd, PRED_COMP), 0);
+
+ if (cpi->active_map_enabled && x->active_ptr[0] == 0) {
+ x->skip = 1;
+ } else if (x->encode_breakout) {
+ unsigned int sse;
+ unsigned int var;
+ int threshold = (xd->block[0].dequant[1] *
+ xd->block[0].dequant[1] >> 4);
+
+ if (threshold < x->encode_breakout)
+ threshold = x->encode_breakout;
+
+ var = VARIANCE_INVOKE(&cpi->rtcd.variance, var32x32)(*(b->base_src),
+ b->src_stride, xd->dst.y_buffer, xd->dst.y_stride, &sse);
+
+ if (sse < threshold) {
+ unsigned int q2dc = xd->block[24].dequant[0];
+ /* If there is no codeable 2nd order dc
+               or a very small uniform pixel change */
+            if ((sse - var < q2dc * q2dc >> 4) ||
+ (sse / 2 > var && sse - var < 64)) {
+ // Check u and v to make sure skip is ok
+ int sse2, sse3;
+ int var2 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
+ (x->src.u_buffer, x->src.uv_stride,
+ xd->dst.u_buffer, xd->dst.uv_stride, &sse2);
+ int var3 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
+ (x->src.v_buffer, x->src.uv_stride,
+ xd->dst.v_buffer, xd->dst.uv_stride, &sse3);
+ sse2 += sse3;
+ if (sse2 * 2 < threshold) {
+ x->skip = 1;
+ distortion2 = sse + sse2;
+ rate2 = 500;
+
+ /* for best_yrd calculation */
+ rate_uv = 0;
+ distortion_uv = sse2;
+
+ disable_skip = 1;
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ break;
+ }
+ }
+ }
+ }
+
+ // Add in the Mv/mode cost
+ rate2 += vp8_cost_mv_ref(cpi, this_mode, mdcounts);
+
+ // Y cost and distortion - FIXME support other transform sizes
+ super_block_yrd_8x8(x, &rate_y, &distortion,
+ IF_RTCD(&cpi->rtcd), &skippable_y);
+ rate2 += rate_y;
+ distortion2 += distortion;
+
+ rd_inter32x32_uv_8x8(cpi, x, &rate_uv, &distortion_uv,
+ cpi->common.full_pixel, &skippable_uv);
+
+ rate2 += rate_uv;
+ distortion2 += distortion_uv;
+ mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
+ break;
+
+ default:
+ break;
+ }
+ } else { /* xd->mode_info_context->mbmi.second_ref_frame != 0 */
+ int ref1 = xd->mode_info_context->mbmi.ref_frame;
+ int ref2 = xd->mode_info_context->mbmi.second_ref_frame;
+
+ mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
+ switch (this_mode) {
+ case NEWMV:
+ if (mc_search_result[ref1].as_int == INVALID_MV ||
+ mc_search_result[ref2].as_int == INVALID_MV)
+ continue;
+ xd->mode_info_context->mbmi.mv[0].as_int = mc_search_result[ref1].as_int;
+ xd->mode_info_context->mbmi.mv[1].as_int = mc_search_result[ref2].as_int;
+ rate2 += vp8_mv_bit_cost(&mc_search_result[ref1],
+ &frame_best_ref_mv[ref1],
+ XMVCOST, 96,
+ xd->allow_high_precision_mv);
+ rate2 += vp8_mv_bit_cost(&mc_search_result[ref2],
+ &frame_best_ref_mv[ref2],
+ XMVCOST, 96,
+ xd->allow_high_precision_mv);
+ break;
+ case ZEROMV:
+ xd->mode_info_context->mbmi.mv[0].as_int = 0;
+ xd->mode_info_context->mbmi.mv[1].as_int = 0;
+ break;
+ case NEARMV:
+ if (frame_near_mv[ref1].as_int == 0 || frame_near_mv[ref2].as_int == 0) {
+ continue;
+ }
+ xd->mode_info_context->mbmi.mv[0].as_int = frame_near_mv[ref1].as_int;
+ xd->mode_info_context->mbmi.mv[1].as_int = frame_near_mv[ref2].as_int;
+ break;
+ case NEARESTMV:
+ if (frame_nearest_mv[ref1].as_int == 0 || frame_nearest_mv[ref2].as_int == 0) {
+ continue;
+ }
+ xd->mode_info_context->mbmi.mv[0].as_int = frame_nearest_mv[ref1].as_int;
+ xd->mode_info_context->mbmi.mv[1].as_int = frame_nearest_mv[ref2].as_int;
+ break;
+ default:
+ break;
+ }
+
+ /* Add in the Mv/mode cost */
+ rate2 += vp8_cost_mv_ref(cpi, this_mode, mdcounts);
+
+ vp8_clamp_mv2(&xd->mode_info_context->mbmi.mv[0], xd);
+ vp8_clamp_mv2(&xd->mode_info_context->mbmi.mv[1], xd);
+ if (((xd->mode_info_context->mbmi.mv[0].as_mv.row >> 3) < x->mv_row_min) ||
+ ((xd->mode_info_context->mbmi.mv[0].as_mv.row >> 3) > x->mv_row_max) ||
+ ((xd->mode_info_context->mbmi.mv[0].as_mv.col >> 3) < x->mv_col_min) ||
+ ((xd->mode_info_context->mbmi.mv[0].as_mv.col >> 3) > x->mv_col_max) ||
+ ((xd->mode_info_context->mbmi.mv[1].as_mv.row >> 3) < x->mv_row_min) ||
+ ((xd->mode_info_context->mbmi.mv[1].as_mv.row >> 3) > x->mv_row_max) ||
+ ((xd->mode_info_context->mbmi.mv[1].as_mv.col >> 3) < x->mv_col_min) ||
+ ((xd->mode_info_context->mbmi.mv[1].as_mv.col >> 3) > x->mv_col_max)) {
+ continue;
+ }
+
+ /* build first and second prediction */
+ vp8_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.y_stride, xd->dst.uv_stride);
+
+ /* Y cost and distortion - TODO(rbultje) support other transform sizes */
+ super_block_yrd_8x8(x, &rate_y, &distortion,
+ IF_RTCD(&cpi->rtcd), &skippable_y);
+
+ rate2 += rate_y;
+ distortion2 += distortion;
+
+ /* UV cost and distortion */
+ rd_inter32x32_uv_8x8(cpi, x, &rate_uv, &distortion_uv,
+ cpi->common.full_pixel, &skippable_uv);
+
+ rate2 += rate_uv;
+ distortion2 += distortion_uv;
+
+ /* don't bother w/ skip, we would never have come here if skip were
+ * enabled */
+ xd->mode_info_context->mbmi.mode = this_mode;
+
+ /* We don't include the cost of the second reference here, because there
+ * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in
+ * other words if you present them in that order, the second one is
+ * always known if the first is known */
+ compmode_cost = vp8_cost_bit(get_pred_prob(cm, xd, PRED_COMP), 1);
+ }
+
+ if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+ rate2 += compmode_cost;
+ }
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ rate2 += ref_costs[xd->mode_info_context->mbmi.ref_frame];
+
+ if (!disable_skip) {
+      // Test for the condition where the skip block will be activated
+      // because there are no non-zero coefficients, and make any
+      // necessary adjustment for rate. Ignore if skip is coded at
+      // segment level, as the cost won't have been added in.
+ if (cpi->common.mb_no_coeff_skip) {
+ int mb_skippable = skippable_y && skippable_uv;
+ int mb_skip_allowed;
+
+ // Is Mb level skip allowed for this mb.
+ mb_skip_allowed =
+ !segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+ get_segdata(xd, segment_id, SEG_LVL_EOB);
+
+ if (mb_skippable) {
+ // Back out the coefficient coding costs
+ rate2 -= (rate_y + rate_uv);
+ // for best_yrd calculation
+ rate_uv = 0;
+
+ if (mb_skip_allowed) {
+ int prob_skip_cost;
+
+ // Cost the skip mb case
+ vp8_prob skip_prob =
+ get_pred_prob(cm, xd, PRED_MBSKIP);
+
+ if (skip_prob) {
+ prob_skip_cost = vp8_cost_bit(skip_prob, 1);
+ rate2 += prob_skip_cost;
+ other_cost += prob_skip_cost;
+ }
+ }
+ }
+ // Add in the cost of the no skip flag.
+ else if (mb_skip_allowed) {
+ int prob_skip_cost = vp8_cost_bit(get_pred_prob(cm, xd,
+ PRED_MBSKIP), 0);
+ rate2 += prob_skip_cost;
+ other_cost += prob_skip_cost;
+ }
+ }
+
+ // Calculate the final RD estimate for this mode.
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ }
+
+#if 0
+ // Keep record of best intra distortion
+ if ((xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
+ (this_rd < best_intra_rd)) {
+ best_intra_rd = this_rd;
+ *returnintra = distortion2;
+ }
+#endif
+
+ if (!disable_skip && xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+ if (this_rd < best_comp_rd)
+ best_comp_rd = this_rd;
+ if (this_rd < best_single_rd)
+ best_single_rd = this_rd;
+ if (this_rd < best_hybrid_rd)
+ best_hybrid_rd = this_rd;
+ }
+
+    // Did this mode help, i.e. is it the new best mode?
+ if (this_rd < best_rd || x->skip) {
+ if (!mode_excluded) {
+#if 0
+ // Note index of best mode so far
+ best_mode_index = mode_index;
+
+ if (this_mode <= B_PRED) {
+ xd->mode_info_context->mbmi.uv_mode = uv_intra_mode_8x8;
+ /* required for left and above block mv */
+ xd->mode_info_context->mbmi.mv.as_int = 0;
+ }
+#endif
+
+ other_cost += ref_costs[xd->mode_info_context->mbmi.ref_frame];
+
+ /* Calculate the final y RD estimate for this mode */
+ best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
+ (distortion2 - distortion_uv));
+
+ *returnrate = rate2;
+ *returndistortion = distortion2;
+ best_rd = this_rd;
+ vpx_memcpy(&best_mbmode, &xd->mode_info_context->mbmi, sizeof(MB_MODE_INFO));
+ }
+#if 0
+ // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time
+ cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+ cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+#endif
+ }
+ // If the mode did not help improve the best error case then raise the threshold for testing that mode next time around.
+ else {
+#if 0
+ cpi->rd_thresh_mult[mode_index] += 4;
+
+ if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+ cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+ cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+#endif
+ }
+
+ /* keep record of best compound/single-only prediction */
+ if (!disable_skip && xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
+ int single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+ if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+ single_rate = rate2 - compmode_cost;
+ hybrid_rate = rate2;
+ } else {
+ single_rate = rate2;
+ hybrid_rate = rate2 + compmode_cost;
+ }
+
+ single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+ hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+ if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME &&
+ single_rd < best_single_rd) {
+ best_single_rd = single_rd;
+ } else if (xd->mode_info_context->mbmi.second_ref_frame != INTRA_FRAME &&
+ single_rd < best_comp_rd) {
+ best_comp_rd = single_rd;
+ }
+ if (hybrid_rd < best_hybrid_rd) {
+ best_hybrid_rd = hybrid_rd;
+ }
+ }
+
+ if (x->skip && !mode_excluded)
+ break;
+ }
+
+ // TODO(rbultje) integrate with RD thresholding
+#if 0
+ // Reduce the activation RD thresholds for the best choice mode
+ if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
+ (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
+ int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
+
+ cpi->rd_thresh_mult[best_mode_index] =
+ (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ?
+ cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
+ cpi->rd_threshes[best_mode_index] =
+ (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
+ }
+#endif
+
+  // This code forces Altref,0,0 and skip for the frame that overlays an
+  // altref, unless the altref is filtered. However, this is unsafe if
+  // segment-level coding of ref frame or mode is enabled for this
+  // segment.
+ if (!segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+ !segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
+ cpi->is_src_frame_alt_ref &&
+ (cpi->oxcf.arnr_max_frames == 0) &&
+ (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
+ xd->mode_info_context->mbmi.mode = ZEROMV;
+ xd->mode_info_context->mbmi.ref_frame = ALTREF_FRAME;
+ xd->mode_info_context->mbmi.mv[0].as_int = 0;
+ xd->mode_info_context->mbmi.uv_mode = DC_PRED;
+ xd->mode_info_context->mbmi.mb_skip_coeff =
+ (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+ xd->mode_info_context->mbmi.partitioning = 0;
+
+ xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+
+ if (best_rd != INT64_MAX)
+ store_coding_context(x, &x->sb_context[0], mode_index, NULL,
+ &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
+ &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
+ 0, 0, 0);
+ return best_rd;
+ }
+
+ // macroblock modes
+ vpx_memcpy(&xd->mode_info_context->mbmi, &best_mbmode,
+ sizeof(MB_MODE_INFO));
+ xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+
+ if (best_rd != INT64_MAX)
+ store_coding_context(x, &x->sb_context[0], mode_index, NULL,
+ &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
+ &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
+ (best_single_rd == INT64_MAX) ? INT_MIN : (best_rd - best_single_rd),
+ (best_comp_rd == INT64_MAX) ? INT_MIN : (best_rd - best_comp_rd),
+ (best_hybrid_rd == INT64_MAX) ? INT_MIN : (best_rd - best_hybrid_rd));
+
+ return best_rd;
+}
+#endif
+
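store_coding_context() is handed, per prediction-type restriction (single, compound, hybrid), the gap between the overall best RD and the best RD achieved under that restriction, with INT_MIN standing in when no candidate of that type was evaluated (its best_*_rd is still INT64_MAX). A small sketch of that bookkeeping, using an illustrative helper name:

#include <limits.h>
#include <stdint.h>

/* Illustrative only: the single/comp/hybrid RD-difference bookkeeping
 * passed into store_coding_context() above. */
static int pred_type_diff(int64_t best_rd, int64_t best_rd_for_type) {
  if (best_rd_for_type == INT64_MAX)
    return INT_MIN;                       /* no candidate of this type was tried */
  return (int)(best_rd - best_rd_for_type);
}
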
+void vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+ int recon_yoffset,
+ int recon_uvoffset,
+ int *totalrate, int *totaldist) {
VP8_COMMON *cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
@@ -3694,17 +4678,6 @@ int vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
&distortion, &intra_error);
- if (mbmi->ref_frame) {
- unsigned char pred_context;
-
- pred_context = get_pred_context(cm, xd, PRED_COMP);
-
- if (mbmi->second_ref_frame == INTRA_FRAME)
- cpi->single_pred_count[pred_context]++;
- else
- cpi->comp_pred_count[pred_context]++;
- }
-
/* restore cpi->zbin_mode_boost_enabled */
cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
}
@@ -3717,5 +4690,6 @@ int vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
x->mb_context[xd->mb_index].distortion = distortion;
x->mb_context[xd->mb_index].intra_error = intra_error;
- return rate;
+ *totalrate = rate;
+ *totaldist = distortion;
}
diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h
index 2b5928de9..0e36a519d 100644
--- a/vp8/encoder/rdopt.h
+++ b/vp8/encoder/rdopt.h
@@ -18,7 +18,8 @@
extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);
extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset,
int *returnrate, int *returndistortion, int64_t *returnintra);
-extern int vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x);
+extern void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *r, int *d);
+extern void vp8_rd_pick_intra_mode_sb(VP8_COMP *cpi, MACROBLOCK *x, int *r, int *d);
extern void vp8_mv_pred
(
diff --git a/vp8/encoder/sad_c.c b/vp8/encoder/sad_c.c
index 78a87f392..4fdfd1186 100644
--- a/vp8/encoder/sad_c.c
+++ b/vp8/encoder/sad_c.c
@@ -13,29 +13,6 @@
#include "vpx_ports/config.h"
#include "vpx/vpx_integer.h"
-unsigned int vp8_sad16x16_c(
- const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- int max_sad) {
-
- int r, c;
- unsigned int sad = 0;
-
- for (r = 0; r < 16; r++) {
- for (c = 0; c < 16; c++) {
- sad += abs(src_ptr[c] - ref_ptr[c]);
- }
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- }
-
- return sad;
-}
-
-
static __inline
unsigned int sad_mx_n_c(
const unsigned char *src_ptr,
@@ -60,6 +37,21 @@ unsigned int sad_mx_n_c(
return sad;
}
+unsigned int vp8_sad32x32_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad) {
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
+}
+
+unsigned int vp8_sad16x16_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad) {
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
+}
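
Both wrappers now delegate to the shared sad_mx_n_c() helper; the max_sad argument is kept for interface compatibility, but the C reference path shown here does not use it for early termination. For reference, a minimal standalone sketch of an m x n sum of absolute differences (not the exact helper):

#include <stdlib.h>  /* abs() */

/* Illustrative only: m x n sum of absolute differences between two blocks. */
static unsigned int sad_sketch(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride,
                               int m, int n) {
  unsigned int sad = 0;
  for (int r = 0; r < n; r++) {
    for (int c = 0; c < m; c++)
      sad += abs(src[c] - ref[c]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}
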
unsigned int vp8_sad8x8_c(
const unsigned char *src_ptr,
@@ -104,6 +96,7 @@ unsigned int vp8_sad4x4_c(
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
}
+
#if CONFIG_NEWBESTREFMV
unsigned int vp8_sad2x16_c(
const unsigned char *src_ptr,
@@ -122,6 +115,34 @@ unsigned int vp8_sad16x2_c(
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 2);
}
#endif
+
+void vp8_sad32x32x3_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sad_array
+ ) {
+ sad_array[0] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr, ref_stride, 0x7fffffff);
+ sad_array[1] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp8_sad32x32x8_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array
+ ) {
+ sad_array[0] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr, ref_stride, 0x7fffffff);
+ sad_array[1] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+ sad_array[3] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, 0x7fffffff);
+ sad_array[4] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+ sad_array[5] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+ sad_array[6] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, 0x7fffffff);
+ sad_array[7] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
void vp8_sad16x16x3_c(
const unsigned char *src_ptr,
int src_stride,
@@ -267,6 +288,18 @@ void vp8_sad4x4x8_c(
sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
}
+void vp8_sad32x32x4d_c(const unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr[],
+ int ref_stride,
+ unsigned int *sad_array
+ ) {
+ sad_array[0] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
+ sad_array[1] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
+ sad_array[2] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
+ sad_array[3] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
void vp8_sad16x16x4d_c(
const unsigned char *src_ptr,
int src_stride,
diff --git a/vp8/encoder/segmentation.c b/vp8/encoder/segmentation.c
index e9d02cdd4..e88b80d34 100644
--- a/vp8/encoder/segmentation.c
+++ b/vp8/encoder/segmentation.c
@@ -200,42 +200,59 @@ void choose_segmap_coding_method(VP8_COMP *cpi) {
// in the frame
xd->mode_info_context = cm->mi;
- for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
- segment_id = xd->mode_info_context->mbmi.segment_id;
-
- // Count the number of hits on each segment with no prediction
- no_pred_segcounts[segment_id]++;
-
- // Temporal prediction not allowed on key frames
- if (cm->frame_type != KEY_FRAME) {
- // Test to see if the segment id matches the predicted value.
- int seg_predicted =
- (segment_id == get_pred_mb_segid(cm, segmap_index));
-
- // Get the segment id prediction context
- pred_context =
- get_pred_context(cm, xd, PRED_SEG_ID);
-
- // Store the prediction status for this mb and update counts
- // as appropriate
- set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
- temporal_predictor_count[pred_context][seg_predicted]++;
-
- if (!seg_predicted)
- // Update the "unpredicted" segment count
- t_unpred_seg_counts[segment_id]++;
- }
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 2) {
+ for (i = 0; i < 4; i++) {
+ static const int dx[4] = { +1, -1, +1, +1 };
+ static const int dy[4] = { 0, +1, 0, -1 };
+ int x_idx = i & 1, y_idx = i >> 1;
+
+ if (mb_col + x_idx >= cm->mb_cols ||
+ mb_row + y_idx >= cm->mb_rows) {
+ goto end;
+ }
+
+ segmap_index = (mb_row + y_idx) * cm->mb_cols + mb_col + x_idx;
+ segment_id = xd->mode_info_context->mbmi.segment_id;
+
+ // Count the number of hits on each segment with no prediction
+ no_pred_segcounts[segment_id]++;
+
+ // Temporal prediction not allowed on key frames
+ if (cm->frame_type != KEY_FRAME) {
+ // Test to see if the segment id matches the predicted value.
+ int seg_predicted =
+ (segment_id == get_pred_mb_segid(cm, segmap_index));
- // Step on to the next mb
- xd->mode_info_context++;
+ // Get the segment id prediction context
+ pred_context =
+ get_pred_context(cm, xd, PRED_SEG_ID);
- // Step on to the next entry in the segment maps
- segmap_index++;
+ // Store the prediction status for this mb and update counts
+ // as appropriate
+ set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
+ temporal_predictor_count[pred_context][seg_predicted]++;
+
+ if (!seg_predicted)
+ // Update the "unpredicted" segment count
+ t_unpred_seg_counts[segment_id]++;
+ }
+
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ assert(!i);
+ xd->mode_info_context += 2;
+ break;
+ }
+#endif
+ end:
+ xd->mode_info_context += dx[i] + dy[i] * cm->mode_info_stride;
+ }
}
// this is to account for the border in mode_info_context
- xd->mode_info_context++;
+ xd->mode_info_context -= mb_col;
+ xd->mode_info_context += cm->mode_info_stride * 2;
}
// Work out probability tree for coding segments without prediction
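
The dx/dy tables above walk the four macroblocks of each 2x2 superblock in the order top-left, top-right, bottom-left, bottom-right, and the final step lands on the top-left macroblock of the next superblock (two columns over). A standalone sketch of that pointer walk (the stride value is illustrative):

#include <stdio.h>

/* Illustrative only: the mode_info_context walk used when gathering
 * segment-map statistics per 2x2 superblock. */
int main(void) {
  static const int dx[4] = { +1, -1, +1, +1 };
  static const int dy[4] = {  0, +1,  0, -1 };
  const int stride = 8;  /* mode_info stride (illustrative) */
  int offset = 0;        /* top-left MB of the current superblock */
  for (int i = 0; i < 4; i++) {
    printf("visit MB at offset %d\n", offset);              /* prints 0, 1, 8, 9 */
    offset += dx[i] + dy[i] * stride;
  }
  printf("next superblock starts at offset %d\n", offset);  /* prints 2 */
  return 0;
}
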
diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h
index da83d1261..a2fadfc4c 100644
--- a/vp8/encoder/variance.h
+++ b/vp8/encoder/variance.h
@@ -145,8 +145,18 @@ extern prototype_sad(vp8_variance_sad16x8);
#endif
extern prototype_sad(vp8_variance_sad16x16);
+#ifndef vp8_variance_sad32x32
+#define vp8_variance_sad32x32 vp8_sad32x32_c
+#endif
+extern prototype_sad(vp8_variance_sad32x32);
+
// -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+#ifndef vp8_variance_sad32x32x3
+#define vp8_variance_sad32x32x3 vp8_sad32x32x3_c
+#endif
+extern prototype_sad_multi_same_address(vp8_variance_sad32x32x3);
+
#ifndef vp8_variance_sad16x16x3
#define vp8_variance_sad16x16x3 vp8_sad16x16x3_c
#endif
@@ -172,6 +182,11 @@ extern prototype_sad_multi_same_address(vp8_variance_sad8x16x3);
#endif
extern prototype_sad_multi_same_address(vp8_variance_sad4x4x3);
+#ifndef vp8_variance_sad32x32x8
+#define vp8_variance_sad32x32x8 vp8_sad32x32x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad32x32x8);
+
#ifndef vp8_variance_sad16x16x8
#define vp8_variance_sad16x16x8 vp8_sad16x16x8_c
#endif
@@ -199,6 +214,11 @@ extern prototype_sad_multi_same_address_1(vp8_variance_sad4x4x8);
// -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+#ifndef vp8_variance_sad32x32x4d
+#define vp8_variance_sad32x32x4d vp8_sad32x32x4d_c
+#endif
+extern prototype_sad_multi_dif_address(vp8_variance_sad32x32x4d);
+
#ifndef vp8_variance_sad16x16x4d
#define vp8_variance_sad16x16x4d vp8_sad16x16x4d_c
#endif
@@ -258,6 +278,11 @@ extern prototype_variance(vp8_variance_var16x8);
#endif
extern prototype_variance(vp8_variance_var16x16);
+#ifndef vp8_variance_var32x32
+#define vp8_variance_var32x32 vp8_variance32x32_c
+#endif
+extern prototype_variance(vp8_variance_var32x32);
+
// -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
#ifndef vp8_variance_subpixvar4x4
@@ -285,26 +310,51 @@ extern prototype_subpixvariance(vp8_variance_subpixvar16x8);
#endif
extern prototype_subpixvariance(vp8_variance_subpixvar16x16);
+#ifndef vp8_variance_subpixvar32x32
+#define vp8_variance_subpixvar32x32 vp8_sub_pixel_variance32x32_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixvar32x32);
+
#ifndef vp8_variance_halfpixvar16x16_h
#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_c
#endif
extern prototype_variance(vp8_variance_halfpixvar16x16_h);
+#ifndef vp8_variance_halfpixvar32x32_h
+#define vp8_variance_halfpixvar32x32_h vp8_variance_halfpixvar32x32_h_c
+#endif
+extern prototype_variance(vp8_variance_halfpixvar32x32_h);
+
#ifndef vp8_variance_halfpixvar16x16_v
#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c
#endif
extern prototype_variance(vp8_variance_halfpixvar16x16_v);
+#ifndef vp8_variance_halfpixvar32x32_v
+#define vp8_variance_halfpixvar32x32_v vp8_variance_halfpixvar32x32_v_c
+#endif
+extern prototype_variance(vp8_variance_halfpixvar32x32_v);
+
#ifndef vp8_variance_halfpixvar16x16_hv
#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_c
#endif
extern prototype_variance(vp8_variance_halfpixvar16x16_hv);
+#ifndef vp8_variance_halfpixvar32x32_hv
+#define vp8_variance_halfpixvar32x32_hv vp8_variance_halfpixvar32x32_hv_c
+#endif
+extern prototype_variance(vp8_variance_halfpixvar32x32_hv);
+
#ifndef vp8_variance_subpixmse16x16
#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_c
#endif
extern prototype_subpixvariance(vp8_variance_subpixmse16x16);
+#ifndef vp8_variance_subpixmse32x32
+#define vp8_variance_subpixmse32x32 vp8_sub_pixel_mse32x32_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixmse32x32);
+
// -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
#ifndef vp8_variance_getmbss
@@ -349,38 +399,66 @@ typedef struct {
vp8_sad_fn_t sad8x16;
vp8_sad_fn_t sad16x8;
vp8_sad_fn_t sad16x16;
+#if CONFIG_SUPERBLOCKS
+ vp8_sad_fn_t sad32x32;
+#endif
vp8_variance_fn_t var4x4;
vp8_variance_fn_t var8x8;
vp8_variance_fn_t var8x16;
vp8_variance_fn_t var16x8;
vp8_variance_fn_t var16x16;
+#if CONFIG_SUPERBLOCKS
+ vp8_variance_fn_t var32x32;
+#endif
vp8_subpixvariance_fn_t subpixvar4x4;
vp8_subpixvariance_fn_t subpixvar8x8;
vp8_subpixvariance_fn_t subpixvar8x16;
vp8_subpixvariance_fn_t subpixvar16x8;
vp8_subpixvariance_fn_t subpixvar16x16;
+#if CONFIG_SUPERBLOCKS
+ vp8_subpixvariance_fn_t subpixvar32x32;
+#endif
vp8_variance_fn_t halfpixvar16x16_h;
+ vp8_variance_fn_t halfpixvar32x32_h;
vp8_variance_fn_t halfpixvar16x16_v;
+#if CONFIG_SUPERBLOCKS
+ vp8_variance_fn_t halfpixvar32x32_v;
+#endif
vp8_variance_fn_t halfpixvar16x16_hv;
+#if CONFIG_SUPERBLOCKS
+ vp8_variance_fn_t halfpixvar32x32_hv;
+#endif
vp8_subpixvariance_fn_t subpixmse16x16;
+#if CONFIG_SUPERBLOCKS
+ vp8_subpixvariance_fn_t subpixmse32x32;
+#endif
vp8_getmbss_fn_t getmbss;
vp8_variance_fn_t mse16x16;
+#if CONFIG_SUPERBLOCKS
+ vp8_sad_multi_fn_t sad32x32x3;
+#endif
vp8_sad_multi_fn_t sad16x16x3;
vp8_sad_multi_fn_t sad16x8x3;
vp8_sad_multi_fn_t sad8x16x3;
vp8_sad_multi_fn_t sad8x8x3;
vp8_sad_multi_fn_t sad4x4x3;
+#if CONFIG_SUPERBLOCKS
+ vp8_sad_multi1_fn_t sad32x32x8;
+#endif
vp8_sad_multi1_fn_t sad16x16x8;
vp8_sad_multi1_fn_t sad16x8x8;
vp8_sad_multi1_fn_t sad8x16x8;
vp8_sad_multi1_fn_t sad8x8x8;
vp8_sad_multi1_fn_t sad4x4x8;
+#if CONFIG_SUPERBLOCKS
+ vp8_sad_multi_d_fn_t sad32x32x4d;
+#endif
vp8_sad_multi_d_fn_t sad16x16x4d;
vp8_sad_multi_d_fn_t sad16x8x4d;
vp8_sad_multi_d_fn_t sad8x16x4d;
diff --git a/vp8/encoder/variance_c.c b/vp8/encoder/variance_c.c
index 0b9d569b0..cbe2a51d6 100644
--- a/vp8/encoder/variance_c.c
+++ b/vp8/encoder/variance_c.c
@@ -55,6 +55,20 @@ static void variance(
}
}
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_variance32x32_c(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int var;
+ int avg;
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, &var, &avg);
+ *sse = var;
+ return (var - ((avg * avg) >> 10));
+}
+#endif
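
In vp8_variance32x32_c the helper fills var with the sum of squared pixel differences and avg with the plain sum of differences, so the >> 10 applies the usual identity Var = SSE - sum^2 / N with N = 32 * 32 = 1024. A sketch of just that arithmetic (the 64-bit intermediate for the squared sum and the names are illustrative):

#include <stdint.h>

/* Illustrative only: the block-variance identity used above for 32x32.
 * sse = sum of squared differences, sum = sum of differences, N = 1024. */
static unsigned int variance32x32_sketch(unsigned int sse, int sum) {
  return sse - (unsigned int)(((int64_t)sum * sum) >> 10);  /* sse - sum*sum/1024 */
}
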
unsigned int vp8_variance16x16_c(
const unsigned char *src_ptr,
@@ -334,6 +348,27 @@ unsigned int vp8_sub_pixel_variance16x16_c
return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
}
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_sub_pixel_variance32x32_c(const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+  unsigned short FData3[33 * 32];  // Temp data buffer used in filtering
+ unsigned char temp2[36 * 32];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);
+ var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);
+
+ return vp8_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+#endif
unsigned int vp8_variance_halfpixvar16x16_h_c(
const unsigned char *src_ptr,
@@ -345,17 +380,38 @@ unsigned int vp8_variance_halfpixvar16x16_h_c(
ref_ptr, recon_stride, sse);
}
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_variance_halfpixvar32x32_h_c(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ return vp8_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0,
+ ref_ptr, recon_stride, sse);
+}
+#endif
+
-unsigned int vp8_variance_halfpixvar16x16_v_c(
+unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8,
+ ref_ptr, recon_stride, sse);
+}
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_variance_halfpixvar32x32_v_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
- return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8,
+ return vp8_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8,
ref_ptr, recon_stride, sse);
}
-
+#endif
unsigned int vp8_variance_halfpixvar16x16_hv_c(
const unsigned char *src_ptr,
@@ -367,6 +423,16 @@ unsigned int vp8_variance_halfpixvar16x16_hv_c(
ref_ptr, recon_stride, sse);
}
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_variance_halfpixvar32x32_hv_c(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ return vp8_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8,
+ ref_ptr, recon_stride, sse);
+}
+#endif
unsigned int vp8_sub_pixel_mse16x16_c
(
@@ -382,6 +448,19 @@ unsigned int vp8_sub_pixel_mse16x16_c
return *sse;
}
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_sub_pixel_mse32x32_c(const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ vp8_sub_pixel_variance32x32_c(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+ return *sse;
+}
+#endif
+
unsigned int vp8_sub_pixel_variance16x8_c
(
const unsigned char *src_ptr,