17 files changed, 1303 insertions, 988 deletions
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 26f22b4be..a95d7eb46 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -12,35 +12,37 @@
 #include <stdio.h>
 #include <limits.h>
 
-#include "vp9/common/vp9_header.h"
-#include "vp9/encoder/vp9_encodemv.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_mem/vpx_mem.h"
+
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_findnearmv.h"
 #include "vp9/common/vp9_tile_common.h"
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/common/vp9_pragmas.h"
-#include "vpx/vpx_encoder.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/encoder/vp9_bitstream.h"
-#include "vp9/encoder/vp9_segmentation.h"
-
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_treecoder.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_pragmas.h"
+
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_bitstream.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_write_bit_buffer.h"
+
 
 #if defined(SECTIONBITS_OUTPUT)
 unsigned __int64 Sectionbits[500];
 #endif
 
 #ifdef ENTROPY_STATS
-int intra_mode_stats[VP9_KF_BINTRAMODES]
-                    [VP9_KF_BINTRAMODES]
-                    [VP9_KF_BINTRAMODES];
+int intra_mode_stats[VP9_INTRA_MODES]
+                    [VP9_INTRA_MODES]
+                    [VP9_INTRA_MODES];
 vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES];
 vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES];
 vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES];
@@ -243,15 +245,12 @@ static void update_mbintra_mode_probs(VP9_COMP* const cpi,
                                       vp9_writer* const bc) {
   VP9_COMMON *const cm = &cpi->common;
 
-  vp9_prob pnew[VP9_YMODES - 1];
-  unsigned int bct[VP9_YMODES - 1][2];
-
-  update_mode(bc, VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree, pnew,
-              cm->fc.ymode_prob, bct, (unsigned int *)cpi->ymode_count);
+  vp9_prob pnew[VP9_INTRA_MODES - 1];
+  unsigned int bct[VP9_INTRA_MODES - 1][2];
 
-  update_mode(bc, VP9_I32X32_MODES, vp9_sb_ymode_encodings,
-              vp9_sb_ymode_tree, pnew, cm->fc.sb_ymode_prob, bct,
-              (unsigned int *)cpi->sb_ymode_count);
+  update_mode(bc, VP9_INTRA_MODES, vp9_intra_mode_encodings,
+              vp9_intra_mode_tree, pnew,
+              cm->fc.y_mode_prob, bct, (unsigned int *)cpi->y_mode_count);
 }
 
 void vp9_update_skip_probs(VP9_COMP *cpi) {
@@ -322,15 +321,15 @@ static void update_refpred_stats(VP9_COMP *cpi) {
 // The branch counts table is re-populated during the actual pack stage and in
 // the decoder to facilitate backwards update of the context.
 static void update_inter_mode_probs(VP9_COMMON *cm,
-                                    int mode_context[INTER_MODE_CONTEXTS][4]) {
+    int mode_context[INTER_MODE_CONTEXTS][VP9_MVREFS - 1]) {
   int i, j;
-  unsigned int (*mv_ref_ct)[4][2] = cm->fc.mv_ref_ct;
+  unsigned int (*mv_ref_ct)[VP9_MVREFS - 1][2] = cm->fc.mv_ref_ct;
 
   vpx_memcpy(mode_context, cm->fc.vp9_mode_contexts,
              sizeof(cm->fc.vp9_mode_contexts));
 
   for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
-    for (j = 0; j < 4; j++) {
+    for (j = 0; j < VP9_MVREFS - 1; j++) {
       int new_prob, old_cost, new_cost;
 
       // Work out cost of coding branches with the old and optimal probability
@@ -348,28 +347,8 @@ static void update_inter_mode_probs(VP9_COMMON *cm,
   }
 }
 
-static void write_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_ymode_tree, p, vp9_ymode_encodings + m);
-}
-
-static void kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_kf_ymode_tree, p, vp9_kf_ymode_encodings + m);
-}
-
-static void write_sb_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_sb_ymode_tree, p, vp9_sb_ymode_encodings + m);
-}
-
-static void sb_kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_uv_mode_tree, p, vp9_sb_kf_ymode_encodings + m);
-}
-
-static void write_uv_mode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_uv_mode_tree, p, vp9_uv_mode_encodings + m);
-}
-
-static void write_kf_bmode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_bmode_tree, p, vp9_kf_bmode_encodings + m);
+static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) {
+  write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m);
 }
 
 static int prob_update_savings(const unsigned int *ct,
@@ -471,7 +450,6 @@ static void pack_mb_tokens(vp9_writer* const bc,
     const vp9_prob *pp;
     int v = a->value;
     int n = a->len;
-    int ncount = n;
     vp9_prob probs[ENTROPY_NODES];
 
     if (t == EOSB_TOKEN) {
@@ -487,18 +465,25 @@ static void pack_mb_tokens(vp9_writer* const bc,
     assert(pp != 0);
 
     /* skip one or two nodes */
+#if !CONFIG_BALANCED_COEFTREE
     if (p->skip_eob_node) {
       n -= p->skip_eob_node;
       i = 2 * p->skip_eob_node;
-      ncount -= p->skip_eob_node;
     }
+#endif
 
     do {
       const int bb = (v >> --n) & 1;
+#if CONFIG_BALANCED_COEFTREE
+      if (i == 2 && p->skip_eob_node) {
+        i += 2;
+        assert(bb == 1);
+        continue;
+      }
+#endif
       vp9_write(bc, bb, pp[i >> 1]);
       i = vp9_coef_tree[i + bb];
-      ncount--;
-    } while (n && ncount);
+    } while (n);
 
     if (b->base_val) {
       const int e = p->extra, l = b->len;
@@ -524,19 +509,10 @@ static void pack_mb_tokens(vp9_writer* const bc,
   *tp = p;
 }
 
-static void write_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m,
-                         const vp9_prob *p) {
-#if CONFIG_DEBUG
-  assert(NEARESTMV <= m  &&  m <= SPLITMV);
-#endif
-  write_token(bc, vp9_mv_ref_tree, p,
-              vp9_mv_ref_encoding_array - NEARESTMV + m);
-}
-
 static void write_sb_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m,
                             const vp9_prob *p) {
 #if CONFIG_DEBUG
-  assert(NEARESTMV <= m  &&  m < SPLITMV);
+  assert(NEARESTMV <= m && m <= NEWMV);
 #endif
   write_token(bc, vp9_sb_mv_ref_tree, p,
               vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
@@ -699,25 +675,37 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
   // Encode the reference frame.
   encode_ref_frame(bc, pc, xd, segment_id, rf);
 
+  if (mi->sb_type >= BLOCK_SIZE_SB8X8 && pc->txfm_mode == TX_MODE_SELECT &&
+      !(rf != INTRA_FRAME &&
+        (skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
+    TX_SIZE sz = mi->txfm_size;
+    // FIXME(rbultje) code ternary symbol once all experiments are merged
+    vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
+    if (mi->sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) {
+      vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
+      if (mi->sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
+        vp9_write(bc, sz != TX_16X16, pc->prob_tx[2]);
+    }
+  }
+
   if (rf == INTRA_FRAME) {
 #ifdef ENTROPY_STATS
     active_section = 6;
 #endif
 
-    if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8)
-      write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
-
-    if (m->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+    if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
+      write_intra_mode(bc, mode, pc->fc.y_mode_prob);
+    } else {
       int idx, idy;
       int bw = 1 << b_width_log2(mi->sb_type);
       int bh = 1 << b_height_log2(mi->sb_type);
       for (idy = 0; idy < 2; idy += bh)
         for (idx = 0; idx < 2; idx += bw)
-          write_sb_ymode(bc, m->bmi[idy * 2 + idx].as_mode.first,
-                         pc->fc.sb_ymode_prob);
+          write_intra_mode(bc, m->bmi[idy * 2 + idx].as_mode.first,
+                           pc->fc.y_mode_prob);
     }
-    write_uv_mode(bc, mi->uv_mode,
-                  pc->fc.uv_mode_prob[mode]);
+    write_intra_mode(bc, mi->uv_mode,
+                     pc->fc.uv_mode_prob[mode]);
   } else {
     vp9_prob mv_ref_p[VP9_MVREFS - 1];
 
@@ -729,21 +717,20 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
 
     // If segment skip is not enabled code the mode.
     if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
-      if (mi->sb_type >= BLOCK_SIZE_SB8X8)
+      if (mi->sb_type >= BLOCK_SIZE_SB8X8) {
         write_sb_mv_ref(bc, mode, mv_ref_p);
-      vp9_accum_mv_refs(&cpi->common, mode, mi->mb_mode_context[rf]);
+        vp9_accum_mv_refs(&cpi->common, mode, mi->mb_mode_context[rf]);
+      }
     }
 
-    if (is_inter_mode(mode)) {
-      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-        write_token(bc, vp9_switchable_interp_tree,
-                    vp9_get_pred_probs(&cpi->common, xd,
-                                       PRED_SWITCHABLE_INTERP),
-                    vp9_switchable_interp_encodings +
-                    vp9_switchable_interp_map[mi->interp_filter]);
-      } else {
-        assert(mi->interp_filter == cpi->common.mcomp_filter_type);
-      }
+    if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+      write_token(bc, vp9_switchable_interp_tree,
+                  vp9_get_pred_probs(&cpi->common, xd,
+                                     PRED_SWITCHABLE_INTERP),
+                  vp9_switchable_interp_encodings +
+                  vp9_switchable_interp_map[mi->interp_filter]);
+    } else {
+      assert(mi->interp_filter == cpi->common.mcomp_filter_type);
     }
 
     // does the feature use compound prediction or not
@@ -753,70 +740,51 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
                 vp9_get_pred_prob(pc, xd, PRED_COMP));
     }
 
-    switch (mode) { /* new, split require MVs */
-      case NEWMV:
+    if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+      int j;
+      MB_PREDICTION_MODE blockmode;
+      int_mv blockmv;
+      int bwl = b_width_log2(mi->sb_type), bw = 1 << bwl;
+      int bhl = b_height_log2(mi->sb_type), bh = 1 << bhl;
+      int idx, idy;
+      for (idy = 0; idy < 2; idy += bh) {
+        for (idx = 0; idx < 2; idx += bw) {
+          j = idy * 2 + idx;
+          blockmode = cpi->mb.partition_info->bmi[j].mode;
+          blockmv = cpi->mb.partition_info->bmi[j].mv;
+          write_sb_mv_ref(bc, blockmode, mv_ref_p);
+          vp9_accum_mv_refs(&cpi->common, blockmode, mi->mb_mode_context[rf]);
+          if (blockmode == NEWMV) {
 #ifdef ENTROPY_STATS
-        active_section = 5;
+            active_section = 11;
 #endif
-        vp9_encode_mv(bc,
-                      &mi->mv[0].as_mv, &mi->best_mv.as_mv,
-                      nmvc, xd->allow_high_precision_mv);
+            vp9_encode_mv(bc, &blockmv.as_mv, &mi->best_mv.as_mv,
+                          nmvc, xd->allow_high_precision_mv);
 
-        if (mi->second_ref_frame > 0)
-          vp9_encode_mv(bc,
-                        &mi->mv[1].as_mv, &mi->best_second_mv.as_mv,
-                        nmvc, xd->allow_high_precision_mv);
-        break;
-      case SPLITMV: {
-        int j;
-        MB_PREDICTION_MODE blockmode;
-        int_mv blockmv;
-        int bwl = b_width_log2(mi->sb_type), bw = 1 << bwl;
-        int bhl = b_height_log2(mi->sb_type), bh = 1 << bhl;
-        int idx, idy;
-        for (idy = 0; idy < 2; idy += bh) {
-          for (idx = 0; idx < 2; idx += bw) {
-            j = idy * 2 + idx;
-            blockmode = cpi->mb.partition_info->bmi[j].mode;
-            blockmv = cpi->mb.partition_info->bmi[j].mv;
-            write_sb_mv_ref(bc, blockmode, mv_ref_p);
-            vp9_accum_mv_refs(&cpi->common, blockmode, mi->mb_mode_context[rf]);
-            if (blockmode == NEWMV) {
-#ifdef ENTROPY_STATS
-              active_section = 11;
-#endif
-              vp9_encode_mv(bc, &blockmv.as_mv, &mi->best_mv.as_mv,
+            if (mi->second_ref_frame > 0)
+              vp9_encode_mv(bc,
+                            &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
+                            &mi->best_second_mv.as_mv,
                             nmvc, xd->allow_high_precision_mv);
-
-              if (mi->second_ref_frame > 0)
-                vp9_encode_mv(bc,
-                              &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
-                              &mi->best_second_mv.as_mv,
-                              nmvc, xd->allow_high_precision_mv);
-            }
           }
         }
+      }
 
 #ifdef MODE_STATS
-        ++count_mb_seg[mi->partitioning];
+      ++count_mb_seg[mi->partitioning];
 #endif
-        break;
-      }
-      default:
-        break;
-    }
-  }
+    } else if (mode == NEWMV) {
+#ifdef ENTROPY_STATS
+      active_section = 5;
+#endif
+      vp9_encode_mv(bc,
+                    &mi->mv[0].as_mv, &mi->best_mv.as_mv,
+                    nmvc, xd->allow_high_precision_mv);
 
-  if (mi->sb_type >= BLOCK_SIZE_SB8X8 && pc->txfm_mode == TX_MODE_SELECT &&
-      !(rf != INTRA_FRAME &&
-        (skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
-    TX_SIZE sz = mi->txfm_size;
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
-    if (mi->sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) {
-      vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
-      if (mi->sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
-        vp9_write(bc, sz != TX_16X16, pc->prob_tx[2]);
+      if (mi->second_ref_frame > 0)
+        vp9_encode_mv(bc,
+                      &mi->mv[1].as_mv, &mi->best_second_mv.as_mv,
+                      nmvc, xd->allow_high_precision_mv);
     }
   }
 }
@@ -841,14 +809,23 @@ static void write_mb_modes_kf(const VP9_COMP *cpi,
     vp9_write(bc, skip_coeff, vp9_get_pred_prob(c, xd, PRED_MBSKIP));
   }
 
+  if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->txfm_mode == TX_MODE_SELECT) {
+    TX_SIZE sz = m->mbmi.txfm_size;
+    // FIXME(rbultje) code ternary symbol once all experiments are merged
+    vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
+    if (m->mbmi.sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) {
+      vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);
+      if (m->mbmi.sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
+        vp9_write(bc, sz != TX_16X16, c->prob_tx[2]);
+    }
+  }
+
   if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
     const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis);
     const MB_PREDICTION_MODE L = xd->left_available ?
                                  left_block_mode(m, 0) : DC_PRED;
-    write_kf_bmode(bc, ym, c->kf_bmode_prob[A][L]);
-  }
-
-  if (m->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+    write_intra_mode(bc, ym, c->kf_y_mode_prob[A][L]);
+  } else {
     int idx, idy;
     int bw = 1 << b_width_log2(m->mbmi.sb_type);
     int bh = 1 << b_height_log2(m->mbmi.sb_type);
@@ -858,24 +835,16 @@ static void write_mb_modes_kf(const VP9_COMP *cpi,
         const MB_PREDICTION_MODE A = above_block_mode(m, i, mis);
         const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
                                      left_block_mode(m, i) : DC_PRED;
-        write_kf_bmode(bc, m->bmi[i].as_mode.first,
-                       c->kf_bmode_prob[A][L]);
+        const int bm = m->bmi[i].as_mode.first;
+#ifdef ENTROPY_STATS
+        ++intra_mode_stats[A][L][bm];
+#endif
+        write_intra_mode(bc, bm, c->kf_y_mode_prob[A][L]);
       }
     }
   }
 
-  write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
-
-  if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->txfm_mode == TX_MODE_SELECT) {
-    TX_SIZE sz = m->mbmi.txfm_size;
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
-    if (m->mbmi.sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) {
-      vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);
-      if (m->mbmi.sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
-        vp9_write(bc, sz != TX_16X16, c->prob_tx[2]);
-    }
-  }
+  write_intra_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
 }
 
 static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
@@ -1045,7 +1014,7 @@ static void print_prob_tree(vp9_coeff_probs *coef_probs, int block_types) {
   fclose(f);
 }
 
-static void build_tree_distribution(vp9_coeff_probs *coef_probs,
+static void build_tree_distribution(vp9_coeff_probs_model *coef_probs,
                                     vp9_coeff_count *coef_counts,
                                     unsigned int (*eob_branch_ct)[REF_TYPES]
                                                                  [COEF_BANDS]
@@ -1060,6 +1029,7 @@ static void build_tree_distribution(vp9_coeff_probs *coef_probs,
 #ifdef ENTROPY_STATS
   int t = 0;
 #endif
+  vp9_prob full_probs[ENTROPY_NODES];
 
   for (i = 0; i < block_types; ++i) {
     for (j = 0; j < REF_TYPES; ++j) {
@@ -1068,14 +1038,24 @@ static void build_tree_distribution(vp9_coeff_probs *coef_probs,
           if (l >= 3 && k == 0)
             continue;
           vp9_tree_probs_from_distribution(vp9_coef_tree,
-                                           coef_probs[i][j][k][l],
+                                           full_probs,
                                            coef_branch_ct[i][j][k][l],
                                            coef_counts[i][j][k][l], 0);
+          vpx_memcpy(coef_probs[i][j][k][l], full_probs,
+                     sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+#if CONFIG_BALANCED_COEFTREE
+          coef_branch_ct[i][j][k][l][1][1] = eob_branch_ct[i][j][k][l] -
+                                             coef_branch_ct[i][j][k][l][1][0];
+          coef_probs[i][j][k][l][1] =
+              get_binary_prob(coef_branch_ct[i][j][k][l][1][0],
+                              coef_branch_ct[i][j][k][l][1][1]);
+#else
           coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] -
                                              coef_branch_ct[i][j][k][l][0][0];
           coef_probs[i][j][k][l][0] =
               get_binary_prob(coef_branch_ct[i][j][k][l][0][0],
                               coef_branch_ct[i][j][k][l][0][1]);
+#endif
 #ifdef ENTROPY_STATS
           if (!cpi->dummy_packing) {
             for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
@@ -1127,7 +1107,7 @@ static void update_coef_probs_common(
 #ifdef ENTROPY_STATS
     vp9_coeff_stats *tree_update_hist,
 #endif
-    vp9_coeff_probs *new_frame_coef_probs,
+    vp9_coeff_probs_model *new_frame_coef_probs,
     vp9_coeff_probs_model *old_frame_coef_probs,
     vp9_coeff_stats *frame_branch_ct,
     TX_SIZE tx_size) {
@@ -1136,7 +1116,6 @@ static void update_coef_probs_common(
   int savings;
 
   const int entropy_nodes_update = UNCONSTRAINED_NODES;
-  // vp9_prob bestupd = find_coef_update_prob(cpi);
 
   const int tstart = 0;
   /* dry run to see if there is any udpate at all needed */
@@ -1150,7 +1129,7 @@ static void update_coef_probs_common(
             vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
             const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t];
             const vp9_prob upd = vp9_coef_update_prob[t];
-            int s;  // = prev_coef_savings[t];
+            int s;
             int u = 0;
 
             if (l >= 3 && k == 0)
@@ -1192,11 +1171,10 @@ static void update_coef_probs_common(
             vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
             vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
             const vp9_prob upd = vp9_coef_update_prob[t];
-            int s;  // = prev_coef_savings[t];
+            int s;
             int u = 0;
             if (l >= 3 && k == 0)
               continue;
-
             if (t == PIVOT_NODE)
               s = prob_diff_update_savings_search_model(
                   frame_branch_ct[i][j][k][l][0],
@@ -1278,37 +1256,6 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
   }
 }
 
-#ifdef PACKET_TESTING
-FILE *vpxlogc = 0;
-#endif
-
-static void decide_kf_ymode_entropy(VP9_COMP *cpi) {
-  int mode_cost[MB_MODE_COUNT];
-  int bestcost = INT_MAX;
-  int bestindex = 0;
-  int i, j;
-
-  for (i = 0; i < 8; i++) {
-    int cost = 0;
-
-    vp9_cost_tokens(mode_cost, cpi->common.kf_ymode_prob[i], vp9_kf_ymode_tree);
-
-    for (j = 0; j < VP9_YMODES; j++)
-      cost += mode_cost[j] * cpi->ymode_count[j];
-
-    vp9_cost_tokens(mode_cost, cpi->common.sb_kf_ymode_prob[i],
-                    vp9_sb_ymode_tree);
-    for (j = 0; j < VP9_I32X32_MODES; j++)
-      cost += mode_cost[j] * cpi->sb_ymode_count[j];
-
-    if (cost < bestcost) {
-      bestindex = i;
-      bestcost = cost;
-    }
-  }
-  cpi->common.kf_ymode_probs_index = bestindex;
-
-}
 static void segment_reference_frames(VP9_COMP *cpi) {
   VP9_COMMON *oci = &cpi->common;
   MODE_INFO *mi = oci->mi;
@@ -1417,9 +1364,6 @@ static void encode_segmentation(VP9_COMP *cpi, vp9_writer *w) {
 
   // Segmentation map
   vp9_write_bit(w, xd->update_mb_segmentation_map);
-#if CONFIG_IMPLICIT_SEGMENTATION
-  vp9_write_bit(w, xd->allow_implicit_segment_update);
-#endif
   if (xd->update_mb_segmentation_map) {
     // Select the coding strategy (temporal or spatial)
     vp9_choose_segmap_coding_method(cpi);
@@ -1482,60 +1426,67 @@ static void encode_segmentation(VP9_COMP *cpi, vp9_writer *w) {
   }
 }
 
-void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
-  int i;
-  VP9_HEADER oh;
-  VP9_COMMON *const pc = &cpi->common;
-  vp9_writer header_bc, residual_bc;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  int extra_bytes_packed = 0;
-
-  uint8_t *cx_data = dest;
+void write_uncompressed_header(VP9_COMMON *cm,
+                               struct vp9_write_bit_buffer *wb) {
+  const int scaling_active = cm->width != cm->display_width ||
+                             cm->height != cm->display_height;
 
-  oh.show_frame = (int) pc->show_frame;
-  oh.type = (int)pc->frame_type;
-  oh.version = pc->version;
-  oh.first_partition_length_in_bytes = 0;
+  vp9_wb_write_bit(wb, cm->frame_type);
+  vp9_wb_write_literal(wb, cm->version, 3);
+  vp9_wb_write_bit(wb, cm->show_frame);
+  vp9_wb_write_bit(wb, scaling_active);
+  vp9_wb_write_bit(wb, cm->subsampling_x);
+  vp9_wb_write_bit(wb, cm->subsampling_y);
 
-  cx_data += 3;
+  if (cm->frame_type == KEY_FRAME) {
+    vp9_wb_write_literal(wb, SYNC_CODE_0, 8);
+    vp9_wb_write_literal(wb, SYNC_CODE_1, 8);
+    vp9_wb_write_literal(wb, SYNC_CODE_2, 8);
+  }
 
-#if defined(SECTIONBITS_OUTPUT)
-  Sectionbits[active_section = 1] += sizeof(VP9_HEADER) * 8 * 256;
-#endif
+  if (scaling_active) {
+    vp9_wb_write_literal(wb, cm->display_width, 16);
+    vp9_wb_write_literal(wb, cm->display_height, 16);
+  }
 
-  compute_update_table();
+  vp9_wb_write_literal(wb, cm->width, 16);
+  vp9_wb_write_literal(wb, cm->height, 16);
 
-  /* every keyframe send startcode, width, height, scale factor, clamp
-   * and color type.
-   */
-  if (oh.type == KEY_FRAME) {
-    // Start / synch code
-    cx_data[0] = 0x49;
-    cx_data[1] = 0x83;
-    cx_data[2] = 0x42;
-    extra_bytes_packed = 3;
-    cx_data += extra_bytes_packed;
+  if (!cm->show_frame) {
+      vp9_wb_write_bit(wb, cm->intra_only);
   }
 
-  if (pc->width != pc->display_width || pc->height != pc->display_height) {
-    write_le16(cx_data, pc->display_width);
-    write_le16(cx_data + 2, pc->display_height);
-    cx_data += 4;
-    extra_bytes_packed += 4;
+  vp9_wb_write_literal(wb, cm->frame_context_idx, NUM_FRAME_CONTEXTS_LG2);
+  vp9_wb_write_bit(wb, cm->clr_type);
+
+  vp9_wb_write_bit(wb, cm->error_resilient_mode);
+  if (!cm->error_resilient_mode) {
+    vp9_wb_write_bit(wb, cm->reset_frame_context);
+    vp9_wb_write_bit(wb, cm->refresh_frame_context);
+    vp9_wb_write_bit(wb, cm->frame_parallel_decoding_mode);
   }
+}
 
-  write_le16(cx_data, pc->width);
-  write_le16(cx_data + 2, pc->height);
-  extra_bytes_packed += 4;
-  cx_data += 4;
+void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
+  int i, bytes_packed;
+  VP9_COMMON *const pc = &cpi->common;
+  vp9_writer header_bc, residual_bc;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
-  vp9_start_encode(&header_bc, cx_data);
+  uint8_t *cx_data = dest;
+  struct vp9_write_bit_buffer wb = {dest, 0};
+  struct vp9_write_bit_buffer first_partition_size_wb;
+
+  write_uncompressed_header(pc, &wb);
+  first_partition_size_wb = wb;
+  vp9_wb_write_literal(&wb, 0, 16);  // placeholder; real size patched in later
 
-  // TODO(jkoleszar): remove these two unused bits?
-  vp9_write_bit(&header_bc, pc->clr_type);
+  bytes_packed = vp9_rb_bytes_written(&wb);
+  cx_data += bytes_packed;
 
-  // error resilient mode
-  vp9_write_bit(&header_bc, pc->error_resilient_mode);
+  compute_update_table();
+
+  vp9_start_encode(&header_bc, cx_data);
 
   encode_loopfilter(pc, xd, &header_bc);
 
@@ -1617,14 +1568,6 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
       vp9_write_literal(&header_bc, (pc->mcomp_filter_type), 2);
   }
 
-  if (!pc->error_resilient_mode) {
-    vp9_write_bit(&header_bc, pc->refresh_frame_context);
-    vp9_write_bit(&header_bc, pc->frame_parallel_decoding_mode);
-  }
-
-  vp9_write_literal(&header_bc, pc->frame_context_idx,
-                    NUM_FRAME_CONTEXTS_LG2);
-
 #ifdef ENTROPY_STATS
   if (pc->frame_type == INTER_FRAME)
     active_section = 0;
@@ -1694,7 +1637,7 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
   // changes in the bitstream.
   if (pc->frame_type != KEY_FRAME) {
     int i, j;
-    int new_context[INTER_MODE_CONTEXTS][4];
+    int new_context[INTER_MODE_CONTEXTS][VP9_MVREFS - 1];
     if (!cpi->dummy_packing) {
       update_inter_mode_probs(pc, new_context);
     } else {
@@ -1704,7 +1647,7 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
     }
 
     for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
-      for (j = 0; j < 4; j++) {
+      for (j = 0; j < VP9_MVREFS - 1; j++) {
         if (new_context[i][j] != pc->fc.vp9_mode_contexts[i][j]) {
           vp9_write(&header_bc, 1, 252);
           vp9_write_prob(&header_bc, new_context[i][j]);
@@ -1731,10 +1674,8 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
   vp9_copy(cpi->common.fc.pre_coef_probs_32x32,
            cpi->common.fc.coef_probs_32x32);
 
-  vp9_copy(cpi->common.fc.pre_sb_ymode_prob, cpi->common.fc.sb_ymode_prob);
-  vp9_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob);
+  vp9_copy(cpi->common.fc.pre_y_mode_prob, cpi->common.fc.y_mode_prob);
   vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);
-  vp9_copy(cpi->common.fc.pre_bmode_prob, cpi->common.fc.bmode_prob);
   vp9_copy(cpi->common.fc.pre_partition_prob, cpi->common.fc.partition_prob);
   cpi->common.fc.pre_nmvc = cpi->common.fc.nmvc;
   vp9_zero(cpi->common.fc.mv_ref_ct);
@@ -1750,11 +1691,7 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
     vp9_write_prob(&header_bc, pc->mbskip_pred_probs[i]);
   }
 
-  if (pc->frame_type == KEY_FRAME) {
-    if (!pc->kf_ymode_probs_update) {
-      vp9_write_literal(&header_bc, pc->kf_ymode_probs_index, 3);
-    }
-  } else {
+  if (pc->frame_type != KEY_FRAME) {
     // Update the probabilities used to encode reference frame data
     update_ref_probs(cpi);
 
@@ -1820,35 +1757,11 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
 
   vp9_stop_encode(&header_bc);
 
-  oh.first_partition_length_in_bytes = header_bc.pos;
-
-  /* update frame tag */
-  {
-    int scaling = (pc->width != pc->display_width ||
-                   pc->height != pc->display_height);
-    int v = (oh.first_partition_length_in_bytes << 8) |
-            (pc->subsampling_y << 7) |
-            (pc->subsampling_x << 6) |
-            (scaling << 5) |
-            (oh.show_frame << 4) |
-            (oh.version << 1) |
-            oh.type;
-
-    assert(oh.first_partition_length_in_bytes <= 0xffff);
-    dest[0] = v;
-    dest[1] = v >> 8;
-    dest[2] = v >> 16;
-  }
-
-  *size = VP9_HEADER_SIZE + extra_bytes_packed + header_bc.pos;
 
-  if (pc->frame_type == KEY_FRAME) {
-    decide_kf_ymode_entropy(cpi);
-  } else {
-    /* This is not required if the counts in cpi are consistent with the
-     * final packing pass */
-    // if (!cpi->dummy_packing) vp9_zero(cpi->NMVcount);
-  }
+  // Back-patch the first partition size into the 16 bits reserved earlier.
+  assert(header_bc.pos <= 0xffff);
+  vp9_wb_write_literal(&first_partition_size_wb, header_bc.pos, 16);
+  *size = bytes_packed + header_bc.pos;
 
   {
     int tile_row, tile_col, total_size = 0;
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 211eca4b4..e6d36cdf8 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -114,10 +114,9 @@ struct macroblock {
   int *nmvsadcost_hp[2];
   int **mvsadcost;
 
-  int mbmode_cost[2][MB_MODE_COUNT];
+  int mbmode_cost[MB_MODE_COUNT];
   int intra_uv_mode_cost[2][MB_MODE_COUNT];
-  int bmode_costs[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES];
-  int inter_bmode_costs[INTRA_MODE_COUNT];
+  int y_mode_costs[VP9_INTRA_MODES][VP9_INTRA_MODES][VP9_INTRA_MODES];
   int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
                              [VP9_SWITCHABLE_FILTERS];
 
@@ -134,7 +133,11 @@ struct macroblock {
 
   unsigned char *active_ptr;
 
+  // Note: token_costs holds the token costs for when the EOB node is skipped.
   vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES];
+#if CONFIG_BALANCED_COEFTREE
+  vp9_coeff_count token_costs_noskip[TX_SIZE_MAX_SB][BLOCK_TYPES];
+#endif
 
   int optimize;
 
diff --git a/vp9/encoder/vp9_boolhuff.c b/vp9/encoder/vp9_boolhuff.c
index 0fcb2579f..86143ca57 100644
--- a/vp9/encoder/vp9_boolhuff.c
+++ b/vp9/encoder/vp9_boolhuff.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 #include "vp9/encoder/vp9_boolhuff.h"
+#include "vp9/common/vp9_entropy.h"
 
 #if defined(SECTIONBITS_OUTPUT)
 unsigned __int64 Sectionbits[500];
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index d22644424..8d4eec139 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -606,14 +606,13 @@ void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
     c1 = ip[2 * pitch_short];
     d1 = ip[3 * pitch_short];
 
-    b1 = a1 - b1;
-    c1 += d1;
-    e1 = (c1 - b1) >> 1;
-    a1 += e1;
-    d1 -= e1;
-    c1 = a1 - c1;
-    b1 -= d1;
-
+    a1 += b1;
+    d1 = d1 - c1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= c1;
+    d1 += b1;
     op[0] = a1;
     op[4] = c1;
     op[8] = d1;
@@ -631,14 +630,13 @@ void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
     c1 = ip[2];
     d1 = ip[3];
 
-    b1 = a1 - b1;
-    c1 += d1;
-    e1 = (c1 - b1) >> 1;
-    a1 += e1;
-    d1 -= e1;
-    c1 = a1 - c1;
-    b1 -= d1;
-
+    a1 += b1;
+    d1 -= c1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= c1;
+    d1 += b1;
     op[0] = a1 << WHT_UPSCALE_FACTOR;
     op[1] = c1 << WHT_UPSCALE_FACTOR;
     op[2] = d1 << WHT_UPSCALE_FACTOR;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index ebee191ad..a38c1ffd3 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -10,6 +10,7 @@
 
 
 #include "./vpx_config.h"
+#include "./vp9_rtcd.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
@@ -97,6 +98,8 @@ static unsigned int alt_activity_measure(VP9_COMP *cpi,
   return vp9_encode_intra(cpi, x, use_dc_pred);
 }
 
+DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = { 0 };
+
 
 // Measure the activity of the current macroblock
 // What we measure here is TBD so abstracted to this function
@@ -332,7 +335,9 @@ static void update_state(VP9_COMP *cpi,
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *mi = &ctx->mic;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  int mb_mode = mi->mbmi.mode;
+#if CONFIG_DEBUG || CONFIG_INTERNAL_STATS
+  MB_PREDICTION_MODE mb_mode = mi->mbmi.mode;
+#endif
   int mb_mode_index = ctx->best_mode_index;
   const int mis = cpi->common.mode_info_stride;
   const int bh = 1 << mi_height_log2(bsize), bw = 1 << mi_width_log2(bsize);
@@ -362,7 +367,8 @@ static void update_state(VP9_COMP *cpi,
     ctx->txfm_rd_diff[ALLOW_32X32] = ctx->txfm_rd_diff[ALLOW_16X16];
   }
 
-  if (mb_mode == SPLITMV) {
+  if (mbmi->ref_frame != INTRA_FRAME &&
+      mbmi->sb_type < BLOCK_SIZE_SB8X8) {
     vpx_memcpy(x->partition_info, &ctx->partition_info,
                sizeof(PARTITION_INFO));
 
@@ -448,7 +454,8 @@ static void update_state(VP9_COMP *cpi,
     */
     // Note how often each mode chosen as best
     cpi->mode_chosen_counts[mb_mode_index]++;
-    if (mbmi->mode == SPLITMV || mbmi->mode == NEWMV) {
+    if (mbmi->ref_frame != INTRA_FRAME &&
+        (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) {
       int_mv best_mv, best_second_mv;
       MV_REFERENCE_FRAME rf = mbmi->ref_frame;
       best_mv.as_int = ctx->best_ref_mv.as_int;
@@ -768,6 +775,35 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
   vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl,
              sizeof(PARTITION_CONTEXT) * mh);
 }
+// Copies a block's above/left entropy and partition contexts to a, l, sa, sl.
+static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
+                          ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+                          ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+                          PARTITION_CONTEXT sa[8],
+                          PARTITION_CONTEXT sl[8],
+                          BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int p;
+  int bwl = b_width_log2(bsize), bw = 1 << bwl;
+  int bhl = b_height_log2(bsize), bh = 1 << bhl;
+  int mwl = mi_width_log2(bsize), mw = 1 << mwl;
+  int mhl = mi_height_log2(bsize), mh = 1 << mhl;
+  // Buffer every plane's above/left entropy context, scaled by its subsampling.
+  for (p = 0; p < MAX_MB_PLANE; ++p) {
+    vpx_memcpy(a + bw * p, cm->above_context[p] +
+               (mi_col * 2 >> xd->plane[p].subsampling_x),
+               sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
+    vpx_memcpy(l + bh * p, cm->left_context[p] +
+               ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+               sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
+  }
+  vpx_memcpy(sa, cm->above_seg_context + mi_col,
+             sizeof(PARTITION_CONTEXT) * mw);
+  vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
+             sizeof(PARTITION_CONTEXT) * mh);
+}
 
 static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
                      int mi_row, int mi_col, int output_enabled,
@@ -860,6 +896,337 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
   }
 }
 
+// Test helper: forces one sb_type on an 8x8 grid of mode-info entries at m.
+static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m,
+                             BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int mis = cm->mode_info_stride;
+  int bsl = b_width_log2(bsize);
+  int bs = (1 << bsl) / 2;  // block width in 8x8 mode-info units
+  int block_row, block_col;
+  int row, col;
+  // Walk the 8x8 mode-info grid in bs-sized steps, tagging every entry.
+  for (block_row = 0; block_row < 8; block_row += bs) {
+    for (block_col = 0; block_col < 8; block_col += bs) {
+      for (row = 0; row < bs; row++) {
+        for (col = 0; col < bs; col++) {
+          m[(block_row+row)*mis + block_col+col].mbmi.sb_type = bsize;
+        }
+      }
+    }
+  }
+}
+
+// Sets sb_type to bsize on the in-frame mode-info entries the block covers.
+static void set_block_size(VP9_COMMON *const cm,
+                           MODE_INFO *m, BLOCK_SIZE_TYPE bsize, int mis,
+                           int mi_row, int mi_col) {
+  int row, col;
+  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+  const int bs = (1 << (bwl > bhl ? bwl : bhl)) / 2;  // longer edge, mi units
+  MODE_INFO *m2 = m + mi_row * mis + mi_col;
+  for (row = 0; row < bs; row++) {
+    for (col = 0; col < bs; col++) {
+      if (mi_row + row < cm->mi_rows && mi_col + col < cm->mi_cols)
+        m2[row * mis + col].mbmi.sb_type = bsize;
+    }
+  }
+}
+typedef struct {
+  int64_t sum_square_error;  // accumulated sum of squared errors
+  int64_t sum_error;         // accumulated sum of errors
+  int count;                 // number of samples behind the two sums
+  int variance;              // computed by fill_variance()
+} var;
+
+#define VT(TYPE, BLOCKSIZE) \
+  typedef struct { \
+    var none; \
+    var horz[2]; \
+    var vert[2]; \
+    BLOCKSIZE split[4]; } TYPE;
+// Each level nests four nodes of the next smaller size in split[].
+VT(v8x8, var)
+VT(v16x16, v8x8)
+VT(v32x32, v16x16)
+VT(v64x64, v32x32)
+
+typedef enum {
+  V16X16,
+  V32X32,
+  V64X64,
+} TREE_LEVEL;  // NOTE(review): no use of this enum is visible in this patch
+
+// Stores s2, s and c in *v and derives variance = 256 * (s2 - s * s / c) / c.
+static void fill_variance(var *v, int64_t s2, int64_t s, int c) {
+  v->sum_square_error = s2;
+  v->sum_error = s;
+  v->count = c;
+  v->variance = 256
+      * (v->sum_square_error - v->sum_error * v->sum_error / v->count)
+      / v->count;
+}
+
+// Fills the four 8x8 children of a 16x16 tree node via vp9_get_sse_sum_8x8().
+static void fill_16x16_variance(const unsigned char *s, int sp,
+                                const unsigned char *d, int dp, v16x16 *vt) {
+  unsigned int sse;
+  int sum;
+  vp9_get_sse_sum_8x8(s, sp, d, dp, &sse, &sum);
+  fill_variance(&vt->split[0].none, sse, sum, 64);
+  vp9_get_sse_sum_8x8(s + 8, sp, d + 8, dp, &sse, &sum);
+  fill_variance(&vt->split[1].none, sse, sum, 64);
+  vp9_get_sse_sum_8x8(s + 8 * sp, sp, d + 8 * dp, dp, &sse, &sum);
+  fill_variance(&vt->split[2].none, sse, sum, 64);
+  vp9_get_sse_sum_8x8(s + 8 * sp + 8, sp, d + 8 + 8 * dp, dp, &sse, &sum);
+  fill_variance(&vt->split[3].none, sse, sum, 64);
+}
+
+// Combine two variance nodes: add their sum_error, sum_square_error and count
+// and recompute the variance into r. NOTE(review): not static - intentional?
+void sum_2_variances(var *r, var *a, var*b) {
+  fill_variance(r, a->sum_square_error + b->sum_square_error,
+                a->sum_error + b->sum_error, a->count + b->count);
+}
+// Fill one level of the variance tree from its four split children. VT must
+// be written as '&node' so that every VT.field expands to '&node.field'.
+#define fill_variance_tree(VT) \
+  sum_2_variances(VT.horz[0], VT.split[0].none, VT.split[1].none); \
+  sum_2_variances(VT.horz[1], VT.split[2].none, VT.split[3].none); \
+  sum_2_variances(VT.vert[0], VT.split[0].none, VT.split[2].none); \
+  sum_2_variances(VT.vert[1], VT.split[1].none, VT.split[3].none); \
+  sum_2_variances(VT.none, VT.vert[0], VT.vert[1]);
+
+// For each of none/horz/vert whose variance(s) are below the caller's
+// 'threshold': set that block size via set_block_size(), then run ACTION.
+#define set_vt_size(VT, BLOCKSIZE, R, C, ACTION) \
+  if (VT.none.variance < threshold) { \
+    set_block_size(cm, m, BLOCKSIZE, mis, R, C); \
+    ACTION; \
+  } \
+  if (VT.horz[0].variance < threshold && VT.horz[1].variance < threshold ) { \
+    set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_HORZ), mis, R, C); \
+    ACTION; \
+  } \
+  if (VT.vert[0].variance < threshold && VT.vert[1].variance < threshold ) { \
+    set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_VERT), mis, R, C); \
+    ACTION; \
+  }
+
+// Sets sb_type across one 64x64 superblock from a tree of source variances.
+static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
+                                int mi_col) {
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK *x = &cpi->mb;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  const int mis = cm->mode_info_stride;
+  // TODO(JBB): More experimentation or testing of this threshold;
+  int64_t threshold = 4;
+  int i, j, k;
+  v64x64 vt;
+  unsigned char * s;
+  int sp;
+  const unsigned char * d = xd->plane[0].pre->buf;
+  int dp = xd->plane[0].pre->stride;
+
+  set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64);
+  s = x->plane[0].src.buf;
+  sp = x->plane[0].src.stride;
+  // TODO(JBB): Clearly the higher the quantizer the fewer partitions we want
+  // but this needs more experimentation.
+  threshold = threshold * cpi->common.base_qindex * cpi->common.base_qindex;
+
+  // The reference is always an all-zero block for now, replacing the 'pre'
+  // buffer read above; limiting this to key frames was left disabled.
+  d = vp9_64x64_zeros;
+  dp = 64;
+  // Fill in the entire tree of 8x8 variances for splits.
+  for (i = 0; i < 4; i++) {
+    const int x32_idx = ((i & 1) << 5);
+    const int y32_idx = ((i >> 1) << 5);
+    for (j = 0; j < 4; j++) {
+      const int x_idx = x32_idx + ((j & 1) << 4);
+      const int y_idx = y32_idx + ((j >> 1) << 4);
+      fill_16x16_variance(s + y_idx * sp + x_idx, sp, d + y_idx * dp + x_idx,
+                          dp, &vt.split[i].split[j]);
+    }
+  }
+  // Fill the rest of the variance tree by summing the split partition
+  // values.
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      fill_variance_tree(&vt.split[i].split[j])
+    }
+    fill_variance_tree(&vt.split[i])
+  }
+  fill_variance_tree(&vt)
+
+  // Walk the tree top-down: each node is tested against the threshold using
+  // its own variances, and is split further only when none/horz/vert all
+  // fail; anything still unresolved below 16x16 is set to 8x8.
+  set_vt_size(vt, BLOCK_SIZE_SB64X64, mi_row, mi_col, return);
+  for (i = 0; i < 4; ++i) {
+    const int x32_idx = ((i & 1) << 2);
+    const int y32_idx = ((i >> 1) << 2);
+    set_vt_size(vt.split[i], BLOCK_SIZE_SB32X32, mi_row + y32_idx,
+                mi_col + x32_idx, continue);
+
+    for (j = 0; j < 4; ++j) {
+      const int x16_idx = ((j & 1) << 1);
+      const int y16_idx = ((j >> 1) << 1);
+      set_vt_size(vt.split[i].split[j], BLOCK_SIZE_MB16X16,
+                  mi_row + y32_idx + y16_idx, mi_col+x32_idx+x16_idx, continue);
+
+      for (k = 0; k < 4; ++k) {
+        const int x8_idx = (k & 1);
+        const int y8_idx = (k >> 1);
+        set_block_size(cm, m, BLOCK_SIZE_SB8X8, mis,
+                       mi_row + y32_idx + y16_idx + y8_idx,
+                       mi_col + x32_idx + x16_idx + x8_idx);
+      }
+    }
+  }
+}
+// Picks modes per the partitioning preset at m; sums cost into *rate/*dist.
+static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
+                             int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize,
+                             int *rate, int *dist) {
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK * const x = &cpi->mb;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  const int mis = cm->mode_info_stride;
+  int bwl, bhl;
+  int bsl = b_width_log2(bsize);
+  int bs = (1 << bsl);  // block width in 4-pixel units
+  int bss = (1 << bsl)/4;  // quadrant width in 8x8 mode-info units
+  int i, pl;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE_TYPE subsize;
+  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+  PARTITION_CONTEXT sl[8], sa[8];
+  int r = 0, d = 0;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;  // NOTE(review): *rate and *dist are left unset on this path
+
+  bwl = b_width_log2(m->mbmi.sb_type);
+  bhl = b_height_log2(m->mbmi.sb_type);
+
+  // Infer the partition type by comparing the stored sb_type with bsize.
+  if ((bwl == bsl) && (bhl == bsl))
+    partition = PARTITION_NONE;
+  else if ((bwl == bsl) && (bhl < bsl))
+    partition = PARTITION_HORZ;
+  else if ((bwl < bsl) && (bhl == bsl))
+    partition = PARTITION_VERT;
+  else if ((bwl < bsl) && (bhl < bsl))
+    partition = PARTITION_SPLIT;
+  else
+    assert(0);
+
+  subsize = get_subsize(bsize, partition);
+
+  // TODO(JBB): this restriction is here because pick_sb_modes can return
+  // r's that are INT_MAX meaning we can't select a mode / mv for this block.
+  // when the code is made to work for less than sb8x8 we need to come up with
+  // a solution to this problem.
+  assert(subsize >= BLOCK_SIZE_SB8X8);
+
+  if (bsize >= BLOCK_SIZE_SB8X8) {
+    xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
+    xd->above_seg_context = cm->above_seg_context + mi_col;
+    *(get_sb_partitioning(x, bsize)) = subsize;
+  }
+
+  pl = partition_plane_context(xd, bsize);
+  save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  switch (partition) {
+    case PARTITION_NONE:
+      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
+                    get_block_context(x, bsize));
+      r += x->partition_cost[pl][PARTITION_NONE];
+      break;
+    case PARTITION_HORZ:
+      *(get_sb_index(xd, subsize)) = 0;
+      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      if (mi_row + (bs >> 1) <= cm->mi_rows) {
+        int rt, dt;
+        update_state(cpi, get_block_context(x, subsize), subsize, 0);
+        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+        *(get_sb_index(xd, subsize)) = 1;
+        pick_sb_modes(cpi, mi_row + (bs >> 2), mi_col, tp, &rt, &dt, subsize,
+                      get_block_context(x, subsize));
+        r += rt;
+        d += dt;
+      }
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      r += x->partition_cost[pl][PARTITION_HORZ];
+      break;
+    case PARTITION_VERT:
+      *(get_sb_index(xd, subsize)) = 0;
+      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      if (mi_col + (bs >> 1) <= cm->mi_cols) {
+        int rt, dt;
+        update_state(cpi, get_block_context(x, subsize), subsize, 0);
+        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+        *(get_sb_index(xd, subsize)) = 1;
+        pick_sb_modes(cpi, mi_row, mi_col + (bs >> 2), tp, &rt, &dt, subsize,
+                      get_block_context(x, subsize));
+        r += rt;
+        d += dt;
+      }
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      r += x->partition_cost[pl][PARTITION_VERT];
+      restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+      break;
+    case PARTITION_SPLIT:
+      for (i = 0; i < 4; i++) {
+        int x_idx = (i & 1) * (bs >> 2);
+        int y_idx = (i >> 1) * (bs >> 2);
+        int jj = i >> 1, ii = i & 0x01;
+        int rt, dt;
+
+        if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+          continue;
+
+        *(get_sb_index(xd, subsize)) = i;
+
+        rd_use_partition(cpi, m + jj * bss * mis + ii * bss, tp, mi_row + y_idx,
+                         mi_col + x_idx, subsize, &rt, &dt);
+        r += rt;
+        d += dt;
+      }
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      r += x->partition_cost[pl][PARTITION_SPLIT];
+      break;
+    default:
+      assert(0);
+  }
+
+  // update partition context
+#if CONFIG_AB4X4
+  if (bsize >= BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) {
+#else
+  if (bsize > BLOCK_SIZE_SB8X8
+      && (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) {
+#endif
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    update_partition_context(xd, subsize, bsize);
+  }
+  restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+
+  if (r < INT_MAX && d < INT_MAX)
+    encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
+  *rate = r;
+  *dist = d;
+}
+
 
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previously rate-distortion optimization
@@ -876,7 +1243,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
   ENTROPY_CONTEXT   l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
   TOKENEXTRA *tp_orig = *tp;
-  int i, p, pl;
+  int i, pl;
   BLOCK_SIZE_TYPE subsize;
   int srate = INT_MAX, sdist = INT_MAX;
 
@@ -888,19 +1255,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
     }
   assert(mi_height_log2(bsize) == mi_width_log2(bsize));
 
-  // buffer the above/left context information of the block in search.
-  for (p = 0; p < MAX_MB_PLANE; ++p) {
-    vpx_memcpy(a + bs * p, cm->above_context[p] +
-               (mi_col * 2 >> xd->plane[p].subsampling_x),
-               sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_x);
-    vpx_memcpy(l + bs * p, cm->left_context[p] +
-               ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
-               sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_y);
-  }
-  vpx_memcpy(sa, cm->above_seg_context + mi_col,
-             sizeof(PARTITION_CONTEXT) * ms);
-  vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
-             sizeof(PARTITION_CONTEXT) * ms);
+  save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
   // PARTITION_SPLIT
   if (bsize >= BLOCK_SIZE_SB8X8) {
@@ -1028,6 +1383,8 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
   *rate = srate;
   *dist = sdist;
 
+  restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+
   if (srate < INT_MAX && sdist < INT_MAX)
     encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
 
@@ -1053,8 +1410,22 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row,
   for (mi_col = cm->cur_tile_mi_col_start;
        mi_col < cm->cur_tile_mi_col_end; mi_col += 8) {
     int dummy_rate, dummy_dist;
-    rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
-                      &dummy_rate, &dummy_dist);
+    // TODO(JBB): remove the border conditions for 64x64 blocks once it's fixed;
+    // without this border check, choose_partitioning will fail on the border
+    // of every non-64x64 block.
+    if (cpi->speed < 5 ||
+        mi_col + 8 > cm->cur_tile_mi_col_end ||
+        mi_row + 8 > cm->cur_tile_mi_row_end) {
+      rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+                        &dummy_rate, &dummy_dist);
+    } else {
+      const int idx_str = cm->mode_info_stride * mi_row + mi_col;
+      MODE_INFO *m = cm->mi + idx_str;
+      // set_partitioning(cpi, m, BLOCK_SIZE_SB8X8);
+      choose_partitioning(cpi, cm->mi, mi_row, mi_col);
+      rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+                       &dummy_rate, &dummy_dist);
+    }
   }
 }
 
@@ -1093,11 +1464,9 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
   xd->mode_info_context->mbmi.uv_mode = DC_PRED;
 
   vp9_zero(cpi->count_mb_ref_frame_usage)
-  vp9_zero(cpi->bmode_count)
-  vp9_zero(cpi->ymode_count)
+  vp9_zero(cpi->y_mode_count)
   vp9_zero(cpi->y_uv_mode_count)
   vp9_zero(cpi->common.fc.mv_ref_ct)
-  vp9_zero(cpi->sb_ymode_count)
   vp9_zero(cpi->partition_count);
 
   // Note: this memset assumes above_context[0], [1] and [2]
@@ -1550,20 +1919,17 @@ static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
   const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
   const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode;
 
+  ++cpi->y_uv_mode_count[m][uvm];
   if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
-    ++cpi->sb_ymode_count[m];
+    ++cpi->y_mode_count[m];
   } else {
-    ++cpi->ymode_count[m];
-  }
-    ++cpi->y_uv_mode_count[m][uvm];
-  if (m == I4X4_PRED) {
     int idx, idy;
     int bw = 1 << b_width_log2(xd->mode_info_context->mbmi.sb_type);
     int bh = 1 << b_height_log2(xd->mode_info_context->mbmi.sb_type);
     for (idy = 0; idy < 2; idy += bh) {
       for (idx = 0; idx < 2; idx += bw) {
         int m = xd->mode_info_context->bmi[idy * 2 + idx].as_mode.first;
-        ++cpi->sb_ymode_count[m];
+        ++cpi->y_mode_count[m];
       }
     }
   }
@@ -1627,7 +1993,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
             cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
           else
             cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (mbmi->mode == SPLITMV) {
+        } else if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
           cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST;
         } else {
           cpi->zbin_mode_boost = MV_ZBIN_BOOST;
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index 5d7c244f1..57041a90b 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -16,72 +16,18 @@
 #include "vp9/common/vp9_invtrans.h"
 #include "vp9/encoder/vp9_encodeintra.h"
 
-static void encode_intra4x4block(MACROBLOCK *x, int ib, BLOCK_SIZE_TYPE bs);
-
 int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   (void) cpi;
-
+  mbmi->mode = DC_PRED;
+  mbmi->ref_frame = INTRA_FRAME;
   if (use_16x16_pred) {
-    mbmi->mode = DC_PRED;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->ref_frame = INTRA_FRAME;
-
-    vp9_encode_intra16x16mby(&cpi->common, x);
+    mbmi->txfm_size = TX_16X16;
+    vp9_encode_intra_block_y(&cpi->common, x, BLOCK_SIZE_MB16X16);
   } else {
-    int i;
-
-    for (i = 0; i < 16; i++) {
-      encode_intra4x4block(x, i, BLOCK_SIZE_MB16X16);
-    }
+    mbmi->txfm_size = TX_4X4;
+    vp9_encode_intra_block_y(&cpi->common, x, BLOCK_SIZE_MB16X16);
   }
 
   return vp9_get_mb_ss(x->plane[0].src_diff);
 }
-
-// This function is used only by the firstpass encoding.
-static void encode_intra4x4block(MACROBLOCK *x, int ib,
-                                 BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD * const xd = &x->e_mbd;
-  uint8_t* const src =
-      raster_block_offset_uint8(xd, bsize, 0, ib,
-                                x->plane[0].src.buf, x->plane[0].src.stride);
-  uint8_t* const dst =
-      raster_block_offset_uint8(xd, bsize, 0, ib,
-                                xd->plane[0].dst.buf, xd->plane[0].dst.stride);
-  int16_t* const src_diff =
-      raster_block_offset_int16(xd, bsize, 0, ib,
-                                x->plane[0].src_diff);
-  int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-
-  assert(ib < (1 << (bwl + bhl)));
-
-  vp9_intra4x4_predict(&x->e_mbd, ib, bsize, DC_PRED,
-                       dst, xd->plane[0].dst.stride);
-  vp9_subtract_block(4, 4, src_diff, 4 << bwl,
-                     src, x->plane[0].src.stride,
-                     dst, xd->plane[0].dst.stride);
-
-  x->fwd_txm4x4(src_diff, coeff, 8 << bwl);
-  x->quantize_b_4x4(x, ib, DCT_DCT, 16);
-  vp9_inverse_transform_b_4x4_add(&x->e_mbd, xd->plane[0].eobs[ib],
-                              BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                              dst, xd->plane[0].dst.stride);
-}
-
-void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  vp9_build_intra_predictors_sby_s(xd, BLOCK_SIZE_MB16X16);
-  vp9_encode_sby(cm, x, BLOCK_SIZE_MB16X16);
-}
-
-void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  vp9_build_intra_predictors_sbuv_s(xd, BLOCK_SIZE_MB16X16);
-  vp9_encode_sbuv(cm, x, BLOCK_SIZE_MB16X16);
-}
-
-
diff --git a/vp9/encoder/vp9_encodeintra.h b/vp9/encoder/vp9_encodeintra.h
index 7da164c6a..14d144b74 100644
--- a/vp9/encoder/vp9_encodeintra.h
+++ b/vp9/encoder/vp9_encodeintra.h
@@ -14,8 +14,6 @@
 #include "vp9/encoder/vp9_onyx_int.h"
 
 int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
-void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_encode_intra_block_y(VP9_COMMON *const cm, MACROBLOCK *mb,
                               BLOCK_SIZE_TYPE bs);
 void vp9_encode_intra_block_uv(VP9_COMMON *const cm, MACROBLOCK *mb,
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index b7f60b127..98ea98031 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -20,6 +20,9 @@
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9_rtcd.h"
 
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+
 void vp9_subtract_block(int rows, int cols,
                         int16_t *diff_ptr, int diff_stride,
                         const uint8_t *src_ptr, int src_stride,
@@ -105,7 +108,7 @@ static int trellis_get_coeff_context(const int *scan,
                                      uint8_t *token_cache,
                                      int pad, int l) {
   int bak = token_cache[scan[idx]], pt;
-  token_cache[scan[idx]] = token;
+  token_cache[scan[idx]] = vp9_pt_energy_class[token];
   pt = vp9_get_coef_context(scan, nb, pad, token_cache, idx + 1, l);
   token_cache[scan[idx]] = bak;
   return pt;
@@ -189,7 +192,8 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
   *(tokens[eob] + 1) = *(tokens[eob] + 0);
   next = eob;
   for (i = 0; i < eob; i++)
-    token_cache[scan[i]] = vp9_dct_value_tokens_ptr[qcoeff_ptr[scan[i]]].token;
+    token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
+        qcoeff_ptr[scan[i]]].token];
   nb = vp9_get_coef_neighbors_handle(scan, &pad);
 
   for (i = eob; i-- > i0;) {
@@ -211,10 +215,21 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
         band = get_coef_band(band_translate, i + 1);
         pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
                                        pad, default_eob);
+#if CONFIG_BALANCED_COEFTREE
+        rate0 +=
+          mb->token_costs_noskip[tx_size][type][ref][band][pt]
+                                [tokens[next][0].token];
+        rate1 +=
+          mb->token_costs_noskip[tx_size][type][ref][band][pt]
+                                [tokens[next][1].token];
+#else
         rate0 +=
-          mb->token_costs[tx_size][type][ref][band][pt][tokens[next][0].token];
+          mb->token_costs[tx_size][type][ref][band][pt]
+                         [tokens[next][0].token];
         rate1 +=
-          mb->token_costs[tx_size][type][ref][band][pt][tokens[next][1].token];
+          mb->token_costs[tx_size][type][ref][band][pt]
+                         [tokens[next][1].token];
+#endif
       }
       UPDATE_RD_COST();
       /* And pick the best. */
@@ -262,14 +277,32 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
         if (t0 != DCT_EOB_TOKEN) {
           pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
                                          pad, default_eob);
+#if CONFIG_BALANCED_COEFTREE
+          if (!x)
+            rate0 += mb->token_costs[tx_size][type][ref][band][pt][
+                tokens[next][0].token];
+          else
+            rate0 += mb->token_costs_noskip[tx_size][type][ref][band][pt][
+                tokens[next][0].token];
+#else
           rate0 += mb->token_costs[tx_size][type][ref][band][pt][
               tokens[next][0].token];
+#endif
         }
         if (t1 != DCT_EOB_TOKEN) {
           pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache,
                                          pad, default_eob);
+#if CONFIG_BALANCED_COEFTREE
+          if (!x)
+            rate1 += mb->token_costs[tx_size][type][ref][band][pt][
+                tokens[next][1].token];
+          else
+            rate1 += mb->token_costs_noskip[tx_size][type][ref][band][pt][
+                tokens[next][1].token];
+#else
           rate1 += mb->token_costs[tx_size][type][ref][band][pt][
               tokens[next][1].token];
+#endif
         }
       }
 
@@ -322,8 +355,13 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
   error1 = tokens[next][1].error;
   t0 = tokens[next][0].token;
   t1 = tokens[next][1].token;
+#if CONFIG_BALANCED_COEFTREE
+  rate0 += mb->token_costs_noskip[tx_size][type][ref][band][pt][t0];
+  rate1 += mb->token_costs_noskip[tx_size][type][ref][band][pt][t1];
+#else
   rate0 += mb->token_costs[tx_size][type][ref][band][pt][t0];
   rate1 += mb->token_costs[tx_size][type][ref][band][pt][t1];
+#endif
   UPDATE_RD_COST();
   best = rd_cost1 < rd_cost0;
   final_eob = i0 - 1;
@@ -610,6 +648,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
   struct encode_b_args* const args = arg;
   MACROBLOCK* const x = args->x;
   MACROBLOCKD* const xd = &x->e_mbd;
+  MB_MODE_INFO* const mbmi = &xd->mode_info_context->mbmi;
   const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2);
   const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
   const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
@@ -634,9 +673,9 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
   TX_TYPE tx_type;
   int mode, b_mode;
 
-  mode = plane == 0? xd->mode_info_context->mbmi.mode:
-                     xd->mode_info_context->mbmi.uv_mode;
-  if (bsize <= BLOCK_SIZE_SB8X8 && mode == I4X4_PRED && plane == 0)
+  mode = plane == 0? mbmi->mode: mbmi->uv_mode;
+  if (mbmi->sb_type < BLOCK_SIZE_SB8X8 && plane == 0 &&
+      mbmi->ref_frame == INTRA_FRAME)
     b_mode = xd->mode_info_context->bmi[ib].as_mode.first;
   else
     b_mode = mode;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 5ec696604..f57c8be6c 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -523,6 +523,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
       xd->left_available = (mb_col != 0);
 
       xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_MB16X16;
+      xd->mode_info_context->mbmi.ref_frame = INTRA_FRAME;
 
       // do intra 16x16 prediction
       this_error = vp9_encode_intra(cpi, x, use_dc_pred);
@@ -619,6 +620,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
           this_error = motion_error;
           vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
           xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+          xd->mode_info_context->mbmi.ref_frame = LAST_FRAME;
           vp9_build_inter_predictors_sby(xd, mb_row << 1,
                                          mb_col << 1,
                                          BLOCK_SIZE_MB16X16);
diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c
index 171b44bf9..099a04404 100644
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -17,29 +17,23 @@
 
 void vp9_init_mode_costs(VP9_COMP *c) {
   VP9_COMMON *x = &c->common;
-  const vp9_tree_p T = vp9_bmode_tree;
-  const vp9_tree_p KT = vp9_bmode_tree;
+  const vp9_tree_p KT = vp9_intra_mode_tree;
   int i, j;
 
-  for (i = 0; i < VP9_BINTRAMODES; i++) {
-    for (j = 0; j < VP9_BINTRAMODES; j++) {
-      vp9_cost_tokens((int *)c->mb.bmode_costs[i][j],
-                      x->kf_bmode_prob[i][j], KT);
+  for (i = 0; i < VP9_INTRA_MODES; i++) {
+    for (j = 0; j < VP9_INTRA_MODES; j++) {
+      vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j],
+                      x->kf_y_mode_prob[i][j], KT);
     }
   }
 
-  vp9_cost_tokens((int *)c->mb.inter_bmode_costs, x->fc.bmode_prob, T);
-
   // TODO(rbultje) separate tables for superblock costing?
-  vp9_cost_tokens(c->mb.mbmode_cost[1], x->fc.sb_ymode_prob,
-                  vp9_sb_ymode_tree);
-  vp9_cost_tokens(c->mb.mbmode_cost[0],
-                  x->sb_kf_ymode_prob[c->common.kf_ymode_probs_index],
-                  vp9_sb_ymode_tree);
+  vp9_cost_tokens(c->mb.mbmode_cost, x->fc.y_mode_prob,
+                  vp9_intra_mode_tree);
   vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
-                  x->fc.uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
+                  x->fc.uv_mode_prob[VP9_INTRA_MODES - 1], vp9_intra_mode_tree);
   vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
-                  x->kf_uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
+                  x->kf_uv_mode_prob[VP9_INTRA_MODES - 1], vp9_intra_mode_tree);
 
   for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i)
     vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 4bbb4152b..9c0609ed1 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -103,9 +103,9 @@ extern int skip_false_count;
 
 
 #ifdef ENTROPY_STATS
-extern int intra_mode_stats[VP9_KF_BINTRAMODES]
-                           [VP9_KF_BINTRAMODES]
-                           [VP9_KF_BINTRAMODES];
+extern int intra_mode_stats[VP9_INTRA_MODES]
+                           [VP9_INTRA_MODES]
+                           [VP9_INTRA_MODES];
 #endif
 
 #ifdef NMV_STATS
@@ -258,9 +258,6 @@ void vp9_initialize_enc() {
     init_done = 1;
   }
 }
-#ifdef PACKET_TESTING
-extern FILE *vpxlogc;
-#endif
 
 static void setup_features(VP9_COMP *cpi) {
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
@@ -270,9 +267,6 @@ static void setup_features(VP9_COMP *cpi) {
 
   xd->update_mb_segmentation_map = 0;
   xd->update_mb_segmentation_data = 0;
-#if CONFIG_IMPLICIT_SEGMENTATION
-  xd->allow_implicit_segment_update = 0;
-#endif
   vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
 
   vp9_clearall_segfeatures(xd);
@@ -287,7 +281,6 @@ static void setup_features(VP9_COMP *cpi) {
   set_default_lf_deltas(cpi);
 }
 
-
 static void dealloc_compressor_data(VP9_COMP *cpi) {
   // Delete sementation map
   vpx_free(cpi->segmentation_map);
@@ -358,9 +351,6 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
     vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
     xd->update_mb_segmentation_map = 0;
     xd->update_mb_segmentation_data = 0;
-#if CONFIG_IMPLICIT_SEGMENTATION
-    xd->allow_implicit_segment_update = 0;
-#endif
     cpi->static_mb_pct = 0;
 
     // Disable segmentation
@@ -374,9 +364,6 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
     vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
     xd->update_mb_segmentation_map = 0;
     xd->update_mb_segmentation_data = 0;
-#if CONFIG_IMPLICIT_SEGMENTATION
-    xd->allow_implicit_segment_update = 0;
-#endif
     cpi->static_mb_pct = 0;
 
     // Disable segmentation and individual segment features by default
@@ -475,53 +462,6 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
   }
 }
 
-#if CONFIG_IMPLICIT_SEGMENTATION
-static double implict_seg_q_modifiers[MAX_MB_SEGMENTS] =
-  {1.0, 0.95, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
-static void configure_implicit_segmentation(VP9_COMP *cpi, int frame_qindex) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  int i;
-  int qi_delta;
-  double q_baseline = vp9_convert_qindex_to_q(frame_qindex);
-
-  // Set the flags to allow implicit segment update but disallow explicit update
-  xd->segmentation_enabled = 1;
-  xd->allow_implicit_segment_update = 1;
-  xd->update_mb_segmentation_map = 0;
-
-  // For key frames clear down the segment map to a default state.
-  if (cm->frame_type == KEY_FRAME) {
-    // Clear down the global segmentation map
-    vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
-
-    // Clear down the segment features.
-    vp9_clearall_segfeatures(xd);
-
-    xd->update_mb_segmentation_data = 0;
-
-  // Update the segment data if it is an arf or non overlay gf.
-  } else if (cpi->refresh_alt_ref_frame ||
-             (cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref)) {
-    xd->update_mb_segmentation_data = 1;
-
-    // Enable use of q deltas on segments 1 and up
-    // Segment 0 is treated as a neutral segment with no changes
-    for (i = 1; i < MAX_MB_SEGMENTS; ++i) {
-      qi_delta = compute_qdelta(cpi, q_baseline,
-                                implict_seg_q_modifiers[i] * q_baseline);
-      vp9_set_segdata(xd, i, SEG_LVL_ALT_Q, qi_delta);
-      vp9_enable_segfeature(xd, i, SEG_LVL_ALT_Q);
-    }
-
-    // Where relevant assume segment data is delta data
-    xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
-  } else {
-    xd->update_mb_segmentation_data = 0;
-  }
-}
-#endif
-
 #ifdef ENTROPY_STATS
 void vp9_update_mode_context_stats(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
@@ -541,7 +481,7 @@ void vp9_update_mode_context_stats(VP9_COMP *cpi) {
 
   // Add in the values for this frame
   for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
-    for (j = 0; j < 4; j++) {
+    for (j = 0; j < VP9_MVREFS - 1; j++) {
       mv_ref_stats[i][j][0] += (int64_t)mv_ref_ct[i][j][0];
       mv_ref_stats[i][j][1] += (int64_t)mv_ref_ct[i][j][1];
     }
@@ -558,12 +498,13 @@ void print_mode_context(VP9_COMP *cpi) {
   int i, j;
 
   fprintf(f, "#include \"vp9_entropy.h\"\n");
-  fprintf(f, "const int vp9_mode_contexts[INTER_MODE_CONTEXTS][4] =");
+  fprintf(f,
+          "const int vp9_mode_contexts[INTER_MODE_CONTEXTS][VP9_MVREFS - 1] =");
   fprintf(f, "{\n");
   for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
     fprintf(f, "  {/* %d */ ", j);
     fprintf(f, "    ");
-    for (i = 0; i < 4; i++) {
+    for (i = 0; i < VP9_MVREFS - 1; i++) {
       int this_prob;
       int64_t count = cpi->mv_ref_stats[j][i][0] + cpi->mv_ref_stats[j][i][1];
       if (count)
@@ -700,6 +641,25 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) {
   sf->thresh_mult[THR_COMP_SPLITGA  ] += speed_multiplier * 4500;
   sf->thresh_mult[THR_COMP_SPLITLG  ] += speed_multiplier * 4500;
 
+  if (speed > 4) {
+    for (i = 0; i < MAX_MODES; ++i)
+      sf->thresh_mult[i] = INT_MAX;
+
+    sf->thresh_mult[THR_DC       ] = 0;
+    sf->thresh_mult[THR_TM       ] = 0;
+    sf->thresh_mult[THR_NEWMV    ] = 4000;
+    sf->thresh_mult[THR_NEWG     ] = 4000;
+    sf->thresh_mult[THR_NEWA     ] = 4000;
+    sf->thresh_mult[THR_NEARESTMV] = 0;
+    sf->thresh_mult[THR_NEARESTG ] = 0;
+    sf->thresh_mult[THR_NEARESTA ] = 0;
+    sf->thresh_mult[THR_NEARMV   ] = 2000;
+    sf->thresh_mult[THR_NEARG    ] = 2000;
+    sf->thresh_mult[THR_NEARA    ] = 2000;
+    sf->thresh_mult[THR_COMP_NEARESTLA] = 2000;
+    sf->recode_loop = 0;
+  }
+
   /* disable frame modes if flags not set */
   if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
     sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
@@ -778,16 +738,12 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->optimize_coefficients = !cpi->oxcf.lossless;
   sf->first_step = 0;
   sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
-  sf->comp_inter_joint_serach = 1;
+  sf->comp_inter_joint_search = 1;
 #if CONFIG_MULTIPLE_ARF
   // Switch segmentation off.
   sf->static_segmentation = 0;
 #else
-#if CONFIG_IMPLICIT_SEGMENTATION
   sf->static_segmentation = 0;
-#else
-  sf->static_segmentation = 0;
-#endif
 #endif
   sf->mb16_breakout = 0;
 
@@ -801,55 +757,16 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
       // Switch segmentation off.
       sf->static_segmentation = 0;
 #else
-#if CONFIG_IMPLICIT_SEGMENTATION
-  sf->static_segmentation = 0;
-#else
   sf->static_segmentation = 0;
 #endif
-#endif
       sf->mb16_breakout = 0;
 
       if (speed > 0) {
-        /* Disable coefficient optimization above speed 0 */
         sf->optimize_coefficients = 0;
         sf->no_skip_block4x4_search = 0;
-        sf->comp_inter_joint_serach = 0;
-
+        sf->comp_inter_joint_search = 0;
         sf->first_step = 1;
-
-        cpi->mode_check_freq[THR_SPLITG] = 2;
-        cpi->mode_check_freq[THR_SPLITA] = 2;
-        cpi->mode_check_freq[THR_SPLITMV] = 0;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 2;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 2;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 0;
       }
-
-      if (speed > 1) {
-        cpi->mode_check_freq[THR_SPLITG] = 4;
-        cpi->mode_check_freq[THR_SPLITA] = 4;
-        cpi->mode_check_freq[THR_SPLITMV] = 2;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 4;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 4;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 2;
-      }
-
-      if (speed > 2) {
-        cpi->mode_check_freq[THR_SPLITG] = 15;
-        cpi->mode_check_freq[THR_SPLITA] = 15;
-        cpi->mode_check_freq[THR_SPLITMV] = 7;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 15;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 15;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 7;
-
-        // Only do recode loop on key frames, golden frames and
-        // alt ref frames
-        sf->recode_loop = 2;
-      }
-
       break;
 
   }; /* switch */
@@ -1194,6 +1111,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
   cpi->refresh_golden_frame = 0;
   cpi->refresh_last_frame = 1;
   cm->refresh_frame_context = 1;
+  cm->reset_frame_context = 0;
 
   setup_features(cpi);
   cpi->mb.e_mbd.allow_high_precision_mv = 0;   // Default mv precision adaptation
@@ -1756,18 +1674,18 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
 
       fprintf(fmode, "\n#include \"vp9_entropymode.h\"\n\n");
       fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts ");
-      fprintf(fmode, "[VP9_KF_BINTRAMODES][VP9_KF_BINTRAMODES]"
-                     "[VP9_KF_BINTRAMODES] =\n{\n");
+      fprintf(fmode, "[VP9_INTRA_MODES][VP9_INTRA_MODES]"
+                     "[VP9_INTRA_MODES] =\n{\n");
 
-      for (i = 0; i < VP9_KF_BINTRAMODES; i++) {
+      for (i = 0; i < VP9_INTRA_MODES; i++) {
 
         fprintf(fmode, "    { // Above Mode :  %d\n", i);
 
-        for (j = 0; j < VP9_KF_BINTRAMODES; j++) {
+        for (j = 0; j < VP9_INTRA_MODES; j++) {
 
           fprintf(fmode, "        {");
 
-          for (k = 0; k < VP9_KF_BINTRAMODES; k++) {
+          for (k = 0; k < VP9_INTRA_MODES; k++) {
             if (!intra_mode_stats[i][j][k])
               fprintf(fmode, " %5d, ", 1);
             else
@@ -2621,6 +2539,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
       (cpi->oxcf.frame_parallel_decoding_mode != 0);
     if (cm->error_resilient_mode) {
       cm->frame_parallel_decoding_mode = 1;
+      cm->reset_frame_context = 0;
       cm->refresh_frame_context = 0;
     }
   }
@@ -2931,16 +2850,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
       }
     }
 
-#if CONFIG_IMPLICIT_SEGMENTATION
-    if (!cm->error_resilient_mode && !cpi->sf.static_segmentation) {
-      configure_implicit_segmentation(cpi, q);
-    }
-#endif
-
     // transform / motion compensation build reconstruction frame
-    if (cm->frame_type == KEY_FRAME) {
-      vp9_default_coef_probs(cm);
-    }
 
     vp9_encode_frame(cpi);
 
@@ -3183,15 +3093,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   cpi->dummy_packing = 0;
   vp9_pack_bitstream(cpi, dest, size);
 
-#if CONFIG_IMPLICIT_SEGMENTATION
-  // Should we allow implicit update of the segment map.
-  if (xd->allow_implicit_segment_update && !cm->error_resilient_mode) {
-    vp9_implicit_segment_map_update(cm);
-  // or has there been an explicit update
-  } else if (xd->update_mb_segmentation_map) {
-#else
   if (xd->update_mb_segmentation_map) {
-#endif
     update_reference_segmentation_map(cpi);
   }
 
@@ -3212,10 +3114,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   }
 
   if (cpi->common.frame_type != KEY_FRAME) {
-    vp9_copy(cpi->common.fc.sb_ymode_counts, cpi->sb_ymode_count);
-    vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
+    vp9_copy(cpi->common.fc.y_mode_counts, cpi->y_mode_count);
     vp9_copy(cpi->common.fc.uv_mode_counts, cpi->y_uv_mode_count);
-    vp9_copy(cpi->common.fc.bmode_counts, cpi->bmode_count);
     vp9_copy(cpi->common.fc.partition_counts, cpi->partition_count);
     cpi->common.fc.NMVcount = cpi->NMVcount;
     if (!cpi->common.error_resilient_mode &&
@@ -3680,6 +3580,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
       }
 
       cm->show_frame = 0;
+      cm->intra_only = 0;
       cpi->refresh_alt_ref_frame = 1;
       cpi->refresh_golden_frame = 0;
       cpi->refresh_last_frame = 0;
@@ -3884,6 +3785,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
     cpi->droppable = !frame_is_reference(cpi);
 
     // return to normal state
+    cm->reset_frame_context = 0;
     cm->refresh_frame_context = 1;
     cpi->refresh_alt_ref_frame = 0;
     cpi->refresh_golden_frame = 0;
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 15f9571bb..9e259762d 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -81,17 +81,15 @@ typedef struct {
   vp9_coeff_probs_model coef_probs_16x16[BLOCK_TYPES];
   vp9_coeff_probs_model coef_probs_32x32[BLOCK_TYPES];
 
-  vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1];
-  vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
-  vp9_prob uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
-  vp9_prob bmode_prob[VP9_BINTRAMODES - 1];
+  vp9_prob y_mode_prob[VP9_INTRA_MODES - 1]; /* interframe intra mode probs */
+  vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
   vp9_prob partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
 
   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
                                  [VP9_SWITCHABLE_FILTERS - 1];
 
-  int mv_ref_ct[INTER_MODE_CONTEXTS][4][2];
-  int vp9_mode_contexts[INTER_MODE_CONTEXTS][4];
+  int mv_ref_ct[INTER_MODE_CONTEXTS][VP9_MVREFS - 1][2];
+  int vp9_mode_contexts[INTER_MODE_CONTEXTS][VP9_MVREFS - 1];
 
 } CODING_CONTEXT;
 
@@ -115,8 +113,7 @@ typedef struct {
   double new_mv_count;
   double duration;
   double count;
-}
-FIRSTPASS_STATS;
+} FIRSTPASS_STATS;
 
 typedef struct {
   int frames_so_far;
@@ -128,7 +125,6 @@ typedef struct {
   double frame_mvr_abs;
   double frame_mvc;
   double frame_mvc_abs;
-
 } ONEPASS_FRAMESTATS;
 
 typedef struct {
@@ -200,8 +196,7 @@ typedef enum {
   THR_COMP_SPLITLG,
   THR_COMP_SPLITLA,
   THR_COMP_SPLITGA,
-}
-THR_MODES;
+} THR_MODES;
 
 typedef enum {
   DIAMOND = 0,
@@ -225,7 +220,7 @@ typedef struct {
   int search_best_filter;
   int mb16_breakout;
   int static_segmentation;
-  int comp_inter_joint_serach;
+  int comp_inter_joint_search;
 } SPEED_FEATURES;
 
 enum BlockSize {
@@ -336,8 +331,8 @@ typedef struct VP9_COMP {
   int single_pred_count[COMP_PRED_CONTEXTS];
   // FIXME contextualize
   int txfm_count_32x32p[TX_SIZE_MAX_SB];
-  int txfm_count_16x16p[TX_SIZE_MAX_MB];
-  int txfm_count_8x8p[TX_SIZE_MAX_MB - 1];
+  int txfm_count_16x16p[TX_SIZE_MAX_SB - 1];
+  int txfm_count_8x8p[TX_SIZE_MAX_SB - 2];
   int64_t rd_tx_select_diff[NB_TXFM_MODES];
   int rd_tx_select_threshes[4][NB_TXFM_MODES];
 
@@ -413,28 +408,26 @@ typedef struct VP9_COMP {
 
   int cq_target_quality;
 
-  int sb_ymode_count [VP9_I32X32_MODES];
-  int ymode_count[VP9_YMODES];        /* intra MB type cts this frame */
-  int bmode_count[VP9_BINTRAMODES];
-  int y_uv_mode_count[VP9_YMODES][VP9_UV_MODES];
+  int y_mode_count[VP9_INTRA_MODES];
+  int y_uv_mode_count[VP9_INTRA_MODES][VP9_INTRA_MODES];
   unsigned int partition_count[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 
   nmv_context_counts NMVcount;
 
   vp9_coeff_count coef_counts_4x4[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_4x4[BLOCK_TYPES];
+  vp9_coeff_probs_model frame_coef_probs_4x4[BLOCK_TYPES];
   vp9_coeff_stats frame_branch_ct_4x4[BLOCK_TYPES];
 
   vp9_coeff_count coef_counts_8x8[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_8x8[BLOCK_TYPES];
+  vp9_coeff_probs_model frame_coef_probs_8x8[BLOCK_TYPES];
   vp9_coeff_stats frame_branch_ct_8x8[BLOCK_TYPES];
 
   vp9_coeff_count coef_counts_16x16[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_16x16[BLOCK_TYPES];
+  vp9_coeff_probs_model frame_coef_probs_16x16[BLOCK_TYPES];
   vp9_coeff_stats frame_branch_ct_16x16[BLOCK_TYPES];
 
   vp9_coeff_count coef_counts_32x32[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_32x32[BLOCK_TYPES];
+  vp9_coeff_probs_model frame_coef_probs_32x32[BLOCK_TYPES];
   vp9_coeff_stats frame_branch_ct_32x32[BLOCK_TYPES];
 
   int gfu_boost;
@@ -619,7 +612,7 @@ typedef struct VP9_COMP {
 #endif
 
 #ifdef ENTROPY_STATS
-  int64_t mv_ref_stats[INTER_MODE_CONTEXTS][4][2];
+  int64_t mv_ref_stats[INTER_MODE_CONTEXTS][VP9_MVREFS - 1][2];
 #endif
 } VP9_COMP;
 
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index cf1132e3a..f4426adaa 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -124,9 +124,7 @@ void vp9_save_coding_context(VP9_COMP *cpi) {
 
   vp9_copy(cc->vp9_mode_contexts, cm->fc.vp9_mode_contexts);
 
-  vp9_copy(cc->ymode_prob, cm->fc.ymode_prob);
-  vp9_copy(cc->sb_ymode_prob, cm->fc.sb_ymode_prob);
-  vp9_copy(cc->bmode_prob, cm->fc.bmode_prob);
+  vp9_copy(cc->y_mode_prob, cm->fc.y_mode_prob);
   vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
   vp9_copy(cc->partition_prob, cm->fc.partition_prob);
 
@@ -163,9 +161,7 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
 
   vp9_copy(cm->fc.vp9_mode_contexts, cc->vp9_mode_contexts);
 
-  vp9_copy(cm->fc.ymode_prob, cc->ymode_prob);
-  vp9_copy(cm->fc.sb_ymode_prob, cc->sb_ymode_prob);
-  vp9_copy(cm->fc.bmode_prob, cc->bmode_prob);
+  vp9_copy(cm->fc.y_mode_prob, cc->y_mode_prob);
   vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
   vp9_copy(cm->fc.partition_prob, cc->partition_prob);
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index aa0557735..862e72f24 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -46,6 +46,12 @@
 /* Factor to weigh the rate for switchable interp filters */
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
 
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+
+#define I4X4_PRED 0x8000
+#define SPLITMV 0x10000
+
 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   {ZEROMV,    LAST_FRAME,   NONE},
   {DC_PRED,   INTRA_FRAME,  NONE},
@@ -81,7 +87,7 @@ const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   {SPLITMV,   GOLDEN_FRAME, NONE},
   {SPLITMV,   ALTREF_FRAME, NONE},
 
-  {I4X4_PRED,    INTRA_FRAME,  NONE},
+  {I4X4_PRED, INTRA_FRAME,  NONE},
 
   /* compound prediction modes */
   {ZEROMV,    LAST_FRAME,   GOLDEN_FRAME},
@@ -105,11 +111,31 @@ const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   {SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME},
 };
 
+#if CONFIG_BALANCED_COEFTREE
+static void fill_token_costs(vp9_coeff_count *c,  // out: costs, probs[1] = 1
+                             vp9_coeff_count *cnoskip,  // out: costs, raw probs
+                             vp9_coeff_probs_model *p,  // in: model probs
+                             TX_SIZE tx_size) {  // not read by this variant
+  int i, j, k, l;  // block type, ref type, coef band, prev coef context
+  for (i = 0; i < BLOCK_TYPES; i++)
+    for (j = 0; j < REF_TYPES; j++)
+      for (k = 0; k < COEF_BANDS; k++)
+        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
+          vp9_prob probs[ENTROPY_NODES];
+          vp9_model_to_full_probs(p[i][j][k][l], probs);  // fills probs[]
+          vp9_cost_tokens((int *)cnoskip[i][j][k][l], probs,
+                          vp9_coef_tree);
+          // cnoskip was costed with probs as expanded. For c, overwrite the
+          // eob node prob with 1 so the cost approximates having no eob node.
+          probs[1] = 1;
+          vp9_cost_tokens((int *)c[i][j][k][l], probs, vp9_coef_tree);
+        }
+}
+#else
 static void fill_token_costs(vp9_coeff_count *c,
                              vp9_coeff_probs_model *p,
                              TX_SIZE tx_size) {
   int i, j, k, l;
-
   for (i = 0; i < BLOCK_TYPES; i++)
     for (j = 0; j < REF_TYPES; j++)
       for (k = 0; k < COEF_BANDS; k++)
@@ -120,6 +146,7 @@ static void fill_token_costs(vp9_coeff_count *c,
                                vp9_coef_tree);
         }
 }
+#endif
 
 static int rd_iifactor[32] =  { 4, 4, 3, 2, 1, 0, 0, 0,
                                 0, 0, 0, 0, 0, 0, 0, 0,
@@ -210,6 +237,20 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
     }
   }
 
+#if CONFIG_BALANCED_COEFTREE
+  fill_token_costs(cpi->mb.token_costs[TX_4X4],
+                   cpi->mb.token_costs_noskip[TX_4X4],
+                   cpi->common.fc.coef_probs_4x4, TX_4X4);
+  fill_token_costs(cpi->mb.token_costs[TX_8X8],
+                   cpi->mb.token_costs_noskip[TX_8X8],
+                   cpi->common.fc.coef_probs_8x8, TX_8X8);
+  fill_token_costs(cpi->mb.token_costs[TX_16X16],
+                   cpi->mb.token_costs_noskip[TX_16X16],
+                   cpi->common.fc.coef_probs_16x16, TX_16X16);
+  fill_token_costs(cpi->mb.token_costs[TX_32X32],
+                   cpi->mb.token_costs_noskip[TX_32X32],
+                   cpi->common.fc.coef_probs_32x32, TX_32X32);
+#else
   fill_token_costs(cpi->mb.token_costs[TX_4X4],
                    cpi->common.fc.coef_probs_4x4, TX_4X4);
   fill_token_costs(cpi->mb.token_costs[TX_8X8],
@@ -218,6 +259,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
                    cpi->common.fc.coef_probs_16x16, TX_16X16);
   fill_token_costs(cpi->mb.token_costs[TX_32X32],
                    cpi->common.fc.coef_probs_32x32, TX_32X32);
+#endif
 
   for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
     vp9_cost_tokens(cpi->mb.partition_cost[i],
@@ -225,7 +267,6 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
                     vp9_partition_tree);
 
   /*rough estimate for costing*/
-  cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
   vp9_init_mode_costs(cpi);
 
   if (cpi->common.frame_type != KEY_FRAME) {
@@ -271,7 +312,13 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
   TX_TYPE tx_type = DCT_DCT;
 
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
+#if CONFIG_BALANCED_COEFTREE
+  unsigned int (*token_costs_noskip)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
+      mb->token_costs_noskip[tx_size][type][ref];
+#else
   vp9_prob coef_probs[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+#endif
+
   int seg_eob, default_eob;
   uint8_t token_cache[1024];
   const uint8_t * band_translate;
@@ -291,8 +338,10 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
           get_tx_type_4x4(xd, block) : DCT_DCT;
       above_ec = A[0] != 0;
       left_ec = L[0] != 0;
+#if !CONFIG_BALANCED_COEFTREE
       vp9_model_to_full_probs_sb(cm->fc.coef_probs_4x4[type][ref],
                                  coef_probs);
+#endif
       seg_eob = 16;
       scan = get_scan_4x4(tx_type);
       band_translate = vp9_coefband_trans_4x4;
@@ -307,8 +356,10 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
       above_ec = (A[0] + A[1]) != 0;
       left_ec = (L[0] + L[1]) != 0;
       scan = get_scan_8x8(tx_type);
+#if !CONFIG_BALANCED_COEFTREE
       vp9_model_to_full_probs_sb(cm->fc.coef_probs_8x8[type][ref],
                                  coef_probs);
+#endif
       seg_eob = 64;
       band_translate = vp9_coefband_trans_8x8plus;
       break;
@@ -320,8 +371,10 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
       TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
           get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
       scan = get_scan_16x16(tx_type);
+#if !CONFIG_BALANCED_COEFTREE
       vp9_model_to_full_probs_sb(cm->fc.coef_probs_16x16[type][ref],
                                  coef_probs);
+#endif
       seg_eob = 256;
       above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
       left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
@@ -330,8 +383,10 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
     }
     case TX_32X32:
       scan = vp9_default_scan_32x32;
+#if !CONFIG_BALANCED_COEFTREE
       vp9_model_to_full_probs_sb(cm->fc.coef_probs_32x32[type][ref],
                                  coef_probs);
+#endif
       seg_eob = 1024;
       above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
       left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
@@ -362,18 +417,30 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
       if (c)
         pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
 
+#if CONFIG_BALANCED_COEFTREE
+      if (!c || token_cache[scan[c - 1]])  // do not skip eob
+        cost += token_costs_noskip[band][pt][t] + vp9_dct_value_cost_ptr[v];
+      else
+        cost += token_costs[band][pt][t] + vp9_dct_value_cost_ptr[v];
+#else
       cost += token_costs[band][pt][t] + vp9_dct_value_cost_ptr[v];
-
       if (!c || token_cache[scan[c - 1]])
         cost += vp9_cost_bit(coef_probs[band][pt][0], 1);
-      token_cache[scan[c]] = t;
+#endif
+      token_cache[scan[c]] = vp9_pt_energy_class[t];
     }
     if (c < seg_eob) {
       if (c)
         pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
+#if CONFIG_BALANCED_COEFTREE
+      cost += mb->token_costs_noskip[tx_size][type][ref]
+          [get_coef_band(band_translate, c)]
+          [pt][DCT_EOB_TOKEN];
+#else
       cost += mb->token_costs[tx_size][type][ref]
           [get_coef_band(band_translate, c)]
           [pt][DCT_EOB_TOKEN];
+#endif
     }
   }
 
@@ -556,9 +623,25 @@ static void super_block_yrd(VP9_COMP *cpi,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   VP9_COMMON *const cm = &cpi->common;
   int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
   vp9_subtract_sby(x, bs);
 
+  if (cpi->speed > 4) {
+    if (bs >= BLOCK_SIZE_SB32X32) {
+      mbmi->txfm_size = TX_32X32;
+    } else if (bs >= BLOCK_SIZE_MB16X16) {
+      mbmi->txfm_size = TX_16X16;
+    } else if (bs >= BLOCK_SIZE_SB8X8) {
+      mbmi->txfm_size = TX_8X8;
+    } else {
+      mbmi->txfm_size = TX_4X4;
+    }
+    super_block_yrd_for_txfm(cm, x, rate, distortion, skip, bs,
+                             mbmi->txfm_size);
+    return;
+  }
   if (bs >= BLOCK_SIZE_SB32X32)
     super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
                              bs, TX_32X32);
@@ -611,11 +694,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
     int64_t this_rd;
     int ratey = 0;
 
-    xd->mode_info_context->bmi[ib].as_mode.first = mode;
-    if (cm->frame_type == KEY_FRAME)
-      rate = bmode_costs[mode];
-    else
-      rate = x->mbmode_cost[cm->frame_type][mode];
+    rate = bmode_costs[mode];
     distortion = 0;
 
     vpx_memcpy(tempa, ta, sizeof(ta));
@@ -653,9 +732,6 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         distortion += vp9_block_error(coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff,
                                                          block, 16), 16) >> 2;
 
-        vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, mode,
-                             dst, xd->plane[0].dst.stride);
-
         if (best_tx_type != DCT_DCT)
           vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
                                dst, xd->plane[0].dst.stride, best_tx_type);
@@ -726,16 +802,15 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
   int64_t total_rd = 0;
   ENTROPY_CONTEXT t_above[4], t_left[4];
   int *bmode_costs;
+  MODE_INFO *const mic = xd->mode_info_context;
 
   vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
   vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
 
-  xd->mode_info_context->mbmi.mode = I4X4_PRED;
-  bmode_costs = mb->inter_bmode_costs;
+  bmode_costs = mb->mbmode_cost;
 
   for (idy = 0; idy < 2; idy += bh) {
     for (idx = 0; idx < 2; idx += bw) {
-      MODE_INFO *const mic = xd->mode_info_context;
       const int mis = xd->mode_info_stride;
       MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
       int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry);
@@ -747,7 +822,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
         const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
                                      left_block_mode(mic, i) : DC_PRED;
 
-        bmode_costs  = mb->bmode_costs[A][L];
+        bmode_costs  = mb->y_mode_costs[A][L];
       }
 
       total_rd += rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
@@ -774,6 +849,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
   *Rate = cost;
   *rate_y = tot_rate_y;
   *Distortion = distortion;
+  xd->mode_info_context->mbmi.mode = mic->bmi[3].as_mode.first;
 
   return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
 }
@@ -785,12 +861,13 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                       int64_t txfm_cache[NB_TXFM_MODES]) {
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  MACROBLOCKD *xd = &x->e_mbd;
+  MACROBLOCKD *const xd = &x->e_mbd;
   int this_rate, this_rate_tokenonly;
   int this_distortion, s;
   int64_t best_rd = INT64_MAX, this_rd;
   TX_SIZE UNINITIALIZED_IS_SAFE(best_tx);
   int i;
+  int *bmode_costs = x->mbmode_cost;
 
   if (bsize < BLOCK_SIZE_SB8X8) {
     x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4;
@@ -805,17 +882,19 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
     int64_t local_txfm_cache[NB_TXFM_MODES];
     MODE_INFO *const mic = xd->mode_info_context;
     const int mis = xd->mode_info_stride;
-    const MB_PREDICTION_MODE A = above_block_mode(mic, 0, mis);
-    const MB_PREDICTION_MODE L = xd->left_available ?
-                                 left_block_mode(mic, 0) : DC_PRED;
-
-    int *bmode_costs  = x->bmode_costs[A][L];
+    if (cpi->common.frame_type == KEY_FRAME) {
+      const MB_PREDICTION_MODE A = above_block_mode(mic, 0, mis);
+      const MB_PREDICTION_MODE L = xd->left_available ?
+                                   left_block_mode(mic, 0) : DC_PRED;
 
+      bmode_costs = x->y_mode_costs[A][L];
+    }
     x->e_mbd.mode_info_context->mbmi.mode = mode;
     vp9_build_intra_predictors_sby_s(&x->e_mbd, bsize);
 
     super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
                     bsize, local_txfm_cache);
+
     this_rate = this_rate_tokenonly + bmode_costs[mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
@@ -925,10 +1004,10 @@ int vp9_cost_mv_ref(VP9_COMP *cpi,
     VP9_COMMON *pc = &cpi->common;
 
     vp9_prob p [VP9_MVREFS - 1];
-    assert(NEARESTMV <= m  &&  m <= SPLITMV);
+    assert(NEARESTMV <= m  &&  m <= NEWMV);
     vp9_mv_ref_probs(pc, p, mode_context);
-    return cost_token(vp9_mv_ref_tree, p,
-                      vp9_mv_ref_encoding_array - NEARESTMV + m);
+    return cost_token(vp9_sb_mv_ref_tree, p,
+                      vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
   } else
     return 0;
 }
@@ -938,19 +1017,18 @@ void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
   x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int;
 }
 
-static int labels2mode(MACROBLOCK *x,
-                       int const *labelings, int which_label,
+static int labels2mode(MACROBLOCK *x, int i,
                        MB_PREDICTION_MODE this_mode,
                        int_mv *this_mv, int_mv *this_second_mv,
                        int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
-                       int_mv seg_mvs[MAX_REF_FRAMES - 1],
+                       int_mv seg_mvs[MAX_REF_FRAMES],
                        int_mv *best_ref_mv,
                        int_mv *second_best_ref_mv,
                        int *mvjcost, int *mvcost[2], VP9_COMP *cpi) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mic = xd->mode_info_context;
   MB_MODE_INFO * mbmi = &mic->mbmi;
-  int i, cost = 0, thismvcost = 0;
+  int cost = 0, thismvcost = 0;
   int idx, idy;
   int bw = 1 << b_width_log2(mbmi->sb_type);
   int bh = 1 << b_height_log2(mbmi->sb_type);
@@ -958,72 +1036,61 @@ static int labels2mode(MACROBLOCK *x,
   /* We have to be careful retrieving previously-encoded motion vectors.
    Ones from this macroblock have to be pulled from the BLOCKD array
    as they have not yet made it to the bmi array in our MB_MODE_INFO. */
-  for (i = 0; i < 4; ++i) {
-    MB_PREDICTION_MODE m;
-
-    if (labelings[i] != which_label)
-      continue;
+  MB_PREDICTION_MODE m;
 
-    {
-      // the only time we should do costing for new motion vector or mode
-      // is when we are on a new label  (jbb May 08, 2007)
-      switch (m = this_mode) {
-        case NEWMV:
-          if (mbmi->second_ref_frame > 0) {
-            this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
-            this_second_mv->as_int =
-            seg_mvs[mbmi->second_ref_frame - 1].as_int;
-          }
-
-          thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
-                                        102, xd->allow_high_precision_mv);
-          if (mbmi->second_ref_frame > 0) {
-            thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
-                                          mvjcost, mvcost, 102,
-                                          xd->allow_high_precision_mv);
-          }
-          break;
-        case NEARESTMV:
-          this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame].as_int;
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int =
-                frame_mv[NEARESTMV][mbmi->second_ref_frame].as_int;
-          break;
-        case NEARMV:
-          this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame].as_int;
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int =
-                frame_mv[NEARMV][mbmi->second_ref_frame].as_int;
-          break;
-        case ZEROMV:
-          this_mv->as_int = 0;
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = 0;
-          break;
-        default:
-          break;
+  // the only time we should do costing for new motion vector or mode
+  // is when we are on a new label  (jbb May 08, 2007)
+  switch (m = this_mode) {
+    case NEWMV:
+      this_mv->as_int = seg_mvs[mbmi->ref_frame].as_int;
+      thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
+                                    102, xd->allow_high_precision_mv);
+      if (mbmi->second_ref_frame > 0) {
+        this_second_mv->as_int = seg_mvs[mbmi->second_ref_frame].as_int;
+        thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
+                                      mvjcost, mvcost, 102,
+                                      xd->allow_high_precision_mv);
       }
+      break;
+    case NEARESTMV:
+      this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame].as_int;
+      if (mbmi->second_ref_frame > 0)
+        this_second_mv->as_int =
+            frame_mv[NEARESTMV][mbmi->second_ref_frame].as_int;
+      break;
+    case NEARMV:
+      this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame].as_int;
+      if (mbmi->second_ref_frame > 0)
+        this_second_mv->as_int =
+            frame_mv[NEARMV][mbmi->second_ref_frame].as_int;
+      break;
+    case ZEROMV:
+      this_mv->as_int = 0;
+      if (mbmi->second_ref_frame > 0)
+        this_second_mv->as_int = 0;
+      break;
+    default:
+      break;
+  }
 
-      cost = vp9_cost_mv_ref(cpi, this_mode,
-                             mbmi->mb_mode_context[mbmi->ref_frame]);
-    }
+  cost = vp9_cost_mv_ref(cpi, this_mode,
+                         mbmi->mb_mode_context[mbmi->ref_frame]);
 
-    mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame > 0)
-      mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
+  mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
+  if (mbmi->second_ref_frame > 0)
+    mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
 
-    x->partition_info->bmi[i].mode = m;
-    x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame > 0)
-      x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
-    for (idy = 0; idy < bh; ++idy) {
-      for (idx = 0; idx < bw; ++idx) {
-        vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
-                   &mic->bmi[i], sizeof(mic->bmi[i]));
-        vpx_memcpy(&x->partition_info->bmi[i + idy * 2 + idx],
-                   &x->partition_info->bmi[i],
-                   sizeof(x->partition_info->bmi[i]));
-      }
+  x->partition_info->bmi[i].mode = m;
+  x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
+  if (mbmi->second_ref_frame > 0)
+    x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
+  for (idy = 0; idy < bh; ++idy) {
+    for (idx = 0; idx < bw; ++idx) {
+      vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
+                 &mic->bmi[i], sizeof(mic->bmi[i]));
+      vpx_memcpy(&x->partition_info->bmi[i + idy * 2 + idx],
+                 &x->partition_info->bmi[i],
+                 sizeof(x->partition_info->bmi[i]));
     }
   }
 
@@ -1033,90 +1100,86 @@ static int labels2mode(MACROBLOCK *x,
 
 static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
                                        MACROBLOCK *x,
-                                       int const *labels,
-                                       int which_label,
+                                       int i,
                                        int *labelyrate,
                                        int *distortion,
                                        ENTROPY_CONTEXT *ta,
                                        ENTROPY_CONTEXT *tl) {
-  int i, k;
+  int k;
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
   int bwl = b_width_log2(bsize), bw = 1 << bwl;
   int bhl = b_height_log2(bsize), bh = 1 << bhl;
   int idx, idy;
+  const int src_stride = x->plane[0].src.stride;
+  uint8_t* const src =
+  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            x->plane[0].src.buf, src_stride);
+  int16_t* src_diff =
+  raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            x->plane[0].src_diff);
+  int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
+  uint8_t* const pre =
+  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            xd->plane[0].pre[0].buf,
+                            xd->plane[0].pre[0].stride);
+  uint8_t* const dst =
+  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            xd->plane[0].dst.buf,
+                            xd->plane[0].dst.stride);
+  int thisdistortion = 0;
+  int thisrate = 0;
 
   *labelyrate = 0;
   *distortion = 0;
-  for (i = 0; i < 4; i++) {
-    if (labels[i] == which_label) {
-      const int src_stride = x->plane[0].src.stride;
-      uint8_t* const src =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                                x->plane[0].src.buf, src_stride);
-      int16_t* src_diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
-                                x->plane[0].src_diff);
-      int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
-      uint8_t* const pre =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                                xd->plane[0].pre[0].buf,
-                                xd->plane[0].pre[0].stride);
-      uint8_t* const dst =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                                xd->plane[0].dst.buf,
-                                xd->plane[0].dst.stride);
-      int thisdistortion = 0;
-      int thisrate = 0;
-
-      vp9_build_inter_predictor(pre,
-                                xd->plane[0].pre[0].stride,
-                                dst,
-                                xd->plane[0].dst.stride,
-                                &xd->mode_info_context->bmi[i].as_mv[0],
-                                &xd->scale_factor[0],
-                                4 * bw, 4 * bh, 0 /* no avg */, &xd->subpix);
-
-      // TODO(debargha): Make this work properly with the
-      // implicit-compoundinter-weight experiment when implicit
-      // weighting for splitmv modes is turned on.
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        uint8_t* const second_pre =
-        raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                                  xd->plane[0].pre[1].buf,
-                                  xd->plane[0].pre[1].stride);
-        vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride,
-                                  dst, xd->plane[0].dst.stride,
-                                  &xd->mode_info_context->bmi[i].as_mv[1],
-                                  &xd->scale_factor[1], 4 * bw, 4 * bh, 1,
-                                  &xd->subpix);
-      }
 
-      vp9_subtract_block(4 * bh, 4 * bw, src_diff, 8,
-                         src, src_stride,
-                         dst, xd->plane[0].dst.stride);
+  vp9_build_inter_predictor(pre,
+                            xd->plane[0].pre[0].stride,
+                            dst,
+                            xd->plane[0].dst.stride,
+                            &xd->mode_info_context->bmi[i].as_mv[0],
+                            &xd->scale_factor[0],
+                            4 * bw, 4 * bh, 0 /* no avg */, &xd->subpix);
+
+  // TODO(debargha): Make this work properly with the
+  // implicit-compoundinter-weight experiment when implicit
+  // weighting for splitmv modes is turned on.
+  if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
+    uint8_t* const second_pre =
+    raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                              xd->plane[0].pre[1].buf,
+                              xd->plane[0].pre[1].stride);
+    vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride,
+                              dst, xd->plane[0].dst.stride,
+                              &xd->mode_info_context->bmi[i].as_mv[1],
+                              &xd->scale_factor[1], 4 * bw, 4 * bh, 1,
+                              &xd->subpix);
+  }
 
-      k = i;
-      for (idy = 0; idy < bh; ++idy) {
-        for (idx = 0; idx < bw; ++idx) {
-          k += (idy * 2 + idx);
-          src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
-                                               x->plane[0].src_diff);
-          coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, k);
-          x->fwd_txm4x4(src_diff, coeff, 16);
-          x->quantize_b_4x4(x, k, DCT_DCT, 16);
-          thisdistortion += vp9_block_error(coeff,
-                                            BLOCK_OFFSET(xd->plane[0].dqcoeff,
-                                                         k, 16), 16);
-          thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
-                                  ta + (k & 1),
-                                  tl + (k >> 1), TX_4X4, 16);
-        }
-      }
-      *distortion += thisdistortion;
-      *labelyrate += thisrate;
+  vp9_subtract_block(4 * bh, 4 * bw, src_diff, 8,
+                     src, src_stride,
+                     dst, xd->plane[0].dst.stride);
+
+  k = i;
+  for (idy = 0; idy < bh; ++idy) {
+    for (idx = 0; idx < bw; ++idx) {
+      k += (idy * 2 + idx);
+      src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
+                                           x->plane[0].src_diff);
+      coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, k);
+      x->fwd_txm4x4(src_diff, coeff, 16);
+      x->quantize_b_4x4(x, k, DCT_DCT, 16);
+      thisdistortion += vp9_block_error(coeff,
+                                        BLOCK_OFFSET(xd->plane[0].dqcoeff,
+                                                     k, 16), 16);
+      thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
+                              ta + (k & 1),
+                              tl + (k >> 1), TX_4X4, 16);
     }
   }
+  *distortion += thisdistortion;
+  *labelyrate += thisrate;
+
   *distortion >>= 2;
   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
 }
@@ -1188,11 +1251,45 @@ static enum BlockSize get_block_size(int bw, int bh) {
   return -1;
 }
 
+// Points the luma source/prediction buffers of x at raster sub-block i of a
+// BLOCK_SIZE_SB8X8 block. pre[1] is moved only for compound prediction, i.e.
+// second_ref_frame > 0; the field can be negative, so a truth test misfires.
+static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct buf_2d *const src = &x->plane[0].src;
+  struct buf_2d *const pre = xd->plane[0].pre;
+  const int is_compound = xd->mode_info_context->mbmi.second_ref_frame > 0;
+  src->buf = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                       src->buf, src->stride);
+  assert(((intptr_t)pre[0].buf & 0x7) == 0);
+  pre[0].buf = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                         pre[0].buf, pre[0].stride);
+  if (is_compound)
+    pre[1].buf = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                           pre[1].buf, pre[1].stride);
+}
+
+// Undoes mi_buf_shift() by reinstating the buffers the caller saved earlier.
+static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
+                                  struct buf_2d orig_pre[2]) {
+  x->plane[0].src = orig_src;
+  x->e_mbd.plane[0].pre[0] = orig_pre[0];
+  if (x->e_mbd.mode_info_context->mbmi.second_ref_frame > 0)
+    x->e_mbd.plane[0].pre[1] = orig_pre[1];
+}
+
+static void iterative_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE_TYPE bsize,
+                                    int_mv *frame_mv,
+                                    YV12_BUFFER_CONFIG **scaled_ref_frame,
+                                    int mi_row, int mi_col,
+                                    int_mv single_newmv[MAX_REF_FRAMES]);
+
 static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                     BEST_SEG_INFO *bsi,
-                                    int_mv seg_mvs[4][MAX_REF_FRAMES - 1]) {
+                                    int_mv seg_mvs[4][MAX_REF_FRAMES],
+                                    int mi_row, int mi_col) {
   int i, j;
-  static const int labels[4] = { 0, 1, 2, 3 };
   int br = 0, bd = 0;
   MB_PREDICTION_MODE this_mode;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
@@ -1208,7 +1305,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   int bhl = b_height_log2(bsize), bh = 1 << bhl;
   int idx, idy;
   vp9_variance_fn_ptr_t *v_fn_ptr;
-
+  YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
   ENTROPY_CONTEXT t_above[4], t_left[4];
   ENTROPY_CONTEXT t_above_b[4], t_left_b[4];
 
@@ -1255,18 +1352,21 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
         int distortion;
         int labelyrate;
         ENTROPY_CONTEXT t_above_s[4], t_left_s[4];
+        const struct buf_2d orig_src = x->plane[0].src;
+        struct buf_2d orig_pre[2];
+
+        vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre));
 
         vpx_memcpy(t_above_s, t_above, sizeof(t_above_s));
         vpx_memcpy(t_left_s, t_left, sizeof(t_left_s));
 
         // motion search for newmv (single predictor case only)
         if (mbmi->second_ref_frame <= 0 && this_mode == NEWMV) {
-          int sseshift, n;
           int step_param = 0;
           int further_steps;
           int thissme, bestsme = INT_MAX;
-          const struct buf_2d orig_src = x->plane[0].src;
-          const struct buf_2d orig_pre = x->e_mbd.plane[0].pre[0];
+          int sadpb = x->sadperbit4;
+          int_mv mvp_full;
 
           /* Is the best so far sufficiently good that we cant justify doing
            * and new motion search. */
@@ -1287,55 +1387,35 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
 
           further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
 
-          {
-            int sadpb = x->sadperbit4;
-            int_mv mvp_full;
-
-            mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
-            mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
-
-            // find first label
-            n = i;
-
-            // adjust src pointer for this segment
-            x->plane[0].src.buf =
-            raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
-                                      x->plane[0].src.buf,
-                                      x->plane[0].src.stride);
-            assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0x7) == 0);
-            x->e_mbd.plane[0].pre[0].buf =
-            raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
-                                      x->e_mbd.plane[0].pre[0].buf,
-                                      x->e_mbd.plane[0].pre[0].stride);
-
-            bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
-                                             sadpb, further_steps, 0, v_fn_ptr,
-                                             bsi->ref_mv, &mode_mv[NEWMV]);
-
-            sseshift = 0;
-
-            // Should we do a full search (best quality only)
-            if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
-              /* Check if mvp_full is within the range. */
-              clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
-                       x->mv_row_min, x->mv_row_max);
-
-              thissme = cpi->full_search_sad(x, &mvp_full,
-                                             sadpb, 16, v_fn_ptr,
-                                             x->nmvjointcost, x->mvcost,
-                                             bsi->ref_mv,
-                                             n);
-
-              if (thissme < bestsme) {
-                bestsme = thissme;
-                mode_mv[NEWMV].as_int =
-                x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int;
-              } else {
-                /* The full search result is actually worse so re-instate the
-                 * previous best vector */
-                x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int =
-                mode_mv[NEWMV].as_int;
-              }
+          mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
+          mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
+
+          // adjust src pointer for this block
+          mi_buf_shift(x, i);
+          bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
+                                           sadpb, further_steps, 0, v_fn_ptr,
+                                           bsi->ref_mv, &mode_mv[NEWMV]);
+
+          // Should we do a full search (best quality only)
+          if (cpi->compressor_speed == 0) {
+            /* Check if mvp_full is within the range. */
+            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
+                     x->mv_row_min, x->mv_row_max);
+
+            thissme = cpi->full_search_sad(x, &mvp_full,
+                                           sadpb, 16, v_fn_ptr,
+                                           x->nmvjointcost, x->mvcost,
+                                           bsi->ref_mv, i);
+
+            if (thissme < bestsme) {
+              bestsme = thissme;
+              mode_mv[NEWMV].as_int =
+                  x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int;
+            } else {
+              /* The full search result is actually worse so re-instate the
+               * previous best vector */
+              x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int =
+                  mode_mv[NEWMV].as_int;
             }
           }
 
@@ -1348,23 +1428,32 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                          &distortion, &sse);
 
             // safe motion search result for use in compound prediction
-            seg_mvs[i][mbmi->ref_frame - 1].as_int = mode_mv[NEWMV].as_int;
+            seg_mvs[i][mbmi->ref_frame].as_int = mode_mv[NEWMV].as_int;
           }
 
           // restore src pointers
-          x->plane[0].src = orig_src;
-          x->e_mbd.plane[0].pre[0] = orig_pre;
+          mi_buf_restore(x, orig_src, orig_pre);
         } else if (mbmi->second_ref_frame > 0 && this_mode == NEWMV) {
-          /* NEW4X4 */
-          /* motion search not completed? Then skip newmv for this block with
-           * comppred */
-          if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
-              seg_mvs[i][mbmi->ref_frame        - 1].as_int == INVALID_MV) {
+          if (seg_mvs[i][mbmi->second_ref_frame].as_int == INVALID_MV ||
+              seg_mvs[i][mbmi->ref_frame       ].as_int == INVALID_MV)
             continue;
+
+          // adjust src pointers
+          mi_buf_shift(x, i);
+          if (cpi->sf.comp_inter_joint_search) {
+            iterative_motion_search(cpi, x, bsize, frame_mv[this_mode],
+                                    scaled_ref_frame,
+                                    mi_row, mi_col, seg_mvs[i]);
+            seg_mvs[i][mbmi->ref_frame].as_int =
+                frame_mv[this_mode][mbmi->ref_frame].as_int;
+            seg_mvs[i][mbmi->second_ref_frame].as_int =
+                frame_mv[this_mode][mbmi->second_ref_frame].as_int;
           }
+          // restore src pointers
+          mi_buf_restore(x, orig_src, orig_pre);
         }
 
-        rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
+        rate = labels2mode(x, i, this_mode, &mode_mv[this_mode],
                            &second_mode_mv[this_mode], frame_mv, seg_mvs[i],
                            bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
                            x->mvcost, cpi);
@@ -1381,7 +1470,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
           continue;
 
         this_rd = encode_inter_mb_segment(&cpi->common,
-                                          x, labels, i, &labelyrate,
+                                          x, i, &labelyrate,
                                           &distortion, t_above_s, t_left_s);
         this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
         rate += labelyrate;
@@ -1392,10 +1481,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
           bestlabelyrate = labelyrate;
           mode_selected = this_mode;
           best_label_rd = this_rd;
-          for (j = 0; j < 4; j++)
-            if (labels[j] == i)
-              best_eobs[j] = x->e_mbd.plane[0].eobs[j];
-
+          best_eobs[i] = x->e_mbd.plane[0].eobs[i];
           vpx_memcpy(t_above_b, t_above_s, sizeof(t_above_s));
           vpx_memcpy(t_left_b, t_left_s, sizeof(t_left_s));
         }
@@ -1404,7 +1490,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
       vpx_memcpy(t_above, t_above_b, sizeof(t_above));
       vpx_memcpy(t_left, t_left_b, sizeof(t_left));
 
-      labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
+      labels2mode(x, i, mode_selected, &mode_mv[mode_selected],
                   &second_mode_mv[mode_selected], frame_mv, seg_mvs[i],
                   bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
                   x->mvcost, cpi);
@@ -1443,12 +1529,6 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   }
 }
 
-static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x,
-                             BEST_SEG_INFO *bsi,
-                             int_mv seg_mvs[4][MAX_REF_FRAMES - 1]) {
-  rd_check_segment_txsize(cpi, x, bsi, seg_mvs);
-}
-
 static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
                                        int_mv *best_ref_mv,
                                        int_mv *second_best_ref_mv,
@@ -1457,7 +1537,8 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
                                        int *returnyrate,
                                        int *returndistortion,
                                        int *skippable, int mvthresh,
-                                       int_mv seg_mvs[4][MAX_REF_FRAMES - 1]) {
+                                       int_mv seg_mvs[4][MAX_REF_FRAMES],
+                                       int mi_row, int mi_col) {
   int i;
   BEST_SEG_INFO bsi;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
@@ -1473,7 +1554,7 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
   for (i = 0; i < 4; i++)
     bsi.modes[i] = ZEROMV;
 
-  rd_check_segment(cpi, x, &bsi, seg_mvs);
+  rd_check_segment_txsize(cpi, x, &bsi, seg_mvs, mi_row, mi_col);
 
   /* set it to the best */
   for (i = 0; i < 4; i++) {
@@ -1504,6 +1585,7 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
   *returndistortion = bsi.d;
   *returnyrate = bsi.segment_yrate;
   *skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_SB8X8);
+  mbmi->mode = bsi.modes[3];
 
   return (int)(bsi.segment_rd);
 }
@@ -1878,6 +1960,154 @@ static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) {
   return SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
 }
 
+static void iterative_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE_TYPE bsize,
+                                    int_mv *frame_mv,  // in/out, by ref frame
+                                    YV12_BUFFER_CONFIG **scaled_ref_frame,
+                                    int mi_row, int mi_col,
+                                    int_mv single_newmv[MAX_REF_FRAMES]) {
+  int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  int refs[2] = { mbmi->ref_frame,
+                  (mbmi->second_ref_frame < 0 ? 0 : mbmi->second_ref_frame) };
+  int_mv ref_mv[2];
+  const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
+  int ite;
+  // Scratch predictor (pw x ph). NOTE(review): allocation is not NULL-checked.
+  uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
+  // Joint motion search for compound prediction: each pass refines the mv of
+  // one reference against a predictor built from the other. Backups of pre[]:
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+  struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
+  struct buf_2d scaled_first_yv12;
+  int last_besterr[2] = {INT_MAX, INT_MAX};
+
+  ref_mv[0] = mbmi->ref_mvs[refs[0]][0];
+  ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
+
+  if (scaled_ref_frame[0]) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+    setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
+                     NULL, NULL);
+  }
+
+  if (scaled_ref_frame[1]) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_second_yv12[i] = xd->plane[i].pre[1];
+    // NOTE(review): same argument slot as ref 0 above - confirm pre[1] is set.
+    setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
+                     NULL, NULL);
+  }
+
+  xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
+                                          mi_row, mi_col);
+  xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
+                                          mi_row, mi_col);
+  scaled_first_yv12 = xd->plane[0].pre[0];
+
+  // Initialize mv using single prediction mode result.
+  frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+  frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+  // Allow joint search multiple times iteratively for each ref frame
+  // and break out the search loop if it couldn't find better mv.
+  for (ite = 0; ite < 4; ite++) {
+    struct buf_2d ref_yv12[2];
+    int bestsme = INT_MAX;
+    int sadpb = x->sadperbit16;
+    int_mv tmp_mv;
+    int search_range = 3;
+
+    int tmp_col_min = x->mv_col_min;
+    int tmp_col_max = x->mv_col_max;
+    int tmp_row_min = x->mv_row_min;
+    int tmp_row_max = x->mv_row_max;
+    int id = ite % 2;  // reference refined in this pass: 0, 1, 0, 1
+
+    // Initialized here because of compiler problem in Visual Studio.
+    ref_yv12[0] = xd->plane[0].pre[0];
+    ref_yv12[1] = xd->plane[0].pre[1];
+
+    // Build the predictor from the other reference (!id), which stays fixed.
+    vp9_build_inter_predictor(ref_yv12[!id].buf,
+                              ref_yv12[!id].stride,
+                              second_pred, pw,
+                              &frame_mv[refs[!id]],
+                              &xd->scale_factor[!id],
+                              pw, ph, 0,
+                              &xd->subpix);
+
+    // When refining ref 1, temporarily expose its buffer as pre[0].
+    if (id)
+      xd->plane[0].pre[0] = ref_yv12[id];
+    vp9_clamp_mv_min_max(x, &ref_mv[id]);
+
+    // Use mv result from single mode as mvp.
+    tmp_mv.as_int = frame_mv[refs[id]].as_int;
+
+    tmp_mv.as_mv.col >>= 3;
+    tmp_mv.as_mv.row >>= 3;
+
+    // Small-range full-pixel motion search
+    bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
+                                       search_range,
+                                       &cpi->fn_ptr[block_size],
+                                       x->nmvjointcost, x->mvcost,
+                                       &ref_mv[id], second_pred,
+                                       pw, ph);
+    // Put back the mv limits saved in tmp_* at the top of this iteration.
+    x->mv_col_min = tmp_col_min;
+    x->mv_col_max = tmp_col_max;
+    x->mv_row_min = tmp_row_min;
+    x->mv_row_max = tmp_row_max;
+
+    if (bestsme < INT_MAX) {
+      int dis; /* TODO: use dis in distortion calculation later. */
+      unsigned int sse;
+
+      bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv,
+                                             &ref_mv[id],
+                                             x->errorperbit,
+                                             &cpi->fn_ptr[block_size],
+                                             x->nmvjointcost, x->mvcost,
+                                             &dis, &sse, second_pred,
+                                             pw, ph);
+    }
+    // Undo the temporary pre[0] swap made before the search.
+    if (id)
+      xd->plane[0].pre[0] = scaled_first_yv12;
+    // Keep the new mv only if it beat this ref's previous best; else stop.
+    if (bestsme < last_besterr[id]) {
+      frame_mv[refs[id]].as_int = tmp_mv.as_int;
+      last_besterr[id] = bestsme;
+    } else {
+      break;
+    }
+  }
+
+  // restore the predictor
+  if (scaled_ref_frame[0]) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+
+  if (scaled_ref_frame[1]) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[1] = backup_second_yv12[i];
+  }
+
+  vpx_free(second_pred);
+}
+
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE_TYPE bsize,
                                  int64_t txfm_cache[],
@@ -1920,145 +2150,13 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
 
       if (is_comp_pred) {
-        if (cpi->sf.comp_inter_joint_serach) {
-          int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
-          int ite;
-          // Prediction buffer from second frame.
-          uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
-
-          // Do joint motion search in compound mode to get more accurate mv.
-          struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
-          struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
-          struct buf_2d scaled_first_yv12;
-          int last_besterr[2] = {INT_MAX, INT_MAX};
-
-          if (scaled_ref_frame[0]) {
-            int i;
-
-            // Swap out the reference frame for a version that's been scaled to
-            // match the resolution of the current frame, allowing the existing
-            // motion search code to be used without additional modifications.
-            for (i = 0; i < MAX_MB_PLANE; i++)
-              backup_yv12[i] = xd->plane[i].pre[0];
-
-            setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
-                             NULL, NULL);
-          }
-
-          if (scaled_ref_frame[1]) {
-            int i;
-
-            for (i = 0; i < MAX_MB_PLANE; i++)
-              backup_second_yv12[i] = xd->plane[i].pre[1];
-
-            setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
-                             NULL, NULL);
-          }
-          xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
-                                                  mi_row, mi_col);
-          xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
-                                                  mi_row, mi_col);
-
-          scaled_first_yv12 = xd->plane[0].pre[0];
-
-          // Initialize mv using single prediction mode result.
-          frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
-          frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
-
-          // Allow joint search multiple times iteratively for each ref frame
-          // and break out the search loop if it couldn't find better mv.
-          for (ite = 0; ite < 4; ite++) {
-            struct buf_2d ref_yv12[2];
-            int bestsme = INT_MAX;
-            int sadpb = x->sadperbit16;
-            int_mv tmp_mv;
-            int search_range = 3;
-
-            int tmp_col_min = x->mv_col_min;
-            int tmp_col_max = x->mv_col_max;
-            int tmp_row_min = x->mv_row_min;
-            int tmp_row_max = x->mv_row_max;
-            int id = ite % 2;
-
-            // Initialized here because of compiler problem in Visual Studio.
-            ref_yv12[0] = xd->plane[0].pre[0];
-            ref_yv12[1] = xd->plane[0].pre[1];
-
-            // Get pred block from second frame.
-            vp9_build_inter_predictor(ref_yv12[!id].buf,
-                                      ref_yv12[!id].stride,
-                                      second_pred, pw,
-                                      &frame_mv[refs[!id]],
-                                      &xd->scale_factor[!id],
-                                      pw, ph, 0,
-                                      &xd->subpix);
-
-            // Compound motion search on first ref frame.
-            if (id)
-              xd->plane[0].pre[0] = ref_yv12[id];
-            vp9_clamp_mv_min_max(x, &ref_mv[id]);
-
-            // Use mv result from single mode as mvp.
-            tmp_mv.as_int = frame_mv[refs[id]].as_int;
-
-            tmp_mv.as_mv.col >>= 3;
-            tmp_mv.as_mv.row >>= 3;
-
-            // Small-range full-pixel motion search
-            bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
-                                               search_range,
-                                               &cpi->fn_ptr[block_size],
-                                               x->nmvjointcost, x->mvcost,
-                                               &ref_mv[id], second_pred,
-                                               pw, ph);
-
-            x->mv_col_min = tmp_col_min;
-            x->mv_col_max = tmp_col_max;
-            x->mv_row_min = tmp_row_min;
-            x->mv_row_max = tmp_row_max;
-
-            if (bestsme < INT_MAX) {
-              int dis; /* TODO: use dis in distortion calculation later. */
-              unsigned int sse;
-
-              bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv,
-                                                     &ref_mv[id],
-                                                     x->errorperbit,
-                                                     &cpi->fn_ptr[block_size],
-                                                     x->nmvjointcost, x->mvcost,
-                                                     &dis, &sse, second_pred,
-                                                     pw, ph);
-            }
-
-            if (id)
-              xd->plane[0].pre[0] = scaled_first_yv12;
-
-            if (bestsme < last_besterr[id]) {
-              frame_mv[refs[id]].as_int =
-                  xd->mode_info_context->bmi[0].as_mv[1].as_int = tmp_mv.as_int;
-              last_besterr[id] = bestsme;
-            } else {
-              break;
-            }
-          }
-
-          // restore the predictor
-          if (scaled_ref_frame[0]) {
-            int i;
-
-            for (i = 0; i < MAX_MB_PLANE; i++)
-              xd->plane[i].pre[0] = backup_yv12[i];
-          }
+        // Initialize mv using single prediction mode result.
+        frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+        frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
 
-          if (scaled_ref_frame[1]) {
-            int i;
-
-            for (i = 0; i < MAX_MB_PLANE; i++)
-              xd->plane[i].pre[1] = backup_second_yv12[i];
-          }
-
-          vpx_free(second_pred);
-        }
+        if (cpi->sf.comp_inter_joint_search)
+          iterative_motion_search(cpi, x, bsize, frame_mv, scaled_ref_frame,
+                                  mi_row, mi_col, single_newmv);
 
         if (frame_mv[refs[0]].as_int == INVALID_MV ||
             frame_mv[refs[1]].as_int == INVALID_MV)
@@ -2134,8 +2232,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                        x->nmvjointcost, x->mvcost,
                                        &dis, &sse);
         }
-        frame_mv[refs[0]].as_int =
-          xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+        frame_mv[refs[0]].as_int = tmp_mv.as_int;
         single_newmv[refs[0]].as_int = tmp_mv.as_int;
 
         // Add the new motion vector cost to our rolling cost variable
@@ -2191,7 +2288,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                  (mbmi->mv[1].as_mv.col & 15) == 0;
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
-  if (1) {
+  if (cpi->speed > 4) {
+    *best_filter = EIGHTTAP;
+  } else {
     int i, newbest;
     int tmp_rate_sum = 0, tmp_dist_sum = 0;
     for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
@@ -2328,6 +2427,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     // Y cost and distortion
     super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y,
                     bsize, txfm_cache);
+
     *rate2 += *rate_y;
     *distortion += *distortion_y;
 
@@ -2393,16 +2493,14 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     *returnrate = rate4x4_y + rate_uv +
         vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
     *returndist = dist4x4_y + (dist_uv >> 2);
-    for (i = 0; i < NB_TXFM_MODES; i++) {
-      ctx->txfm_rd_diff[i] = MIN(err4x4, err - txfm_cache[i]);
-    }
+    vpx_memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
     xd->mode_info_context->mbmi.txfm_size = TX_4X4;
   } else {
     *returnrate = rate_y + rate_uv +
         vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
     *returndist = dist_y + (dist_uv >> 2);
     for (i = 0; i < NB_TXFM_MODES; i++) {
-      ctx->txfm_rd_diff[i] = MIN(err4x4, err - txfm_cache[i]);
+      ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->txfm_mode];
     }
     xd->mode_info_context->mbmi.txfm_size = txfm_size;
     xd->mode_info_context->mbmi.mode = mode;
@@ -2457,14 +2555,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
   int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
                                              cpi->common.y_dc_delta_q);
-  int_mv seg_mvs[4][MAX_REF_FRAMES - 1];
+  int_mv seg_mvs[4][MAX_REF_FRAMES];
   union b_mode_info best_bmodes[4];
   PARTITION_INFO best_partition;
 
   for (i = 0; i < 4; i++) {
     int j;
 
-    for (j = 0; j < MAX_REF_FRAMES - 1; j++)
+    for (j = 0; j < MAX_REF_FRAMES; j++)
       seg_mvs[i][j].as_int = INVALID_MV;
   }
   // Everywhere the flag is set the error is much higher than its neighbors.
@@ -2563,11 +2661,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     this_mode = vp9_mode_order[mode_index].mode;
     ref_frame = vp9_mode_order[mode_index].ref_frame;
 
-    if (!(ref_frame == INTRA_FRAME
-        || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
-      continue;
-    }
-
     if (cpi->speed > 0 && bsize >= BLOCK_SIZE_SB8X8) {
       if (!(ref_frame_mask & (1 << ref_frame))) {
         continue;
@@ -2585,6 +2678,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     mbmi->ref_frame = ref_frame;
     mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
 
+    if (!(ref_frame == INTRA_FRAME
+        || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
+      continue;
+    }
+    if (!(mbmi->second_ref_frame == NONE
+        || (cpi->ref_frame_flags & flag_list[mbmi->second_ref_frame]))) {
+      continue;
+    }
+
     // TODO(jingning, jkoleszar): scaling reference frame not supported for
     // SPLITMV.
     if (mbmi->ref_frame > 0 &&
@@ -2680,8 +2782,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     if (this_mode == I4X4_PRED) {
       int rate;
 
-      // Note the rate value returned here includes the cost of coding
-      // the I4X4_PRED mode : x->mbmode_cost[xd->frame_type][I4X4_PRED];
       mbmi->txfm_size = TX_4X4;
       rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
                                 &distortion_y, INT64_MAX);
@@ -2716,7 +2816,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       skippable = skippable && skip_uv[uv_tx];
       mbmi->uv_mode = mode_uv[uv_tx];
 
-      rate2 = rate_y + x->mbmode_cost[cm->frame_type][mbmi->mode] + rate_uv;
+      rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv;
       if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
@@ -2755,7 +2855,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                              second_ref, INT64_MAX,
                                              &rate, &rate_y, &distortion,
                                              &skippable,
-                                             (int)this_rd_thresh, seg_mvs);
+                                             (int)this_rd_thresh, seg_mvs,
+                                             mi_row, mi_col);
         if (cpi->common.mcomp_filter_type == SWITCHABLE) {
           const int rs = get_switchable_rate(cm, x);
           tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0);
@@ -2794,7 +2895,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                              second_ref, INT64_MAX,
                                              &rate, &rate_y, &distortion,
                                              &skippable,
-                                             (int)this_rd_thresh, seg_mvs);
+                                             (int)this_rd_thresh, seg_mvs,
+                                             mi_row, mi_col);
       } else {
         if (cpi->common.mcomp_filter_type == SWITCHABLE) {
           int rs = get_switchable_rate(cm, x);
@@ -2843,7 +2945,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
       compmode_cost =
           vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
-      mbmi->mode = this_mode;
     } else {
       YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
       int fb = get_ref_frame_idx(cpi, mbmi->ref_frame);
@@ -2938,14 +3039,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       best_mode = this_mode;
     }
 
-    // Store the respective mode distortions for later use.
-    if (mode_distortions[this_mode] == -1
-        || distortion2 < mode_distortions[this_mode]) {
-      mode_distortions[this_mode] = distortion2;
-    }
-    if (frame_distortions[mbmi->ref_frame] == -1
-        || distortion2 < frame_distortions[mbmi->ref_frame]) {
-      frame_distortions[mbmi->ref_frame] = distortion2;
+    if (this_mode != I4X4_PRED && this_mode != SPLITMV) {
+      // Store the respective mode distortions for later use.
+      if (mode_distortions[this_mode] == -1
+          || distortion2 < mode_distortions[this_mode]) {
+        mode_distortions[this_mode] = distortion2;
+      }
+      if (frame_distortions[mbmi->ref_frame] == -1
+          || distortion2 < frame_distortions[mbmi->ref_frame]) {
+        frame_distortions[mbmi->ref_frame] = distortion2;
+      }
     }
 
     // Did this mode help.. i.e. is it the new best mode
@@ -2954,7 +3057,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         // Note index of best mode so far
         best_mode_index = mode_index;
 
-        if (this_mode <= I4X4_PRED) {
+        if (ref_frame == INTRA_FRAME) {
           /* required for left and above block mv */
           mbmi->mv[0].as_int = 0;
         }
@@ -3052,8 +3155,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   // Flag all modes that have a distortion thats > 2x the best we found at
   // this level.
   for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
-    if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV
-        || mode_index == SPLITMV)
+    if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV)
       continue;
 
     if (mode_distortions[mode_index] > 2 * *returndistortion) {
@@ -3077,7 +3179,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
   assert((cm->mcomp_filter_type == SWITCHABLE) ||
          (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
-         (best_mbmode.mode <= I4X4_PRED));
+         (best_mbmode.ref_frame == INTRA_FRAME));
 
   // Accumulate filter usage stats
   // TODO(agrange): Use RD criteria to select interpolation filter mode.
@@ -3129,13 +3231,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
   // macroblock modes
   vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
-  if (best_mbmode.mode == I4X4_PRED) {
+  if (best_mbmode.ref_frame == INTRA_FRAME &&
+      best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
     for (i = 0; i < 4; i++) {
       xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
     }
   }
 
-  if (best_mbmode.mode == SPLITMV) {
+  if (best_mbmode.ref_frame != INTRA_FRAME &&
+      best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
     for (i = 0; i < 4; i++)
       xd->mode_info_context->bmi[i].as_mv[0].as_int =
           best_bmodes[i].as_mv[0].as_int;
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 08efc84d4..79f72bb4b 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -36,6 +36,9 @@ extern vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES];
 extern vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES];
 #endif  /* ENTROPY_STATS */
 
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+
 static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
 const TOKENVALUE *vp9_dct_value_tokens_ptr;
 static int dct_value_cost[DCT_MAX_VALUE * 2];
@@ -221,14 +224,24 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
     t->token = token;
     t->context_tree = coef_probs[type][ref][band][pt];
     t->skip_eob_node = (c > 0) && (token_cache[scan[c - 1]] == 0);
+
+#if CONFIG_BALANCED_COEFTREE
+    assert(token <= ZERO_TOKEN ||
+           vp9_coef_encodings[t->token].len - t->skip_eob_node > 0);
+#else
     assert(vp9_coef_encodings[t->token].len - t->skip_eob_node > 0);
+#endif
 
     if (!dry_run) {
       ++counts[type][ref][band][pt][token];
+#if CONFIG_BALANCED_COEFTREE
+      if (!t->skip_eob_node && token > ZERO_TOKEN)
+#else
       if (!t->skip_eob_node)
+#endif
         ++cpi->common.fc.eob_branch_counts[tx_size][type][ref][band][pt];
     }
-    token_cache[scan[c]] = token;
+    token_cache[scan[c]] = vp9_pt_energy_class[token];
     ++t;
   } while (c < eob && ++c < seg_eob);
 
diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c
index 7231dcf22..e9b36f356 100644
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c
@@ -318,6 +318,11 @@ unsigned int vp9_variance16x8_c(const uint8_t *src_ptr,
   return (var - (((unsigned int)avg * avg) >> 7));
 }
 
+void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride,
+                       const uint8_t *ref_ptr, int ref_stride,
+                       unsigned int *sse, int *sum) {  // fixed 8x8 variance()
+  variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
+}  // NOTE(review): assumes variance() stores its results via sse/sum -- confirm
 
 unsigned int vp9_variance8x8_c(const uint8_t *src_ptr,
                                int  source_stride,
diff --git a/vp9/encoder/vp9_write_bit_buffer.h b/vp9/encoder/vp9_write_bit_buffer.h
new file mode 100644
index 000000000..18cf40366
--- /dev/null
+++ b/vp9/encoder/vp9_write_bit_buffer.h
@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_BIT_WRITE_BUFFER_H_
+#define VP9_BIT_WRITE_BUFFER_H_
+
+#include "vpx/vpx_integer.h"
+
+struct vp9_write_bit_buffer {
+  uint8_t *bit_buffer;  // bytes the bits are written into
+  size_t bit_offset;    // index of the next bit to write (MSB-first per byte)
+};
+
+static size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb) {
+  return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);  // ceil
+}  // NOTE(review): "rb" prefix looks like a typo for "wb" -- confirm w/ callers
+
+static void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) {
+  const size_t off = wb->bit_offset;  // keep size_t: no narrowing to int
+  const size_t p = off / CHAR_BIT;    // byte that holds the target bit
+  const int q = CHAR_BIT - 1 - (int)(off % CHAR_BIT);  // MSB-first position
+  wb->bit_buffer[p] &= (uint8_t)~(1u << q);            // clear the old bit
+  wb->bit_buffer[p] |= (uint8_t)((bit != 0) << q);     // any nonzero bit -> 1
+  wb->bit_offset = off + 1;
+}
+
+static void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb,
+                                 int data, int bits) {
+  int shift = bits;  // emit the low `bits` bits of data, highest one first
+  while (--shift >= 0)
+    vp9_wb_write_bit(wb, (data >> shift) & 1);
+}
+
+
+#endif  // VP9_BIT_WRITE_BUFFER_H_