Diffstat (limited to 'vp9/encoder')
-rw-r--r-- | vp9/encoder/vp9_bitstream.c | 154
-rw-r--r-- | vp9/encoder/vp9_block.h | 4
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 169
-rw-r--r-- | vp9/encoder/vp9_encodeintra.c | 11
-rw-r--r-- | vp9/encoder/vp9_encodeintra.h | 6
-rw-r--r-- | vp9/encoder/vp9_encodemb.c | 118
-rw-r--r-- | vp9/encoder/vp9_encodemv.c | 7
-rw-r--r-- | vp9/encoder/vp9_lookahead.c | 12
-rw-r--r-- | vp9/encoder/vp9_mcomp.c | 5
-rw-r--r-- | vp9/encoder/vp9_mcomp.h | 2
-rw-r--r-- | vp9/encoder/vp9_modecosts.c | 7
-rw-r--r-- | vp9/encoder/vp9_onyx_if.c | 51
-rw-r--r-- | vp9/encoder/vp9_onyx_int.h | 17
-rw-r--r-- | vp9/encoder/vp9_quantize.c | 41
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 610
-rw-r--r-- | vp9/encoder/vp9_sad_c.c | 62
-rw-r--r-- | vp9/encoder/vp9_temporal_filter.c | 13
-rw-r--r-- | vp9/encoder/vp9_tokenize.c | 14
-rw-r--r-- | vp9/encoder/vp9_tokenize.h | 4
-rw-r--r-- | vp9/encoder/vp9_variance.h | 2
-rw-r--r-- | vp9/encoder/x86/vp9_variance_sse2.c | 212
-rw-r--r-- | vp9/encoder/x86/vp9_variance_ssse3.c | 9
22 files changed, 825 insertions, 705 deletions
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index d8839cd14..5916bae2b 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -369,11 +369,6 @@ static void write_uv_mode(vp9_writer *bc, int m, const vp9_prob *p) { write_token(bc, vp9_uv_mode_tree, p, vp9_uv_mode_encodings + m); } - -static void write_bmode(vp9_writer *bc, int m, const vp9_prob *p) { - write_token(bc, vp9_bmode_tree, p, vp9_bmode_encodings + m); -} - static void write_kf_bmode(vp9_writer *bc, int m, const vp9_prob *p) { write_token(bc, vp9_kf_bmode_tree, p, vp9_kf_bmode_encodings + m); } @@ -411,7 +406,6 @@ static int prob_diff_update_savings_search(const unsigned int *ct, return bestsavings; } -#if CONFIG_MODELCOEFPROB static int prob_diff_update_savings_search_model(const unsigned int *ct, const vp9_prob *oldp, vp9_prob *bestp, @@ -420,7 +414,8 @@ static int prob_diff_update_savings_search_model(const unsigned int *ct, int i, old_b, new_b, update_b, savings, bestsavings, step; int newp; vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES]; - vp9_model_to_full_probs(oldp, b, r, oldplist); + vp9_model_to_full_probs(oldp, oldplist); + vpx_memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES); for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i) old_b += cost_branch256(ct + 2 * i, oldplist[i]); old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]); @@ -433,7 +428,7 @@ static int prob_diff_update_savings_search_model(const unsigned int *ct, for (; newp != oldp[PIVOT_NODE]; newp += step) { if (newp < 1 || newp > 255) continue; newplist[PIVOT_NODE] = newp; - vp9_get_model_distribution(newp, newplist, b, r); + vp9_model_to_full_probs(newplist, newplist); for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i) new_b += cost_branch256(ct + 2 * i, newplist[i]); new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]); @@ -448,7 +443,6 @@ static int prob_diff_update_savings_search_model(const unsigned int *ct, *bestp = bestnewp; return bestsavings; } -#endif static void vp9_cond_prob_update(vp9_writer *bc, vp9_prob *oldp, vp9_prob upd, unsigned int *ct) { @@ -479,25 +473,18 @@ static void pack_mb_tokens(vp9_writer* const bc, int v = a->value; int n = a->len; int ncount = n; -#if CONFIG_MODELCOEFPROB vp9_prob probs[ENTROPY_NODES]; -#endif if (t == EOSB_TOKEN) { ++p; break; } -#if CONFIG_MODELCOEFPROB if (t >= TWO_TOKEN) { - vp9_model_to_full_probs(p->context_tree, - p->block_type, p->ref_type, probs); + vp9_model_to_full_probs(p->context_tree, probs); pp = probs; } else { pp = p->context_tree; } -#else - pp = p->context_tree; -#endif assert(pp != 0); /* skip one or two nodes */ @@ -729,26 +716,17 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, active_section = 6; #endif -#if CONFIG_AB4X4 if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob); -#else - if (m->mbmi.sb_type > BLOCK_SIZE_SB8X8) - write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob); - else - write_ymode(bc, mode, pc->fc.ymode_prob); -#endif -#if CONFIG_AB4X4 if (m->mbmi.sb_type < BLOCK_SIZE_SB8X8) { -#else - if (mode == I4X4_PRED) { -#endif - int j = 0; - do { - write_bmode(bc, m->bmi[j].as_mode.first, - pc->fc.bmode_prob); - } while (++j < 4); + int idx, idy; + int bw = 1 << b_width_log2(mi->sb_type); + int bh = 1 << b_height_log2(mi->sb_type); + for (idy = 0; idy < 2; idy += bh) + for (idx = 0; idx < 2; idx += bw) + write_sb_ymode(bc, m->bmi[idy * 2 + idx].as_mode.first, + pc->fc.sb_ymode_prob); } 
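Note: the pack_inter_mode_mvs hunk above replaces the fixed four-iteration write_bmode loop with an idy/idx walk whose step sizes are the block's width and height in 4x4 units, so a 4x8 or 8x4 partition codes one mode per sub-block and a 4x4 partition codes four. A minimal standalone sketch of that iteration pattern; the function and names below are illustrative only, not part of libvpx:

#include <stdio.h>

/* bw and bh are the block's width and height measured in 4-pixel units
 * (each is 1 or 2 inside an 8x8 area). */
static void visit_sub_blocks(int bw, int bh) {
  int idx, idy;
  for (idy = 0; idy < 2; idy += bh)
    for (idx = 0; idx < 2; idx += bw)
      printf("code one mode for sub-block %d\n", idy * 2 + idx);
}

int main(void) {
  visit_sub_blocks(1, 1);  /* 4x4 partition: four modes             */
  visit_sub_blocks(2, 1);  /* 8x4 partition: two modes (top/bottom) */
  visit_sub_blocks(1, 2);  /* 4x8 partition: two modes (left/right) */
  return 0;
}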
write_uv_mode(bc, mi->uv_mode, pc->fc.uv_mode_prob[mode]); @@ -763,16 +741,8 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, // If segment skip is not enabled code the mode. if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { -#if CONFIG_AB4X4 if (mi->sb_type >= BLOCK_SIZE_SB8X8) write_sb_mv_ref(bc, mode, mv_ref_p); -#else - if (mi->sb_type > BLOCK_SIZE_SB8X8) { - write_sb_mv_ref(bc, mode, mv_ref_p); - } else { - write_mv_ref(bc, mode, mv_ref_p); - } -#endif vp9_accum_mv_refs(&cpi->common, mode, mi->mb_mode_context[rf]); } @@ -819,9 +789,6 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, int bwl = b_width_log2(mi->sb_type), bw = 1 << bwl; int bhl = b_height_log2(mi->sb_type), bh = 1 << bhl; int idx, idy; -#if !CONFIG_AB4X4 - bw = 1, bh = 1; -#endif for (idy = 0; idy < 2; idy += bh) { for (idx = 0; idx < 2; idx += bw) { j = idy * 2 + idx; @@ -861,20 +828,9 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, } } -#if CONFIG_AB4X4 - if (((rf == INTRA_FRAME && mi->sb_type >= BLOCK_SIZE_SB8X8) || - (rf != INTRA_FRAME && mi->sb_type >= BLOCK_SIZE_SB8X8)) && - pc->txfm_mode == TX_MODE_SELECT && - !(skip_coeff || vp9_segfeature_active(xd, segment_id, - SEG_LVL_SKIP))) -#else - if (((rf == INTRA_FRAME && mode != I4X4_PRED) || - (rf != INTRA_FRAME && mode != SPLITMV)) && - pc->txfm_mode == TX_MODE_SELECT && - !(skip_coeff || vp9_segfeature_active(xd, segment_id, - SEG_LVL_SKIP))) -#endif - { + if (mi->sb_type >= BLOCK_SIZE_SB8X8 && pc->txfm_mode == TX_MODE_SELECT && + !(rf != INTRA_FRAME && + (skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) { TX_SIZE sz = mi->txfm_size; // FIXME(rbultje) code ternary symbol once all experiments are merged vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]); @@ -891,8 +847,8 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, vp9_writer *bc, int mi_row, int mi_col) { const VP9_COMMON *const c = &cpi->common; const MACROBLOCKD *const xd = &cpi->mb.e_mbd; - const int mis = c->mode_info_stride; const int ym = m->mbmi.mode; + const int mis = c->mode_info_stride; const int segment_id = m->mbmi.segment_id; int skip_coeff; @@ -906,45 +862,32 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, vp9_write(bc, skip_coeff, vp9_get_pred_prob(c, xd, PRED_MBSKIP)); } -#if CONFIG_AB4X4 - if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) - sb_kfwrite_ymode(bc, ym, c->sb_kf_ymode_prob[c->kf_ymode_probs_index]); -#else - if (m->mbmi.sb_type > BLOCK_SIZE_SB8X8) - sb_kfwrite_ymode(bc, ym, c->sb_kf_ymode_prob[c->kf_ymode_probs_index]); - else - kfwrite_ymode(bc, ym, c->kf_ymode_prob[c->kf_ymode_probs_index]); -#endif + if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { + const B_PREDICTION_MODE A = above_block_mode(m, 0, mis); + const B_PREDICTION_MODE L = xd->left_available ? + left_block_mode(m, 0) : DC_PRED; + write_kf_bmode(bc, ym, c->kf_bmode_prob[A][L]); + } -#if CONFIG_AB4X4 if (m->mbmi.sb_type < BLOCK_SIZE_SB8X8) { -#else - if (ym == I4X4_PRED) { -#endif - int i = 0; - do { - const B_PREDICTION_MODE a = above_block_mode(m, i, mis); - const B_PREDICTION_MODE l = (xd->left_available || - (i & 1)) ? 
- left_block_mode(m, i) : B_DC_PRED; - const int bm = m->bmi[i].as_mode.first; - -/*#ifdef ENTROPY_STATS - ++intra_mode_stats [A] [L] [bm]; -#endif*/ - write_kf_bmode(bc, bm, c->kf_bmode_prob[a][l]); - } while (++i < 4); + int idx, idy; + int bw = 1 << b_width_log2(m->mbmi.sb_type); + int bh = 1 << b_height_log2(m->mbmi.sb_type); + for (idy = 0; idy < 2; idy += bh) { + for (idx = 0; idx < 2; idx += bw) { + int i = idy * 2 + idx; + const B_PREDICTION_MODE A = above_block_mode(m, i, mis); + const B_PREDICTION_MODE L = (xd->left_available || idx) ? + left_block_mode(m, i) : DC_PRED; + write_kf_bmode(bc, m->bmi[i].as_mode.first, + c->kf_bmode_prob[A][L]); + } + } } write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]); -#if CONFIG_AB4X4 - if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->txfm_mode == TX_MODE_SELECT && - !(skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))) { -#else - if (ym != I4X4_PRED && c->txfm_mode == TX_MODE_SELECT && - !(skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))) { -#endif + if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->txfm_mode == TX_MODE_SELECT) { TX_SIZE sz = m->mbmi.txfm_size; // FIXME(rbultje) code ternary symbol once all experiments are merged vp9_write(bc, sz != TX_4X4, c->prob_tx[0]); @@ -962,11 +905,9 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; -#if CONFIG_AB4X4 if (m->mbmi.sb_type < BLOCK_SIZE_SB8X8) if (xd->ab_index > 0) return; -#endif xd->mode_info_context = m; set_mi_row_col(&cpi->common, xd, mi_row, 1 << mi_height_log2(m->mbmi.sb_type), @@ -1019,17 +960,11 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, else assert(0); -#if CONFIG_AB4X4 if (bsize < BLOCK_SIZE_SB8X8) if (xd->ab_index > 0) return; -#endif -#if CONFIG_AB4X4 if (bsize >= BLOCK_SIZE_SB8X8) { -#else - if (bsize > BLOCK_SIZE_SB8X8) { -#endif int pl; xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK); xd->above_seg_context = cm->above_seg_context + mi_col; @@ -1071,13 +1006,8 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, } // update partition context -#if CONFIG_AB4X4 if (bsize >= BLOCK_SIZE_SB8X8 && (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) { -#else - if (bsize > BLOCK_SIZE_SB8X8 && - (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) { -#endif set_partition_seg_context(cm, xd, mi_row, mi_col); update_partition_context(xd, subsize, bsize); } @@ -1101,7 +1031,7 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc, vpx_memset(c->left_seg_context, 0, sizeof(c->left_seg_context)); for (mi_col = c->cur_tile_mi_col_start; mi_col < c->cur_tile_mi_col_end; - mi_col += 8, m += 8) + mi_col += 64 / MI_SIZE, m += 64 / MI_SIZE) write_modes_sb(cpi, m, bc, tok, tok_end, mi_row, mi_col, BLOCK_SIZE_SB64X64); } @@ -1219,22 +1149,14 @@ static void update_coef_probs_common( vp9_coeff_stats *tree_update_hist, #endif vp9_coeff_probs *new_frame_coef_probs, -#if CONFIG_MODELCOEFPROB vp9_coeff_probs_model *old_frame_coef_probs, -#else - vp9_coeff_probs *old_frame_coef_probs, -#endif vp9_coeff_stats *frame_branch_ct, TX_SIZE tx_size) { int i, j, k, l, t; int update[2] = {0, 0}; int savings; -#if CONFIG_MODELCOEFPROB const int entropy_nodes_update = UNCONSTRAINED_NODES; -#else - const int entropy_nodes_update = ENTROPY_NODES; -#endif // vp9_prob bestupd = find_coef_update_prob(cpi); const int tstart = 0; @@ -1254,13 +1176,11 @@ static void update_coef_probs_common( 
if (l >= 3 && k == 0) continue; -#if CONFIG_MODELCOEFPROB if (t == PIVOT_NODE) s = prob_diff_update_savings_search_model( frame_branch_ct[i][j][k][l][0], old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); else -#endif s = prob_diff_update_savings_search( frame_branch_ct[i][j][k][l][t], oldp, &newp, upd); if (s > 0 && newp != oldp) @@ -1298,13 +1218,11 @@ static void update_coef_probs_common( if (l >= 3 && k == 0) continue; -#if CONFIG_MODELCOEFPROB if (t == PIVOT_NODE) s = prob_diff_update_savings_search_model( frame_branch_ct[i][j][k][l][0], old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); else -#endif s = prob_diff_update_savings_search( frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd); diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index d3851b428..84e1a1fdb 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -140,11 +140,9 @@ struct macroblock { // TODO(jingning): Need to refactor the structure arrays that buffers the // coding mode decisions of each partition type. -#if CONFIG_AB4X4 PICK_MODE_CONTEXT ab4x4_context[4][4][4]; PICK_MODE_CONTEXT sb8x4_context[4][4][4]; PICK_MODE_CONTEXT sb4x8_context[4][4][4]; -#endif PICK_MODE_CONTEXT sb8x8_context[4][4][4]; PICK_MODE_CONTEXT sb8x16_context[4][4][2]; PICK_MODE_CONTEXT sb16x8_context[4][4][2]; @@ -158,9 +156,7 @@ struct macroblock { PICK_MODE_CONTEXT sb64_context; int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; -#if CONFIG_AB4X4 BLOCK_SIZE_TYPE b_partitioning[4][4][4]; -#endif BLOCK_SIZE_TYPE mb_partitioning[4][4]; BLOCK_SIZE_TYPE sb_partitioning[4]; BLOCK_SIZE_TYPE sb64_partitioning; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index f3a03f3c8..6c129ebbf 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -503,21 +503,19 @@ static unsigned find_seg_id(uint8_t *buf, BLOCK_SIZE_TYPE bsize, void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, int mb_row, int mb_col) { - setup_pred_plane(&x->plane[0].src, - src->y_buffer, src->y_stride, - mb_row, mb_col, NULL, - x->e_mbd.plane[0].subsampling_x, - x->e_mbd.plane[0].subsampling_y); - setup_pred_plane(&x->plane[1].src, - src->u_buffer, src->uv_stride, - mb_row, mb_col, NULL, - x->e_mbd.plane[1].subsampling_x, - x->e_mbd.plane[1].subsampling_y); - setup_pred_plane(&x->plane[2].src, - src->v_buffer, src->uv_stride, - mb_row, mb_col, NULL, - x->e_mbd.plane[2].subsampling_x, - x->e_mbd.plane[2].subsampling_y); + uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, + src->alpha_buffer}; + int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, + src->alpha_stride}; + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) { + setup_pred_plane(&x->plane[i].src, + buffers[i], strides[i], + mb_row, mb_col, NULL, + x->e_mbd.plane[i].subsampling_x, + x->e_mbd.plane[i].subsampling_y); + } } static void set_offsets(VP9_COMP *cpi, @@ -621,11 +619,9 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; -#if CONFIG_AB4X4 if (bsize < BLOCK_SIZE_SB8X8) if (xd->ab_index != 0) return; -#endif set_offsets(cpi, mi_row, mi_col, bsize); xd->mode_info_context->mbmi.sb_type = bsize; @@ -710,14 +706,12 @@ static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index]; case BLOCK_SIZE_SB8X8: return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index]; -#if CONFIG_AB4X4 case BLOCK_SIZE_SB8X4: return 
&x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index]; case BLOCK_SIZE_SB4X8: return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index]; case BLOCK_SIZE_AB4X4: return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index]; -#endif default: assert(0); return NULL; @@ -734,10 +728,8 @@ static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x, return &x->sb_partitioning[xd->sb_index]; case BLOCK_SIZE_MB16X16: return &x->mb_partitioning[xd->sb_index][xd->mb_index]; -#if CONFIG_AB4X4 case BLOCK_SIZE_SB8X8: return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index]; -#endif default: assert(0); return NULL; @@ -787,11 +779,9 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, if (sub_index != -1) *(get_sb_index(xd, bsize)) = sub_index; -#if CONFIG_AB4X4 if (bsize < BLOCK_SIZE_SB8X8) if (xd->ab_index > 0) return; -#endif set_offsets(cpi, mi_row, mi_col, bsize); update_state(cpi, get_block_context(x, bsize), bsize, output_enabled); encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize); @@ -818,13 +808,8 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; -#if CONFIG_AB4X4 c1 = BLOCK_SIZE_AB4X4; - if (bsize >= BLOCK_SIZE_SB8X8) -#else - if (bsize > BLOCK_SIZE_SB8X8) -#endif - { + if (bsize >= BLOCK_SIZE_SB8X8) { set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); c1 = *(get_sb_partitioning(x, bsize)); @@ -833,13 +818,8 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, bwl = b_width_log2(c1), bhl = b_height_log2(c1); if (bsl == bwl && bsl == bhl) { -#if CONFIG_AB4X4 if (output_enabled && bsize >= BLOCK_SIZE_SB8X8) cpi->partition_count[pl][PARTITION_NONE]++; -#else - if (output_enabled && bsize > BLOCK_SIZE_SB8X8) - cpi->partition_count[pl][PARTITION_NONE]++; -#endif encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1); } else if (bsl == bhl && bsl > bwl) { if (output_enabled) @@ -870,13 +850,8 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, } } -#if CONFIG_AB4X4 if (bsize >= BLOCK_SIZE_SB8X8 && (bsize == BLOCK_SIZE_SB8X8 || bsl == bwl || bsl == bhl)) { -#else - if (bsize > BLOCK_SIZE_SB8X8 && - (bsize == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl)) { -#endif set_partition_seg_context(cm, xd, mi_row, mi_col); update_partition_context(xd, c1, bsize); } @@ -902,14 +877,12 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, BLOCK_SIZE_TYPE subsize; int srate = INT_MAX, sdist = INT_MAX; -#if CONFIG_AB4X4 if (bsize < BLOCK_SIZE_SB8X8) if (xd->ab_index != 0) { *rate = 0; *dist = 0; return; } -#endif assert(mi_height_log2(bsize) == mi_width_log2(bsize)); // buffer the above/left context information of the block in search. 
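Note: the rd_pick_partition hunks that follow drop the CONFIG_AB4X4 guards so the recursive search always descends to 8x8 before handing sub-8x8 decisions to pick_sb_modes, keeping whichever partition choice gives the lowest rate-distortion cost. A self-contained sketch of that quad-tree idea, restricted to NONE and SPLIT and with a stubbed block cost; everything below is illustrative, and evaluate_block merely stands in for pick_sb_modes:

#include <stdio.h>

typedef struct { long rate; long dist; } rd_stats;

/* Stand-in for pick_sb_modes(): flat rate, distortion growing with block
 * area, just so the example runs. Real costs come from coding the block. */
static void evaluate_block(int row, int col, int size_log2, rd_stats *out) {
  (void)row; (void)col;
  out->rate = 10;
  out->dist = 1L << (2 * size_log2);
}

static long rd_cost(const rd_stats *s, long lambda) {
  return s->rate * lambda + s->dist;
}

/* size_log2 is log2 of the block width in pixels (6 = 64x64, 3 = 8x8).
 * HORZ/VERT partitions are omitted to keep the sketch short. */
static long search_partition(int row, int col, int size_log2, long lambda) {
  rd_stats none;
  long best, split;
  int i;

  evaluate_block(row, col, size_log2, &none);
  best = rd_cost(&none, lambda);          /* PARTITION_NONE */

  if (size_log2 > 3) {                    /* recurse down to 8x8 */
    const int half = 1 << (size_log2 - 1);
    split = 0;
    for (i = 0; i < 4; i++)               /* PARTITION_SPLIT: four quarters */
      split += search_partition(row + (i >> 1) * half,
                                col + (i & 1) * half,
                                size_log2 - 1, lambda);
    if (split < best)
      best = split;
  }
  return best;
}

int main(void) {
  printf("best rd for a 64x64 block: %ld\n", search_partition(0, 0, 6, 100));
  return 0;
}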
@@ -927,11 +900,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, sizeof(PARTITION_CONTEXT) * ms); // PARTITION_SPLIT -#if CONFIG_AB4X4 if (bsize >= BLOCK_SIZE_SB8X8) { -#else - if (bsize >= BLOCK_SIZE_MB16X16) { -#endif int r4 = 0, d4 = 0; subsize = get_subsize(bsize, PARTITION_SPLIT); *(get_sb_partitioning(x, bsize)) = subsize; @@ -953,12 +922,8 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, } set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); -#if CONFIG_AB4X4 if (r4 < INT_MAX) r4 += x->partition_cost[pl][PARTITION_SPLIT]; -#else - r4 += x->partition_cost[pl][PARTITION_SPLIT]; -#endif assert(r4 >= 0); assert(d4 >= 0); srate = r4; @@ -968,11 +933,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, // PARTITION_HORZ if ((mi_col + ms <= cm->mi_cols) && (mi_row + (ms >> 1) <= cm->mi_rows) && -#if CONFIG_AB4X4 (bsize >= BLOCK_SIZE_SB8X8)) { -#else - (bsize >= BLOCK_SIZE_MB16X16)) { -#endif int r2, d2; int mb_skip = 0; subsize = get_subsize(bsize, PARTITION_HORZ); @@ -995,12 +956,8 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, } set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); -#if CONFIG_AB4X4 if (r2 < INT_MAX) r2 += x->partition_cost[pl][PARTITION_HORZ]; -#else - r2 += x->partition_cost[pl][PARTITION_HORZ]; -#endif if ((RDCOST(x->rdmult, x->rddiv, r2, d2) < RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) { srate = r2; @@ -1012,11 +969,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, // PARTITION_VERT if ((mi_row + ms <= cm->mi_rows) && (mi_col + (ms >> 1) <= cm->mi_cols) && -#if CONFIG_AB4X4 (bsize >= BLOCK_SIZE_SB8X8)) { -#else - (bsize >= BLOCK_SIZE_MB16X16)) { -#endif int r2, d2; int mb_skip = 0; subsize = get_subsize(bsize, PARTITION_VERT); @@ -1038,12 +991,8 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, } set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); -#if CONFIG_AB4X4 if (r2 < INT_MAX) r2 += x->partition_cost[pl][PARTITION_VERT]; -#else - r2 += x->partition_cost[pl][PARTITION_VERT]; -#endif if ((RDCOST(x->rdmult, x->rddiv, r2, d2) < RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) { srate = r2; @@ -1058,11 +1007,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int r, d; pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize, get_block_context(x, bsize)); -#if CONFIG_AB4X4 if (bsize >= BLOCK_SIZE_SB8X8) { -#else - if (bsize >= BLOCK_SIZE_MB16X16) { -#endif set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); r += x->partition_cost[pl][PARTITION_NONE]; @@ -1072,11 +1017,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, RDCOST(x->rdmult, x->rddiv, srate, sdist)) { srate = r; sdist = d; -#if CONFIG_AB4X4 if (bsize >= BLOCK_SIZE_SB8X8) -#else - if (bsize >= BLOCK_SIZE_MB16X16) -#endif *(get_sb_partitioning(x, bsize)) = bsize; } } @@ -1260,6 +1201,8 @@ static void encode_frame_internal(VP9_COMP *cpi) { vpx_memset(cpi->txfm_count_16x16p, 0, sizeof(cpi->txfm_count_16x16p)); vpx_memset(cpi->txfm_count_8x8p, 0, sizeof(cpi->txfm_count_8x8p)); vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff)); + vpx_memset(cpi->rd_tx_select_threshes, 0, sizeof(cpi->rd_tx_select_threshes)); + { struct vpx_usec_timer emr_timer; vpx_usec_timer_start(&emr_timer); @@ -1514,7 +1457,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { } else txfm_type = ALLOW_8X8; #else - txfm_type = 
cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] >= + txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] > cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ? ALLOW_32X32 : TX_MODE_SELECT; #endif @@ -1603,22 +1546,22 @@ static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) { const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode; const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode; -#if CONFIG_AB4X4 if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { -#else - if (xd->mode_info_context->mbmi.sb_type > BLOCK_SIZE_SB8X8) { -#endif ++cpi->sb_ymode_count[m]; } else { ++cpi->ymode_count[m]; } ++cpi->y_uv_mode_count[m][uvm]; if (m == I4X4_PRED) { - int b = 0; - do { - int m = xd->mode_info_context->bmi[b].as_mode.first; - ++cpi->bmode_count[m]; - } while (++b < 4); + int idx, idy; + int bw = 1 << b_width_log2(xd->mode_info_context->mbmi.sb_type); + int bh = 1 << b_height_log2(xd->mode_info_context->mbmi.sb_type); + for (idy = 0; idy < 2; idy += bh) { + for (idx = 0; idx < 2; idx += bw) { + int m = xd->mode_info_context->bmi[idy * 2 + idx].as_mode.first; + ++cpi->sb_ymode_count[m]; + } + } } } @@ -1693,22 +1636,11 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, vp9_update_zbin_extra(cpi, x); } -#if CONFIG_AB4X4 - if (mbmi->ref_frame == INTRA_FRAME && - bsize < BLOCK_SIZE_SB8X8) { -#else - if (mbmi->mode == I4X4_PRED) { - assert(bsize == BLOCK_SIZE_SB8X8 && mbmi->txfm_size == TX_4X4); -#endif - vp9_encode_intra4x4mby(x, BLOCK_SIZE_SB8X8); - vp9_build_intra_predictors_sbuv_s(xd, BLOCK_SIZE_SB8X8); - vp9_encode_sbuv(cm, x, BLOCK_SIZE_SB8X8); - - if (output_enabled) - sum_intra_stats(cpi, x); - } else if (mbmi->ref_frame == INTRA_FRAME) { - vp9_build_intra_predictors_sby_s(xd, bsize); - vp9_build_intra_predictors_sbuv_s(xd, bsize); + if (mbmi->ref_frame == INTRA_FRAME) { + vp9_encode_intra_block_y(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? + BLOCK_SIZE_SB8X8 : bsize); + vp9_encode_intra_block_uv(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? + BLOCK_SIZE_SB8X8 : bsize); if (output_enabled) sum_intra_stats(cpi, x); } else { @@ -1730,14 +1662,9 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, : bsize); } -#if CONFIG_AB4X4 - if (mbmi->ref_frame == INTRA_FRAME && - bsize < BLOCK_SIZE_SB8X8) { -#else - if (mbmi->mode == I4X4_PRED) { - assert(bsize == BLOCK_SIZE_SB8X8); -#endif - vp9_tokenize_sb(cpi, xd, t, !output_enabled, BLOCK_SIZE_SB8X8); + if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) { + vp9_tokenize_sb(cpi, xd, t, !output_enabled, + (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); } else if (!x->skip) { vp9_encode_sb(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); vp9_tokenize_sb(cpi, xd, t, !output_enabled, @@ -1764,8 +1691,9 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, if (output_enabled) { if (cm->txfm_mode == TX_MODE_SELECT && - !(mbmi->mb_skip_coeff || - vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))) { + mbmi->sb_type >= BLOCK_SIZE_SB8X8 && + !(mbmi->ref_frame != INTRA_FRAME && (mbmi->mb_skip_coeff || + vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) { if (bsize >= BLOCK_SIZE_SB32X32) { cpi->txfm_count_32x32p[mbmi->txfm_size]++; } else if (bsize >= BLOCK_SIZE_MB16X16) { @@ -1776,18 +1704,19 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, } else { int x, y; TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? 
TX_32X32 : cm->txfm_mode; - - if (sz == TX_32X32 && bsize < BLOCK_SIZE_SB32X32) - sz = TX_16X16; - if (sz == TX_16X16 && bsize < BLOCK_SIZE_MB16X16) - sz = TX_8X8; -#if CONFIG_AB4X4 - if (sz == TX_8X8 && bsize < BLOCK_SIZE_SB8X8) -#else - if (sz == TX_8X8 && (mbmi->mode == SPLITMV || - mbmi->mode == I4X4_PRED)) -#endif + // The new intra coding scheme requires no change of transform size + if (mi->mbmi.ref_frame != INTRA_FRAME) { + if (sz == TX_32X32 && bsize < BLOCK_SIZE_SB32X32) + sz = TX_16X16; + if (sz == TX_16X16 && bsize < BLOCK_SIZE_MB16X16) + sz = TX_8X8; + if (sz == TX_8X8 && bsize < BLOCK_SIZE_SB8X8) + sz = TX_4X4; + } else if (bsize >= BLOCK_SIZE_SB8X8) { + sz = mbmi->txfm_size; + } else { sz = TX_4X4; + } for (y = 0; y < bh; y++) { for (x = 0; x < bw; x++) { diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c index f8cf50f84..91866b28f 100644 --- a/vp9/encoder/vp9_encodeintra.c +++ b/vp9/encoder/vp9_encodeintra.c @@ -80,15 +80,6 @@ static void encode_intra4x4block(MACROBLOCK *x, int ib, } } -void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bsize) { - int i; - int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); - int bc = 1 << (bwl + bhl); - - for (i = 0; i < bc; i++) - encode_intra4x4block(mb, i, bsize); -} - void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x) { MACROBLOCKD *xd = &x->e_mbd; @@ -102,3 +93,5 @@ void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x) { vp9_build_intra_predictors_sbuv_s(xd, BLOCK_SIZE_MB16X16); vp9_encode_sbuv(cm, x, BLOCK_SIZE_MB16X16); } + + diff --git a/vp9/encoder/vp9_encodeintra.h b/vp9/encoder/vp9_encodeintra.h index c26200494..7da164c6a 100644 --- a/vp9/encoder/vp9_encodeintra.h +++ b/vp9/encoder/vp9_encodeintra.h @@ -16,5 +16,9 @@ int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred); void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x); void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x); -void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bs); +void vp9_encode_intra_block_y(VP9_COMMON *const cm, MACROBLOCK *mb, + BLOCK_SIZE_TYPE bs); +void vp9_encode_intra_block_uv(VP9_COMMON *const cm, MACROBLOCK *mb, + BLOCK_SIZE_TYPE bs); + #endif // VP9_ENCODER_VP9_ENCODEINTRA_H_ diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 84b350792..3f2061c64 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -44,7 +44,6 @@ static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) { const uint8_t *src = x->plane[plane].src.buf; const int src_stride = x->plane[plane].src.stride; - assert(plane < 3); vp9_subtract_block(bh, bw, x->plane[plane].src_diff, bw, src, src_stride, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride); @@ -168,7 +167,7 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, break; } case TX_32X32: - scan = vp9_default_zig_zag1d_32x32; + scan = vp9_default_scan_32x32; default_eob = 1024; band_translate = vp9_coefband_trans_8x8plus; break; @@ -605,3 +604,118 @@ void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x, foreach_transformed_block(xd, bsize, encode_block, &arg); } + +static void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg) { + struct encode_b_args* const args = arg; + MACROBLOCK* const x = args->x; + MACROBLOCKD* const xd = &x->e_mbd; + const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2); + const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x); + const int 
raster_block = txfrm_block_to_raster_block(xd, bsize, plane, + block, ss_txfrm_size); + uint8_t* const src = + raster_block_offset_uint8(xd, bsize, plane, raster_block, + x->plane[plane].src.buf, + x->plane[plane].src.stride); + uint8_t* const dst = + raster_block_offset_uint8(xd, bsize, plane, raster_block, + xd->plane[plane].dst.buf, + xd->plane[plane].dst.stride); + int16_t* const src_diff = + raster_block_offset_int16(xd, bsize, plane, + raster_block, x->plane[plane].src_diff); + + const int txfm_b_size = 4 << tx_size; + int ib = raster_block; + int tx_ib = ib >> tx_size; + int plane_b_size; + + TX_TYPE tx_type; + int mode, b_mode; + + mode = plane == 0? xd->mode_info_context->mbmi.mode: + xd->mode_info_context->mbmi.uv_mode; + if (bsize <= BLOCK_SIZE_SB8X8 && mode == I4X4_PRED && plane == 0) + b_mode = xd->mode_info_context->bmi[ib].as_mode.first; + else + b_mode = mode; + + assert(b_mode >= B_DC_PRED && b_mode <= B_TM_PRED); + + plane_b_size = b_width_log2(bsize) - xd->plane[plane].subsampling_x; + vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode, + dst, xd->plane[plane].dst.stride); + vp9_subtract_block(txfm_b_size, txfm_b_size, + src_diff, bw, + src, x->plane[plane].src.stride, + dst, xd->plane[plane].dst.stride); + + xform_quant(plane, block, bsize, ss_txfrm_size, arg); + + /* + if (x->optimize) + vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, x, args->ctx); + */ + switch (ss_txfrm_size / 2) { + case TX_32X32: + vp9_short_idct32x32_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, + block, 16), dst, xd->plane[plane].dst.stride); + break; + case TX_16X16: + tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT; + if (tx_type == DCT_DCT) { + vp9_short_idct16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, + block, 16), dst, xd->plane[plane].dst.stride); + } else { + vp9_short_iht16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, + block, 16), dst, xd->plane[plane].dst.stride, + tx_type); + } + break; + case TX_8X8: + tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT; + if (tx_type == DCT_DCT) { + vp9_short_idct8x8_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, + block, 16), dst, xd->plane[plane].dst.stride); + } else { + vp9_short_iht8x8_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, + block, 16), dst, xd->plane[plane].dst.stride, + tx_type); + } + break; + case TX_4X4: + tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT; + if (tx_type == DCT_DCT) { + // this is like vp9_short_idct4x4 but has a special case around eob<=1 + // which is significant (not just an optimization) for the lossless + // case. 
+ vp9_inverse_transform_b_4x4_add(xd, xd->plane[plane].eobs[block], + BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), dst, + xd->plane[plane].dst.stride); + } else { + vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), + dst, xd->plane[plane].dst.stride, tx_type); + } + break; + } +} + +void vp9_encode_intra_block_y(VP9_COMMON *const cm, MACROBLOCK *x, + BLOCK_SIZE_TYPE bsize) { + MACROBLOCKD* const xd = &x->e_mbd; + struct optimize_ctx ctx; + struct encode_b_args arg = {cm, x, &ctx}; + + foreach_transformed_block_in_plane(xd, bsize, 0, + encode_block_intra, &arg); +} +void vp9_encode_intra_block_uv(VP9_COMMON *const cm, MACROBLOCK *x, + BLOCK_SIZE_TYPE bsize) { + MACROBLOCKD* const xd = &x->e_mbd; + struct optimize_ctx ctx; + struct encode_b_args arg = {cm, x, &ctx}; + + foreach_transformed_block_uv(xd, bsize, encode_block_intra, &arg); +} + diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c index e2cd8838c..1bb7fa88d 100644 --- a/vp9/encoder/vp9_encodemv.c +++ b/vp9/encoder/vp9_encodemv.c @@ -573,16 +573,9 @@ void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x, int bhl = b_height_log2(mbmi->sb_type), bh = 1 << bhl; int idx, idy; -#if CONFIG_AB4X4 if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { -#else - if (mbmi->mode == SPLITMV) { -#endif int i; PARTITION_INFO *pi = x->partition_info; -#if !CONFIG_AB4X4 - bw = 1, bh = 1; -#endif for (idy = 0; idy < 2; idy += bh) { for (idx = 0; idx < 2; idx += bw) { i = idy * 2 + idx; diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c index 708fe4549..b07d92a44 100644 --- a/vp9/encoder/vp9_lookahead.c +++ b/vp9/encoder/vp9_lookahead.c @@ -84,20 +84,27 @@ bail: return NULL; } +#define USE_PARTIAL_COPY 0 int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, int64_t ts_start, int64_t ts_end, unsigned int flags, unsigned char *active_map) { struct lookahead_entry *buf; +#if USE_PARTIAL_COPY int row, col, active_end; int mb_rows = (src->y_height + 15) >> 4; int mb_cols = (src->y_width + 15) >> 4; +#endif if (ctx->sz + 1 > ctx->max_sz) return 1; ctx->sz++; buf = pop(ctx, &ctx->write_idx); +#if USE_PARTIAL_COPY + // TODO(jkoleszar): This is disabled for now, as + // vp9_copy_and_extend_frame_with_rect is not subsampling/alpha aware. + // Only do this partial copy if the following conditions are all met: // 1. Lookahead queue has has size of 1. // 2. Active map is provided. @@ -140,6 +147,11 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, } else { vp9_copy_and_extend_frame(src, &buf->img); } +#else + // Partial copy not implemented yet + vp9_copy_and_extend_frame(src, &buf->img); +#endif + buf->ts_start = ts_start; buf->ts_end = ts_end; buf->flags = flags; diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 75e6e6757..2e99736ce 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -414,7 +414,6 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, return besterr; } -#if CONFIG_COMP_INTER_JOINT_SEARCH #undef DIST /* returns subpixel variance error function */ #define DIST(r, c) \ @@ -606,7 +605,7 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, return besterr; } -#endif // CONFIG_COMP_INTER_JOINT_SEARCH + #undef MVC #undef PRE @@ -2327,7 +2326,6 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, return INT_MAX; } -#if CONFIG_COMP_INTER_JOINT_SEARCH /* This function is called when we do joint motion search in comp_inter_inter * mode. 
*/ @@ -2429,4 +2427,3 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, return INT_MAX; } } -#endif // CONFIG_COMP_INTER_JOINT_SEARCH diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 33e688b97..28b2efd28 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -75,7 +75,6 @@ typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x, int *mvjcost, int *mvcost[2], int_mv *center_mv); -#if CONFIG_COMP_INTER_JOINT_SEARCH int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, int_mv *bestmv, int_mv *ref_mv, int error_per_bit, @@ -91,5 +90,4 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, int *mvjcost, int *mvcost[2], int_mv *center_mv, const uint8_t *second_pred, int w, int h); -#endif // CONFIG_COMP_INTER_JOINT_SEARCH #endif // VP9_ENCODER_VP9_MCOMP_H_ diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c index e26daf0c9..e58ff40d9 100644 --- a/vp9/encoder/vp9_modecosts.c +++ b/vp9/encoder/vp9_modecosts.c @@ -33,10 +33,11 @@ void vp9_init_mode_costs(VP9_COMP *c) { x->fc.sub_mv_ref_prob[0], vp9_sub_mv_ref_tree); // TODO(rbultje) separate tables for superblock costing? - vp9_cost_tokens(c->mb.mbmode_cost[1], x->fc.ymode_prob, vp9_ymode_tree); + vp9_cost_tokens(c->mb.mbmode_cost[1], x->fc.sb_ymode_prob, + vp9_sb_ymode_tree); vp9_cost_tokens(c->mb.mbmode_cost[0], - x->kf_ymode_prob[c->common.kf_ymode_probs_index], - vp9_kf_ymode_tree); + x->sb_kf_ymode_prob[c->common.kf_ymode_probs_index], + vp9_sb_ymode_tree); vp9_cost_tokens(c->mb.intra_uv_mode_cost[1], x->fc.uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree); vp9_cost_tokens(c->mb.intra_uv_mode_cost[0], diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index e5c4761cc..27eeb5c82 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -775,6 +775,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->optimize_coefficients = !cpi->oxcf.lossless; sf->first_step = 0; sf->max_step_search_steps = MAX_MVSEARCH_STEPS; + sf->comp_inter_joint_serach = 1; #if CONFIG_MULTIPLE_ARF // Switch segmentation off. 
sf->static_segmentation = 0; @@ -785,7 +786,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->static_segmentation = 0; #endif #endif - sf->splitmode_breakout = 0; sf->mb16_breakout = 0; switch (mode) { @@ -804,13 +804,13 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->static_segmentation = 0; #endif #endif - sf->splitmode_breakout = 1; sf->mb16_breakout = 0; if (speed > 0) { /* Disable coefficient optimization above speed 0 */ sf->optimize_coefficients = 0; sf->no_skip_block4x4_search = 0; + sf->comp_inter_joint_serach = 0; sf->first_step = 1; @@ -1636,12 +1636,12 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { BFP(BLOCK_8X4, vp9_sad8x4, vp9_variance8x4, vp9_sub_pixel_variance8x4, vp9_sub_pixel_avg_variance8x4, NULL, NULL, - NULL, NULL, NULL, + NULL, NULL, vp9_sad8x4x8, vp9_sad8x4x4d) BFP(BLOCK_4X8, vp9_sad4x8, vp9_variance4x8, vp9_sub_pixel_variance4x8, vp9_sub_pixel_avg_variance4x8, NULL, NULL, - NULL, NULL, NULL, + NULL, NULL, vp9_sad4x8x8, vp9_sad4x8x4d) BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4, @@ -2083,6 +2083,18 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { fwrite(src, s->uv_width, 1, yuv_rec_file); src += s->uv_stride; } while (--h); + +#if CONFIG_ALPHA + if (s->alpha_buffer) { + src = s->alpha_buffer; + h = s->alpha_height; + do { + fwrite(src, s->alpha_width, 1, yuv_rec_file); + src += s->alpha_stride; + } while (--h); + } +#endif + fflush(yuv_rec_file); } #endif @@ -2095,11 +2107,15 @@ static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb, const int out_h = dst_fb->y_crop_height; int x, y, i; - uint8_t *srcs[3] = {src_fb->y_buffer, src_fb->u_buffer, src_fb->v_buffer}; - int src_strides[3] = {src_fb->y_stride, src_fb->uv_stride, src_fb->uv_stride}; + uint8_t *srcs[4] = {src_fb->y_buffer, src_fb->u_buffer, src_fb->v_buffer, + src_fb->alpha_buffer}; + int src_strides[4] = {src_fb->y_stride, src_fb->uv_stride, src_fb->uv_stride, + src_fb->alpha_stride}; - uint8_t *dsts[3] = {dst_fb->y_buffer, dst_fb->u_buffer, dst_fb->v_buffer}; - int dst_strides[3] = {dst_fb->y_stride, dst_fb->uv_stride, dst_fb->uv_stride}; + uint8_t *dsts[4] = {dst_fb->y_buffer, dst_fb->u_buffer, dst_fb->v_buffer, + dst_fb->alpha_buffer}; + int dst_strides[4] = {dst_fb->y_stride, dst_fb->uv_stride, dst_fb->uv_stride, + dst_fb->alpha_stride}; for (y = 0; y < out_h; y += 16) { for (x = 0; x < out_w; x += 16) { @@ -2791,6 +2807,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } #endif loop_count = 0; + vpx_memset(cpi->rd_tx_select_threshes, 0, sizeof(cpi->rd_tx_select_threshes)); if (cm->frame_type != KEY_FRAME) { /* TODO: Decide this more intelligently */ @@ -2919,11 +2936,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, #endif // transform / motion compensation build reconstruction frame -#if CONFIG_MODELCOEFPROB if (cm->frame_type == KEY_FRAME) { vp9_default_coef_probs(cm); } -#endif vp9_encode_frame(cpi); @@ -3153,6 +3168,15 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Pick the loop filter level for the frame. 
loopfilter_frame(cpi, cm); +#if WRITE_RECON_BUFFER + if (cm->show_frame) + write_cx_frame_to_file(cm->frame_to_show, + cm->current_video_frame + 2000); + else + write_cx_frame_to_file(cm->frame_to_show, + cm->current_video_frame + 3000); +#endif + // build the bitstream cpi->dummy_packing = 0; vp9_pack_bitstream(cpi, dest, size); @@ -3172,7 +3196,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, release_scaled_references(cpi); update_reference_frames(cpi); -#if CONFIG_MODELCOEFPROB vp9_full_to_model_counts(cpi->common.fc.coef_counts_4x4, cpi->coef_counts_4x4); vp9_full_to_model_counts(cpi->common.fc.coef_counts_8x8, @@ -3181,12 +3204,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->coef_counts_16x16); vp9_full_to_model_counts(cpi->common.fc.coef_counts_32x32, cpi->coef_counts_32x32); -#else - vp9_copy(cpi->common.fc.coef_counts_4x4, cpi->coef_counts_4x4); - vp9_copy(cpi->common.fc.coef_counts_8x8, cpi->coef_counts_8x8); - vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16); - vp9_copy(cpi->common.fc.coef_counts_32x32, cpi->coef_counts_32x32); -#endif if (!cpi->common.error_resilient_mode && !cpi->common.frame_parallel_decoding_mode) { vp9_adapt_coef_probs(&cpi->common); diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index 57d19ca63..e3e95eda9 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -76,17 +76,10 @@ typedef struct { // 0 = I4X4_PRED, ZERO_MV, MV, SPLIT signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; -#if CONFIG_MODELCOEFPROB vp9_coeff_probs_model coef_probs_4x4[BLOCK_TYPES]; vp9_coeff_probs_model coef_probs_8x8[BLOCK_TYPES]; vp9_coeff_probs_model coef_probs_16x16[BLOCK_TYPES]; vp9_coeff_probs_model coef_probs_32x32[BLOCK_TYPES]; -#else - vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES]; - vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES]; - vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES]; - vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES]; -#endif vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1]; vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */ @@ -231,9 +224,9 @@ typedef struct { int optimize_coefficients; int no_skip_block4x4_search; int search_best_filter; - int splitmode_breakout; int mb16_breakout; int static_segmentation; + int comp_inter_joint_serach; } SPEED_FEATURES; enum BlockSize { @@ -265,6 +258,14 @@ typedef struct VP9_COMP { DECLARE_ALIGNED(16, short, uv_zbin[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, uv_round[QINDEX_RANGE][16]); +#if CONFIG_ALPHA + DECLARE_ALIGNED(16, short, a_quant[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, unsigned char, a_quant_shift[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, a_zbin[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, a_round[QINDEX_RANGE][16]); + + DECLARE_ALIGNED(16, short, zrun_zbin_boost_a[QINDEX_RANGE][16]); +#endif DECLARE_ALIGNED(16, short, zrun_zbin_boost_y[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]); diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index aea350bc4..53d8be775 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -93,7 +93,7 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs, scan = get_scan_16x16(tx_type); break; default: - scan = vp9_default_zig_zag1d_32x32; + scan = vp9_default_scan_32x32; break; } @@ -148,6 +148,9 @@ void vp9_init_quantizer(VP9_COMP *cpi) { int i; int quant_val; int quant_uv_val; +#if CONFIG_ALPHA + int quant_alpha_val; +#endif int q; static const int zbin_boost[16] = { 0, 0, 0, 8, 8, 8, 
10, 12, @@ -168,7 +171,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) { cpi->common.y_dequant[q][0] = quant_val; cpi->zrun_zbin_boost_y[q][0] = (quant_val * zbin_boost[0]) >> 7; - quant_val = vp9_dc_quant(q, cpi->common.uv_dc_delta_q); invert_quant(cpi->uv_quant[q] + 0, cpi->uv_quant_shift[q] + 0, quant_val); cpi->uv_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); @@ -176,13 +178,26 @@ void vp9_init_quantizer(VP9_COMP *cpi) { cpi->common.uv_dequant[q][0] = quant_val; cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7; +#if CONFIG_ALPHA + quant_val = vp9_dc_quant(q, cpi->common.a_dc_delta_q); + invert_quant(cpi->a_quant[q] + 0, cpi->a_quant_shift[q] + 0, quant_val); + cpi->a_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); + cpi->a_round[q][0] = (qrounding_factor * quant_val) >> 7; + cpi->common.a_dequant[q][0] = quant_val; + cpi->zrun_zbin_boost_a[q][0] = (quant_val * zbin_boost[0]) >> 7; +#endif + quant_val = vp9_ac_quant(q, 0); cpi->common.y_dequant[q][1] = quant_val; quant_uv_val = vp9_ac_quant(q, cpi->common.uv_ac_delta_q); cpi->common.uv_dequant[q][1] = quant_uv_val; +#if CONFIG_ALPHA + quant_alpha_val = vp9_ac_quant(q, cpi->common.a_ac_delta_q); + cpi->common.a_dequant[q][1] = quant_alpha_val; +#endif // all the 4x4 ac values =; for (i = 1; i < 16; i++) { - int rc = vp9_default_zig_zag1d_4x4[i]; + int rc = vp9_default_scan_4x4[i]; invert_quant(cpi->y_quant[q] + rc, cpi->y_quant_shift[q] + rc, quant_val); cpi->y_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); @@ -196,6 +211,16 @@ void vp9_init_quantizer(VP9_COMP *cpi) { cpi->uv_round[q][rc] = (qrounding_factor * quant_uv_val) >> 7; cpi->zrun_zbin_boost_uv[q][i] = ROUND_POWER_OF_TWO(quant_uv_val * zbin_boost[i], 7); + +#if CONFIG_ALPHA + invert_quant(cpi->a_quant[q] + rc, cpi->a_quant_shift[q] + rc, + quant_alpha_val); + cpi->a_zbin[q][rc] = + ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7); + cpi->a_round[q][rc] = (qrounding_factor * quant_alpha_val) >> 7; + cpi->zrun_zbin_boost_a[q][i] = + ROUND_POWER_OF_TWO(quant_alpha_val * zbin_boost[i], 7); +#endif } } } @@ -233,6 +258,16 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex]; } +#if CONFIG_ALPHA + x->plane[3].quant = cpi->a_quant[qindex]; + x->plane[3].quant_shift = cpi->a_quant_shift[qindex]; + x->plane[3].zbin = cpi->a_zbin[qindex]; + x->plane[3].round = cpi->a_round[qindex]; + x->plane[3].zrun_zbin_boost = cpi->zrun_zbin_boost_a[qindex]; + x->plane[3].zbin_extra = (int16_t)zbin_extra; + x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex]; +#endif + x->skip_block = vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP); /* save this macroblock QIndex for vp9_update_zbin_extra() */ diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 8c1ef4915..15ed8318c 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -106,11 +106,7 @@ const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { }; static void fill_token_costs(vp9_coeff_count *c, -#if CONFIG_MODELCOEFPROB vp9_coeff_probs_model *p, -#else - vp9_coeff_probs *p, -#endif TX_SIZE tx_size) { int i, j, k, l; @@ -118,15 +114,10 @@ static void fill_token_costs(vp9_coeff_count *c, for (j = 0; j < REF_TYPES; j++) for (k = 0; k < COEF_BANDS; k++) for (l = 0; l < PREV_COEF_CONTEXTS; l++) { -#if CONFIG_MODELCOEFPROB vp9_prob probs[ENTROPY_NODES]; - vp9_model_to_full_probs(p[i][j][k][l], i, j, probs); + vp9_model_to_full_probs(p[i][j][k][l], probs); vp9_cost_tokens_skip((int 
*)c[i][j][k][l], probs, vp9_coef_tree); -#else - vp9_cost_tokens_skip((int *)c[i][j][k][l], p[i][j][k][l], - vp9_coef_tree); -#endif } } @@ -280,11 +271,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, TX_TYPE tx_type = DCT_DCT; const int segment_id = xd->mode_info_context->mbmi.segment_id; -#if CONFIG_MODELCOEFPROB vp9_prob coef_probs[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]; -#else - vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][ENTROPY_NODES]; -#endif int seg_eob, default_eob; uint8_t token_cache[1024]; const uint8_t * band_translate; @@ -304,12 +291,8 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, get_tx_type_4x4(xd, block) : DCT_DCT; above_ec = A[0] != 0; left_ec = L[0] != 0; -#if CONFIG_MODELCOEFPROB vp9_model_to_full_probs_sb(cm->fc.coef_probs_4x4[type][ref], - type, ref, coef_probs); -#else - coef_probs = cm->fc.coef_probs_4x4[type][ref]; -#endif + coef_probs); seg_eob = 16; scan = get_scan_4x4(tx_type); band_translate = vp9_coefband_trans_4x4; @@ -324,12 +307,8 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, above_ec = (A[0] + A[1]) != 0; left_ec = (L[0] + L[1]) != 0; scan = get_scan_8x8(tx_type); -#if CONFIG_MODELCOEFPROB vp9_model_to_full_probs_sb(cm->fc.coef_probs_8x8[type][ref], - type, ref, coef_probs); -#else - coef_probs = cm->fc.coef_probs_8x8[type][ref]; -#endif + coef_probs); seg_eob = 64; band_translate = vp9_coefband_trans_8x8plus; break; @@ -341,12 +320,8 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT; scan = get_scan_16x16(tx_type); -#if CONFIG_MODELCOEFPROB vp9_model_to_full_probs_sb(cm->fc.coef_probs_16x16[type][ref], - type, ref, coef_probs); -#else - coef_probs = cm->fc.coef_probs_16x16[type][ref]; -#endif + coef_probs); seg_eob = 256; above_ec = (A[0] + A[1] + A[2] + A[3]) != 0; left_ec = (L[0] + L[1] + L[2] + L[3]) != 0; @@ -354,13 +329,9 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, break; } case TX_32X32: - scan = vp9_default_zig_zag1d_32x32; -#if CONFIG_MODELCOEFPROB + scan = vp9_default_scan_32x32; vp9_model_to_full_probs_sb(cm->fc.coef_probs_32x32[type][ref], - type, ref, coef_probs); -#else - coef_probs = cm->fc.coef_probs_32x32[type][ref]; -#endif + coef_probs); seg_eob = 1024; above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0; left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0; @@ -609,78 +580,92 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, int *bmode_costs, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, - int *bestdistortion) { - B_PREDICTION_MODE mode; + int *bestdistortion, + BLOCK_SIZE_TYPE bsize) { + MB_PREDICTION_MODE mode; MACROBLOCKD *xd = &x->e_mbd; int64_t best_rd = INT64_MAX; int rate = 0; int distortion; VP9_COMMON *const cm = &cpi->common; const int src_stride = x->plane[0].src.stride; - uint8_t* const src = - raster_block_offset_uint8(xd, - BLOCK_SIZE_SB8X8, - 0, ib, - x->plane[0].src.buf, src_stride); - int16_t* const src_diff = - raster_block_offset_int16(xd, - BLOCK_SIZE_SB8X8, - 0, ib, - x->plane[0].src_diff); - int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16); - uint8_t* const dst = - raster_block_offset_uint8(xd, - BLOCK_SIZE_SB8X8, - 0, ib, - xd->plane[0].dst.buf, xd->plane[0].dst.stride); - ENTROPY_CONTEXT ta = *a, tempa = *a; - ENTROPY_CONTEXT tl = *l, templ = *l; + uint8_t *src, *dst; + int16_t 
*src_diff, *coeff; + + ENTROPY_CONTEXT ta[2], tempa[2]; + ENTROPY_CONTEXT tl[2], templ[2]; TX_TYPE tx_type = DCT_DCT; TX_TYPE best_tx_type = DCT_DCT; - /* - * The predictor buffer is a 2d buffer with a stride of 16. Create - * a temp buffer that meets the stride requirements, but we are only - * interested in the left 4x4 block - * */ - DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16); + int bw = 1 << b_width_log2(bsize); + int bh = 1 << b_height_log2(bsize); + int idx, idy, block; + DECLARE_ALIGNED(16, int16_t, best_dqcoeff[4][16]); assert(ib < 4); + vpx_memcpy(ta, a, sizeof(ta)); + vpx_memcpy(tl, l, sizeof(tl)); xd->mode_info_context->mbmi.txfm_size = TX_4X4; - for (mode = B_DC_PRED; mode < LEFT4X4; mode++) { + + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { int64_t this_rd; - int ratey; + int ratey = 0; xd->mode_info_context->bmi[ib].as_mode.first = mode; - rate = bmode_costs[mode]; + if (cm->frame_type == KEY_FRAME) + rate = bmode_costs[mode]; + else + rate = x->mbmode_cost[cm->frame_type][mode]; + distortion = 0; - vp9_intra4x4_predict(xd, ib, - BLOCK_SIZE_SB8X8, - mode, dst, xd->plane[0].dst.stride); - vp9_subtract_block(4, 4, src_diff, 8, - src, src_stride, - dst, xd->plane[0].dst.stride); + vpx_memcpy(tempa, ta, sizeof(ta)); + vpx_memcpy(templ, tl, sizeof(tl)); - xd->mode_info_context->bmi[ib].as_mode.first = mode; - tx_type = get_tx_type_4x4(xd, ib); - if (tx_type != DCT_DCT) { - vp9_short_fht4x4(src_diff, coeff, 8, tx_type); - x->quantize_b_4x4(x, ib, tx_type, 16); - } else { - x->fwd_txm4x4(src_diff, coeff, 16); - x->quantize_b_4x4(x, ib, tx_type, 16); - } + for (idy = 0; idy < bh; ++idy) { + for (idx = 0; idx < bw; ++idx) { + block = ib + idy * 2 + idx; + xd->mode_info_context->bmi[block].as_mode.first = mode; + src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, + x->plane[0].src.buf, src_stride); + src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, block, + x->plane[0].src_diff); + coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16); + dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, + xd->plane[0].dst.buf, + xd->plane[0].dst.stride); + vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, mode, + dst, xd->plane[0].dst.stride); + vp9_subtract_block(4, 4, src_diff, 8, + src, src_stride, + dst, xd->plane[0].dst.stride); + + tx_type = get_tx_type_4x4(xd, block); + if (tx_type != DCT_DCT) { + vp9_short_fht4x4(src_diff, coeff, 8, tx_type); + x->quantize_b_4x4(x, block, tx_type, 16); + } else { + x->fwd_txm4x4(src_diff, coeff, 16); + x->quantize_b_4x4(x, block, tx_type, 16); + } - tempa = ta; - templ = tl; + ratey += cost_coeffs(cm, x, 0, block, PLANE_TYPE_Y_WITH_DC, + tempa + idx, templ + idy, TX_4X4, 16); + distortion += vp9_block_error(coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff, + block, 16), 16) >> 2; - ratey = cost_coeffs(cm, x, 0, ib, - PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4, 16); - rate += ratey; - distortion = vp9_block_error(coeff, - BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16), - 16) >> 2; + vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, mode, + dst, xd->plane[0].dst.stride); + + if (best_tx_type != DCT_DCT) + vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16), + dst, xd->plane[0].dst.stride, best_tx_type); + else + xd->inv_txm4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16), + dst, xd->plane[0].dst.stride); + } + } + rate += ratey; this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); if (this_rd < best_rd) { @@ -690,25 +675,37 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, 
best_rd = this_rd; *best_mode = mode; best_tx_type = tx_type; - *a = tempa; - *l = templ; - vpx_memcpy(best_dqcoeff, BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16), 32); + vpx_memcpy(a, tempa, sizeof(tempa)); + vpx_memcpy(l, templ, sizeof(templ)); + for (idy = 0; idy < bh; ++idy) { + for (idx = 0; idx < bw; ++idx) { + block = ib + idy * 2 + idx; + vpx_memcpy(best_dqcoeff[idy * 2 + idx], + BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16), + sizeof(best_dqcoeff[0])); + } + } } } - xd->mode_info_context->bmi[ib].as_mode.first = - (B_PREDICTION_MODE)(*best_mode); - - vp9_intra4x4_predict(xd, ib, - BLOCK_SIZE_SB8X8, - *best_mode, - dst, xd->plane[0].dst.stride); - - // inverse transform - if (best_tx_type != DCT_DCT) { - vp9_short_iht4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride, - best_tx_type); - } else { - xd->inv_txm4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride); + + for (idy = 0; idy < bh; ++idy) { + for (idx = 0; idx < bw; ++idx) { + block = ib + idy * 2 + idx; + xd->mode_info_context->bmi[block].as_mode.first = *best_mode; + dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, + xd->plane[0].dst.buf, + xd->plane[0].dst.stride); + + vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, *best_mode, + dst, xd->plane[0].dst.stride); + // inverse transform + if (best_tx_type != DCT_DCT) + vp9_short_iht4x4_add(best_dqcoeff[idy * 2 + idx], dst, + xd->plane[0].dst.stride, best_tx_type); + else + xd->inv_txm4x4_add(best_dqcoeff[idy * 2 + idx], dst, + xd->plane[0].dst.stride); + } } return best_rd; @@ -717,17 +714,17 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, int *Rate, int *rate_y, int *Distortion, int64_t best_rd) { - int i; + int i, j; MACROBLOCKD *const xd = &mb->e_mbd; -#if CONFIG_AB4X4 + BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; + int bw = 1 << b_width_log2(bsize); + int bh = 1 << b_height_log2(bsize); + int idx, idy; int cost = 0; -#else - int cost = mb->mbmode_cost[xd->frame_type][I4X4_PRED]; -#endif int distortion = 0; int tot_rate_y = 0; int64_t total_rd = 0; - ENTROPY_CONTEXT t_above[2], t_left[2]; + ENTROPY_CONTEXT t_above[4], t_left[4]; int *bmode_costs; vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above)); @@ -736,31 +733,39 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, xd->mode_info_context->mbmi.mode = I4X4_PRED; bmode_costs = mb->inter_bmode_costs; - for (i = 0; i < 4; i++) { - const int x_idx = i & 1, y_idx = i >> 1; - MODE_INFO *const mic = xd->mode_info_context; - const int mis = xd->mode_info_stride; - B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); - int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d); + for (idy = 0; idy < 2; idy += bh) { + for (idx = 0; idx < 2; idx += bw) { + MODE_INFO *const mic = xd->mode_info_context; + const int mis = xd->mode_info_stride; + B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); + int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry); + int UNINITIALIZED_IS_SAFE(d); + i = idy * 2 + idx; - if (xd->frame_type == KEY_FRAME) { - const B_PREDICTION_MODE A = above_block_mode(mic, i, mis); - const B_PREDICTION_MODE L = left_block_mode(mic, i); + if (xd->frame_type == KEY_FRAME) { + const MB_PREDICTION_MODE A = above_block_mode(mic, i, mis); + const MB_PREDICTION_MODE L = (xd->left_available || idx) ? 
+ left_block_mode(mic, i) : DC_PRED; - bmode_costs = mb->bmode_costs[A][L]; - } + bmode_costs = mb->bmode_costs[A][L]; + } - total_rd += rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs, - t_above + x_idx, t_left + y_idx, - &r, &ry, &d); - cost += r; - distortion += d; - tot_rate_y += ry; + total_rd += rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs, + t_above + idx, t_left + idy, + &r, &ry, &d, bsize); + cost += r; + distortion += d; + tot_rate_y += ry; - mic->bmi[i].as_mode.first = best_mode; + mic->bmi[i].as_mode.first = best_mode; + for (j = 1; j < bh; ++j) + mic->bmi[i + j * 2].as_mode.first = best_mode; + for (j = 1; j < bw; ++j) + mic->bmi[i + j].as_mode.first = best_mode; - if (total_rd >= best_rd) - break; + if (total_rd >= best_rd) + break; + } } if (total_rd >= best_rd) @@ -780,18 +785,17 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int64_t txfm_cache[NB_TXFM_MODES]) { MB_PREDICTION_MODE mode; MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); + MACROBLOCKD *xd = &x->e_mbd; int this_rate, this_rate_tokenonly; int this_distortion, s; int64_t best_rd = INT64_MAX, this_rd; TX_SIZE UNINITIALIZED_IS_SAFE(best_tx); int i; -#if CONFIG_AB4X4 if (bsize < BLOCK_SIZE_SB8X8) { x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4; return best_rd; } -#endif for (i = 0; i < NB_TXFM_MODES; i++) txfm_cache[i] = INT64_MAX; @@ -799,13 +803,20 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, /* Y Search for 32x32 intra prediction mode */ for (mode = DC_PRED; mode <= TM_PRED; mode++) { int64_t local_txfm_cache[NB_TXFM_MODES]; + MODE_INFO *const mic = xd->mode_info_context; + const int mis = xd->mode_info_stride; + const MB_PREDICTION_MODE A = above_block_mode(mic, 0, mis); + const MB_PREDICTION_MODE L = xd->left_available ? + left_block_mode(mic, 0) : DC_PRED; + + int *bmode_costs = x->bmode_costs[A][L]; x->e_mbd.mode_info_context->mbmi.mode = mode; vp9_build_intra_predictors_sby_s(&x->e_mbd, bsize); super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, bsize, local_txfm_cache); - this_rate = this_rate_tokenonly + x->mbmode_cost[x->e_mbd.frame_type][mode]; + this_rate = this_rate_tokenonly + bmode_costs[mode]; this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion); if (this_rd < best_rd) { @@ -940,11 +951,9 @@ static int labels2mode(MACROBLOCK *x, MB_MODE_INFO * mbmi = &mic->mbmi; const int mis = xd->mode_info_stride; int i, cost = 0, thismvcost = 0; -#if CONFIG_AB4X4 int idx, idy; int bw = 1 << b_width_log2(mbmi->sb_type); int bh = 1 << b_height_log2(mbmi->sb_type); -#endif /* We have to be careful retrieving previously-encoded motion vectors. 
Ones from this macroblock have to be pulled from the BLOCKD array @@ -1028,7 +1037,6 @@ static int labels2mode(MACROBLOCK *x, x->partition_info->bmi[i].mv.as_int = this_mv->as_int; if (mbmi->second_ref_frame > 0) x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int; -#if CONFIG_AB4X4 for (idy = 0; idy < bh; ++idy) { for (idx = 0; idx < bw; ++idx) { vpx_memcpy(&mic->bmi[i + idy * 2 + idx], @@ -1038,7 +1046,6 @@ static int labels2mode(MACROBLOCK *x, sizeof(x->partition_info->bmi[i])); } } -#endif } cost += thismvcost; @@ -1059,9 +1066,6 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm, int bwl = b_width_log2(bsize), bw = 1 << bwl; int bhl = b_height_log2(bsize), bh = 1 << bhl; int idx, idy; -#if !CONFIG_AB4X4 - bw = 1, bh = 1; -#endif *labelyrate = 0; *distortion = 0; @@ -1229,18 +1233,10 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, ENTROPY_CONTEXT t_above[4], t_left[4]; ENTROPY_CONTEXT t_above_b[4], t_left_b[4]; -#if !CONFIG_AB4X4 - bh = 1, bw = 1; -#endif - vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above)); vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left)); -#if CONFIG_AB4X4 v_fn_ptr = &cpi->fn_ptr[get_block_size(4 << bwl, 4 << bhl)]; -#else - v_fn_ptr = &cpi->fn_ptr[BLOCK_4X4]; -#endif // 64 makes this threshold really big effectively // making it so that we very rarely check mvs on @@ -1249,19 +1245,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, label_mv_thresh = 1 * bsi->mvthresh / label_count; // Segmentation method overheads -#if !CONFIG_AB4X4 - rate += vp9_cost_mv_ref(cpi, SPLITMV, - mbmi->mb_mode_context[mbmi->ref_frame]); - this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0); - br += rate; -#endif other_segment_rd = this_segment_rd; for (idy = 0; idy < 2; idy += bh) { for (idx = 0; idx < 2; idx += bw) { // TODO(jingning,rbultje): rewrite the rate-distortion optimization - // loop for 4x4/4x8/8x4 block coding -#if CONFIG_AB4X4 + // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop +#if CONFIG_AB4X4 || 1 int_mv mode_mv[B_MODE_COUNT], second_mode_mv[B_MODE_COUNT]; int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX; B_PREDICTION_MODE mode_selected = ZERO4X4; @@ -1912,6 +1902,10 @@ static void setup_pred_block(const MACROBLOCKD *xd, dst[1].buf = src->u_buffer; dst[2].buf = src->v_buffer; dst[1].stride = dst[2].stride = src->uv_stride; +#if CONFIG_ALPHA + dst[3].buf = src->alpha_buffer; + dst[3].stride = src->alpha_stride; +#endif // TODO(jkoleszar): Make scale factors per-plane data for (i = 0; i < MAX_MB_PLANE; i++) { @@ -2108,157 +2102,145 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, ref_mv[1] = mbmi->ref_mvs[refs[1]][0]; if (is_comp_pred) { -#if CONFIG_COMP_INTER_JOINT_SEARCH - const int b_sz[BLOCK_SIZE_TYPES][2] = { - {4, 4}, - {8, 8}, - {8, 16}, - {16, 8}, - {16, 16}, - {16, 32}, - {32, 16}, - {32, 32}, - {32, 64}, - {64, 32}, - {64, 64} - }; - - int ite; - // Prediction buffer from second frame. - uint8_t *second_pred = vpx_memalign(16, b_sz[bsize][0] * - b_sz[bsize][1] * sizeof(uint8_t)); - - // Do joint motion search in compound mode to get more accurate mv. 
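Stepping back for a moment to the labels2mode() and rd_pick_intra4x4mby_modes() hunks earlier in this file: both now follow the same pattern, where a mode (or motion vector) chosen for a 4x8, 8x4 or 4x4 partition is copied into every 4x4 bmi entry that the partition covers, because an 8x8 block keeps its sub-block state as a 2x2 grid of 4x4 units. The following is a minimal illustrative sketch of that replication only, not code from the patch; BMI_SKETCH is a hypothetical stand-in for the real bmi/partition_info entries.

    /*
     * Illustrative only: an 8x8 block stores per-4x4 state in a 2x2 grid, so
     * the unit at (idx, idy) relative to the first covered unit i sits at
     * index i + idy * 2 + idx.
     */
    typedef struct { int mode; } BMI_SKETCH;

    static void replicate_sub8x8(BMI_SKETCH bmi[4], int i, int bw, int bh,
                                 BMI_SKETCH chosen) {
      int idx, idy;
      for (idy = 0; idy < bh; ++idy)
        for (idx = 0; idx < bw; ++idx)
          bmi[i + idy * 2 + idx] = chosen;  /* same choice for each 4x4 unit */
    }

With bw = bh = 1 (a 4x4 partition) this is a single write; an 8x4 partition (bw = 2, bh = 1) fills units i and i + 1, and a 4x8 partition fills i and i + 2, matching the explicit j loops in rd_pick_intra4x4mby_modes() above.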
- struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; - struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}}; - struct buf_2d scaled_first_yv12; - int last_besterr[2] = {INT_MAX, INT_MAX}; - - if (scaled_ref_frame[0]) { - int i; - - // Swap out the reference frame for a version that's been scaled to - // match the resolution of the current frame, allowing the existing - // motion search code to be used without additional modifications. - for (i = 0; i < MAX_MB_PLANE; i++) - backup_yv12[i] = xd->plane[i].pre[0]; - - setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col, - NULL, NULL); - } - - if (scaled_ref_frame[1]) { - int i; - - for (i = 0; i < MAX_MB_PLANE; i++) - backup_second_yv12[i] = xd->plane[i].pre[1]; - - setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col, - NULL, NULL); - } - xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0], - mi_row, mi_col); - xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1], - mi_row, mi_col); - - scaled_first_yv12 = xd->plane[0].pre[0]; - - // Initialize mv using single prediction mode result. - frame_mv[NEWMV][refs[0]].as_int = single_newmv[refs[0]].as_int; - frame_mv[NEWMV][refs[1]].as_int = single_newmv[refs[1]].as_int; - - // Allow joint search multiple times iteratively for each ref frame, and - // break out the search loop if it couldn't find better mv. - for (ite = 0; ite < 4; ite++) { - struct buf_2d ref_yv12[2] = {xd->plane[0].pre[0], - xd->plane[0].pre[1]}; - int bestsme = INT_MAX; - int sadpb = x->sadperbit16; - int_mv tmp_mv; - int search_range = 3; - - int tmp_col_min = x->mv_col_min; - int tmp_col_max = x->mv_col_max; - int tmp_row_min = x->mv_row_min; - int tmp_row_max = x->mv_row_max; - int id = ite % 2; - - // Get pred block from second frame. - vp9_build_inter_predictor(ref_yv12[!id].buf, - ref_yv12[!id].stride, - second_pred, b_sz[bsize][0], - &frame_mv[NEWMV][refs[!id]], - &xd->scale_factor[!id], - b_sz[bsize][0], b_sz[bsize][1], 0, - &xd->subpix); - - // Compound motion search on first ref frame. - if (id) - xd->plane[0].pre[0] = ref_yv12[id]; - vp9_clamp_mv_min_max(x, &ref_mv[id]); - - // Use mv result from single mode as mvp. - tmp_mv.as_int = frame_mv[NEWMV][refs[id]].as_int; - - tmp_mv.as_mv.col >>= 3; - tmp_mv.as_mv.row >>= 3; - - // Small-range full-pixel motion search - bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb, - search_range, - &cpi->fn_ptr[block_size], - x->nmvjointcost, x->mvcost, - &ref_mv[id], second_pred, - b_sz[bsize][0], b_sz[bsize][1]); + if (cpi->sf.comp_inter_joint_serach) { + int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize); + int ite; + // Prediction buffer from second frame. + uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t)); + + // Do joint motion search in compound mode to get more accurate mv. + struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; + struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}}; + struct buf_2d scaled_first_yv12; + int last_besterr[2] = {INT_MAX, INT_MAX}; + + if (scaled_ref_frame[0]) { + int i; + + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. 
+ for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[i] = xd->plane[i].pre[0]; + + setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col, + NULL, NULL); + } - x->mv_col_min = tmp_col_min; - x->mv_col_max = tmp_col_max; - x->mv_row_min = tmp_row_min; - x->mv_row_max = tmp_row_max; + if (scaled_ref_frame[1]) { + int i; - if (bestsme < INT_MAX) { - int dis; /* TODO: use dis in distortion calculation later. */ - unsigned int sse; + for (i = 0; i < MAX_MB_PLANE; i++) + backup_second_yv12[i] = xd->plane[i].pre[1]; - bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv, - &ref_mv[id], - x->errorperbit, - &cpi->fn_ptr[block_size], - x->nmvjointcost, x->mvcost, - &dis, &sse, second_pred, - b_sz[bsize][0], - b_sz[bsize][1]); + setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col, + NULL, NULL); } + xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0], + mi_row, mi_col); + xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1], + mi_row, mi_col); + + scaled_first_yv12 = xd->plane[0].pre[0]; + + // Initialize mv using single prediction mode result. + frame_mv[NEWMV][refs[0]].as_int = single_newmv[refs[0]].as_int; + frame_mv[NEWMV][refs[1]].as_int = single_newmv[refs[1]].as_int; + + // Allow joint search multiple times iteratively for each ref frame + // and break out the search loop if it couldn't find better mv. + for (ite = 0; ite < 4; ite++) { + struct buf_2d ref_yv12[2]; + int bestsme = INT_MAX; + int sadpb = x->sadperbit16; + int_mv tmp_mv; + int search_range = 3; + + int tmp_col_min = x->mv_col_min; + int tmp_col_max = x->mv_col_max; + int tmp_row_min = x->mv_row_min; + int tmp_row_max = x->mv_row_max; + int id = ite % 2; + + // Initialized here because of compiler problem in Visual Studio. + ref_yv12[0] = xd->plane[0].pre[0]; + ref_yv12[1] = xd->plane[0].pre[1]; + + // Get pred block from second frame. + vp9_build_inter_predictor(ref_yv12[!id].buf, + ref_yv12[!id].stride, + second_pred, pw, + &frame_mv[NEWMV][refs[!id]], + &xd->scale_factor[!id], + pw, ph, 0, + &xd->subpix); + + // Compound motion search on first ref frame. + if (id) + xd->plane[0].pre[0] = ref_yv12[id]; + vp9_clamp_mv_min_max(x, &ref_mv[id]); + + // Use mv result from single mode as mvp. + tmp_mv.as_int = frame_mv[NEWMV][refs[id]].as_int; + + tmp_mv.as_mv.col >>= 3; + tmp_mv.as_mv.row >>= 3; + + // Small-range full-pixel motion search + bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb, + search_range, + &cpi->fn_ptr[block_size], + x->nmvjointcost, x->mvcost, + &ref_mv[id], second_pred, + pw, ph); + + x->mv_col_min = tmp_col_min; + x->mv_col_max = tmp_col_max; + x->mv_row_min = tmp_row_min; + x->mv_row_max = tmp_row_max; + + if (bestsme < INT_MAX) { + int dis; /* TODO: use dis in distortion calculation later. 
*/ + unsigned int sse; + + bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv, + &ref_mv[id], + x->errorperbit, + &cpi->fn_ptr[block_size], + x->nmvjointcost, x->mvcost, + &dis, &sse, second_pred, + pw, ph); + } - if (id) - xd->plane[0].pre[0] = scaled_first_yv12; + if (id) + xd->plane[0].pre[0] = scaled_first_yv12; - if (bestsme < last_besterr[id]) { + if (bestsme < last_besterr[id]) { frame_mv[NEWMV][refs[id]].as_int = - xd->mode_info_context->bmi[0].as_mv[1].as_int = tmp_mv.as_int; - last_besterr[id] = bestsme; - } else { - break; + xd->mode_info_context->bmi[0].as_mv[1].as_int = tmp_mv.as_int; + last_besterr[id] = bestsme; + } else { + break; + } } - } - // restore the predictor - if (scaled_ref_frame[0]) { - int i; + // restore the predictor + if (scaled_ref_frame[0]) { + int i; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[0] = backup_yv12[i]; - } + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[0] = backup_yv12[i]; + } - if (scaled_ref_frame[1]) { - int i; + if (scaled_ref_frame[1]) { + int i; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[1] = backup_second_yv12[i]; - } + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[1] = backup_second_yv12[i]; + } - vpx_free(second_pred); -#endif // CONFIG_COMP_INTER_JOINT_SEARCH + vpx_free(second_pred); + } if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV || frame_mv[NEWMV][refs[1]].as_int == INVALID_MV) @@ -2577,11 +2559,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, &dist_uv, &uv_skip, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); -#if CONFIG_AB4X4 if (bsize < BLOCK_SIZE_SB8X8) -#else - if (bsize == BLOCK_SIZE_SB8X8) -#endif err4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4_y, &rate4x4_y_tokenonly, &dist4x4_y, err); @@ -2593,11 +2571,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff)); xd->mode_info_context->mbmi.mode = mode; xd->mode_info_context->mbmi.txfm_size = txfm_size; -#if CONFIG_AB4X4 } else if (bsize < BLOCK_SIZE_SB8X8 && err4x4 < err) { -#else - } else if (bsize == BLOCK_SIZE_SB8X8 && err4x4 < err) { -#endif *returnrate = rate4x4_y + rate_uv + vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0); *returndist = dist4x4_y + (dist_uv >> 2); @@ -2762,17 +2736,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, txfm_cache[i] = INT64_MAX; // Test best rd so far against threshold for trying this mode. 
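The compound-prediction changes above move the iterative joint motion search from a compile-time CONFIG_COMP_INTER_JOINT_SEARCH block to a runtime check on cpi->sf.comp_inter_joint_serach, but the algorithm itself is unchanged: the two reference MVs are refined alternately, each pass building the prediction from the other (fixed) reference and stopping as soon as an iteration fails to improve. Below is a self-contained sketch of that alternation only; joint_err_fn is a placeholder for the compound SAD/sub-pixel error that vp9_refining_search_8p_c() and vp9_find_best_sub_pixel_comp() compute in the real code, and the small 3x3 window is purely illustrative.

    #include <limits.h>

    typedef struct { int row, col; } MV;

    /* Error of the compound (two-reference) prediction for a candidate MV
     * pair; in the patch this role is played by the refining/sub-pixel
     * search functions named above. */
    typedef int (*joint_err_fn)(const MV *mv0, const MV *mv1, void *ctx);

    static void joint_search_sketch(MV mv[2], joint_err_fn err, void *ctx) {
      int best_err[2] = { INT_MAX, INT_MAX };
      int ite;
      for (ite = 0; ite < 4; ++ite) {
        const int id = ite & 1;            /* reference whose MV is refined */
        MV best_mv = mv[id];
        int improved = 0, dr, dc;
        for (dr = -1; dr <= 1; ++dr) {
          for (dc = -1; dc <= 1; ++dc) {
            const MV save = mv[id];
            MV trial;
            int e;
            trial.row = save.row + dr;
            trial.col = save.col + dc;
            mv[id] = trial;
            e = err(&mv[0], &mv[1], ctx);  /* cost with the other MV fixed */
            mv[id] = save;
            if (e < best_err[id]) {
              best_err[id] = e;
              best_mv = trial;
              improved = 1;
            }
          }
        }
        mv[id] = best_mv;
        if (!improved)
          break;                           /* no further gain: stop early */
      }
    }

The real loop likewise alternates on id = ite % 2 and breaks when bestsme no longer beats last_besterr[id] for the reference being refined.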
-#if CONFIG_AB4X4 if (bsize >= BLOCK_SIZE_SB8X8 && (best_rd < cpi->rd_threshes[mode_index] || cpi->rd_threshes[mode_index] == INT_MAX)) continue; -#else - if (best_rd <= cpi->rd_threshes[mode_index] || - cpi->rd_threshes[mode_index] == INT_MAX) { - continue; - } -#endif x->skip = 0; this_mode = vp9_mode_order[mode_index].mode; @@ -2783,11 +2750,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, continue; } -#if CONFIG_AB4X4 if (cpi->speed > 0 && bsize >= BLOCK_SIZE_SB8X8) { -#else - if (cpi->speed > 0) { -#endif if (!(ref_frame_mask & (1 << ref_frame))) { continue; } @@ -2833,18 +2796,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, mbmi->interp_filter = cm->mcomp_filter_type; vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); -#if CONFIG_AB4X4 if (bsize >= BLOCK_SIZE_SB8X8 && (this_mode == I4X4_PRED || this_mode == SPLITMV)) continue; if (bsize < BLOCK_SIZE_SB8X8 && !(this_mode == I4X4_PRED || this_mode == SPLITMV)) continue; -#else - if (bsize != BLOCK_SIZE_SB8X8 && - (this_mode == I4X4_PRED || this_mode == SPLITMV)) - continue; -#endif if (comp_pred) { if (ref_frame == ALTREF_FRAME) { @@ -2919,11 +2876,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, distortion2 += dist_uv[TX_4X4]; distortion_uv = dist_uv[TX_4X4]; mbmi->uv_mode = mode_uv[TX_4X4]; -#if CONFIG_AB4X4 txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); for (i = 0; i < NB_TXFM_MODES; ++i) txfm_cache[i] = txfm_cache[ONLY_4X4]; -#endif } else if (ref_frame == INTRA_FRAME) { TX_SIZE uv_tx; vp9_build_intra_predictors_sby_s(xd, bsize); @@ -3057,11 +3012,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, distortion2 += distortion_uv; skippable = skippable && uv_skippable; -#if CONFIG_AB4X4 txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); for (i = 0; i < NB_TXFM_MODES; ++i) txfm_cache[i] = txfm_cache[ONLY_4X4]; -#endif if (!mode_excluded) { if (is_comp_pred) @@ -3117,11 +3070,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Is Mb level skip allowed (i.e. not coded at segment level). 
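Several hunks in this file rank candidates with RDCOST(x->rdmult, x->rddiv, rate, distortion), for example the txfm_cache[ONLY_4X4] assignments above. For orientation only: the cost is a weighted sum of rate and distortion, and the macro in the encoder headers is approximately the following (the exact rounding constant and normalising shift should be read as an assumption about this revision, not a quote from it).

    #include <stdint.h>

    /* Approximate shape of RDCOST(): rate is weighted by the Q-dependent
     * per-frame multiplier rdmult with a rounding bias and a fixed >> 8
     * normalisation; distortion is scaled by the divisor-as-shift rddiv. */
    static int64_t rdcost_sketch(int rdmult, int rddiv, int rate, int64_t dist) {
      return ((128 + (int64_t)rate * rdmult) >> 8) + (dist << rddiv);
    }

Lower is better, which is why the skip path above backs the coefficient rate out of rate2 before the mode is re-costed against best_rd.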
mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP); -#if CONFIG_AB4X4 if (skippable && bsize >= BLOCK_SIZE_SB8X8) { -#else - if (skippable) { -#endif // Back out the coefficient coding costs rate2 -= (rate_y + rate_uv); // for best_yrd calculation @@ -3302,13 +3251,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } -#if CONFIG_AB4X4 if (best_rd == INT64_MAX && bsize < BLOCK_SIZE_SB8X8) { *returnrate = INT_MAX; *returndistortion = INT_MAX; return best_rd; } -#endif assert((cm->mcomp_filter_type == SWITCHABLE) || (cm->mcomp_filter_type == best_mbmode.interp_filter) || @@ -3341,10 +3288,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0) && (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME) -#if CONFIG_AB4X4 - && bsize >= BLOCK_SIZE_SB8X8 -#endif - ) { + && bsize >= BLOCK_SIZE_SB8X8) { mbmi->mode = ZEROMV; mbmi->ref_frame = ALTREF_FRAME; mbmi->second_ref_frame = NONE; diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c index 994828f20..6b1ba4964 100644 --- a/vp9/encoder/vp9_sad_c.c +++ b/vp9/encoder/vp9_sad_c.c @@ -593,6 +593,37 @@ void vp9_sad8x4x4d_c(const uint8_t *src_ptr, ref_ptr[3], ref_stride, 0x7fffffff); } +void vp9_sad8x4x8_c(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + uint32_t *sad_array) { + sad_array[0] = vp9_sad8x4(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad8x4(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad8x4(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad8x4(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad8x4(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad8x4(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad8x4(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad8x4(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); +} + void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], @@ -608,6 +639,37 @@ void vp9_sad4x8x4d_c(const uint8_t *src_ptr, ref_ptr[3], ref_stride, 0x7fffffff); } +void vp9_sad4x8x8_c(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + uint32_t *sad_array) { + sad_array[0] = vp9_sad4x8(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad4x8(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad4x8(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad4x8(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad4x8(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad4x8(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad4x8(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad4x8(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); +} + void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 30143d77d..47792fcc2 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -205,9 +205,11 @@ static void 
temporal_filter_iterate_c(VP9_COMP *cpi, DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor, 16 * 16 + 8 * 8 + 8 * 8); // Save input state - uint8_t *y_buffer = mbd->plane[0].pre[0].buf; - uint8_t *u_buffer = mbd->plane[1].pre[0].buf; - uint8_t *v_buffer = mbd->plane[2].pre[0].buf; + uint8_t* input_buffer[MAX_MB_PLANE]; + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) + input_buffer[i] = mbd->plane[i].pre[0].buf; for (mb_row = 0; mb_row < mb_rows; mb_row++) { #if ALT_REF_MC_ENABLED @@ -352,9 +354,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, } // Restore input state - mbd->plane[0].pre[0].buf = y_buffer; - mbd->plane[1].pre[0].buf = u_buffer; - mbd->plane[2].pre[0].buf = v_buffer; + for (i = 0; i < MAX_MB_PLANE; i++) + mbd->plane[i].pre[0].buf = input_buffer[i]; } void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index 9ed16ffc3..08efc84d4 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -119,12 +119,8 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, TOKENEXTRA *t = *tp; /* store tokens starting here */ const int eob = xd->plane[plane].eobs[block]; const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16); -#if CONFIG_AB4X4 const BLOCK_SIZE_TYPE sb_type = (mbmi->sb_type < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : mbmi->sb_type; -#else - const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type; -#endif const int bwl = b_width_log2(sb_type); const int off = block >> (2 * tx_size); const int mod = bwl - tx_size - xd->plane[plane].subsampling_x; @@ -136,11 +132,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, const int segment_id = mbmi->segment_id; const int *scan, *nb; vp9_coeff_count *counts; -#if CONFIG_MODELCOEFPROB vp9_coeff_probs_model *coef_probs; -#else - vp9_coeff_probs *coef_probs; -#endif const int ref = mbmi->ref_frame != INTRA_FRAME; ENTROPY_CONTEXT above_ec, left_ec; uint8_t token_cache[1024]; @@ -194,7 +186,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0; left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0; seg_eob = 1024; - scan = vp9_default_zig_zag1d_32x32; + scan = vp9_default_scan_32x32; counts = cpi->coef_counts_32x32; coef_probs = cpi->common.fc.coef_probs_32x32; band_translate = vp9_coefband_trans_8x8plus; @@ -228,10 +220,6 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, t->token = token; t->context_tree = coef_probs[type][ref][band][pt]; -#if CONFIG_MODELCOEFPROB - t->block_type = type; - t->ref_type = ref; -#endif t->skip_eob_node = (c > 0) && (token_cache[scan[c - 1]] == 0); assert(vp9_coef_encodings[t->token].len - t->skip_eob_node > 0); diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h index 2a56da8d1..08236c429 100644 --- a/vp9/encoder/vp9_tokenize.h +++ b/vp9/encoder/vp9_tokenize.h @@ -26,10 +26,6 @@ typedef struct { int16_t extra; uint8_t token; uint8_t skip_eob_node; -#if CONFIG_MODELCOEFPROB - uint8_t block_type; - uint8_t ref_type; -#endif } TOKENEXTRA; typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h index 306476b01..aaa43ef82 100644 --- a/vp9/encoder/vp9_variance.h +++ b/vp9/encoder/vp9_variance.h @@ -86,7 +86,6 @@ typedef struct vp9_variance_vtable { vp9_sad_multi_d_fn_t sdx4df; } vp9_variance_fn_ptr_t; -// #if CONFIG_COMP_INTER_JOINT_SEARCH 
static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int weight, int height, uint8_t *ref, int ref_stride) { int i, j; @@ -102,5 +101,4 @@ static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int weight, ref += ref_stride; } } -// #endif // CONFIG_COMP_INTER_JOINT_SEARCH #endif // VP9_ENCODER_VP9_VARIANCE_H_ diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c index fc363b6b0..67ca9257c 100644 --- a/vp9/encoder/x86/vp9_variance_sse2.c +++ b/vp9/encoder/x86/vp9_variance_sse2.c @@ -139,7 +139,37 @@ void vp9_half_vert_variance16x_h_sse2 DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]); -unsigned int vp9_variance4x4_wmt( +typedef unsigned int (*get_var_sse2) ( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum +); + +static void variance_sse2(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + int w, int h, unsigned int *sse, int *sum, + get_var_sse2 var_fn, int block_size) { + unsigned int sse0; + int sum0; + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + var_fn(src_ptr + source_stride * i + j, source_stride, + ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +unsigned int vp9_variance4x4_sse2( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, @@ -148,13 +178,41 @@ unsigned int vp9_variance4x4_wmt( unsigned int var; int avg; - vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg); + variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, + &var, &avg, vp9_get4x4var_mmx, 4); *sse = var; return (var - (((unsigned int)avg * avg) >> 4)); +} + +unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, + int recon_stride, + unsigned int *sse) { + unsigned int var; + int avg; + + variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4, + &var, &avg, vp9_get4x4var_mmx, 4); + *sse = var; + return (var - (((unsigned int)avg * avg) >> 5)); +} + +unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, + int recon_stride, + unsigned int *sse) { + unsigned int var; + int avg; + variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8, + &var, &avg, vp9_get4x4var_mmx, 4); + *sse = var; + return (var - (((unsigned int)avg * avg) >> 5)); } -unsigned int vp9_variance8x8_wmt +unsigned int vp9_variance8x8_sse2 ( const unsigned char *src_ptr, int source_stride, @@ -164,83 +222,157 @@ unsigned int vp9_variance8x8_wmt unsigned int var; int avg; - vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg); + variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, + &var, &avg, vp9_get8x8var_sse2, 8); *sse = var; return (var - (((unsigned int)avg * avg) >> 6)); - } - -unsigned int vp9_variance16x16_wmt +unsigned int vp9_variance16x8_sse2 ( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { - unsigned int sse0; - int sum0; - + unsigned int var; + int avg; - vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0); - *sse = sse0; - return (sse0 - (((unsigned int)sum0 * sum0) >> 8)); + variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, + &var, &avg, 
vp9_get8x8var_sse2, 8); + *sse = var; + return (var - (((unsigned int)avg * avg) >> 7)); } -unsigned int vp9_mse16x16_wmt( +unsigned int vp9_variance8x16_sse2 +( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { + unsigned int var; + int avg; - unsigned int sse0; - int sum0; - vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0); - *sse = sse0; - return sse0; - + variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, + &var, &avg, vp9_get8x8var_sse2, 8); + *sse = var; + return (var - (((unsigned int)avg * avg) >> 7)); } - -unsigned int vp9_variance16x8_wmt +unsigned int vp9_variance16x16_sse2 ( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - - vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0); - vp9_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); + unsigned int var; + int avg; - var = sse0 + sse1; - avg = sum0 + sum1; + variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, + &var, &avg, vp9_get16x16var_sse2, 16); *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); - + return (var - (((unsigned int)avg * avg) >> 8)); } -unsigned int vp9_variance8x16_wmt -( +unsigned int vp9_mse16x16_wmt( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0); - vp9_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1); + unsigned int sse0; + int sum0; + vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, + &sum0); + *sse = sse0; + return sse0; +} - var = sse0 + sse1; - avg = sum0 + sum1; +unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, + int recon_stride, + unsigned int *sse) { + unsigned int var; + int avg; + + variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, + &var, &avg, vp9_get16x16var_sse2, 16); *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); + return (var - (((int64_t)avg * avg) >> 10)); +} + +unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, + int recon_stride, + unsigned int *sse) { + unsigned int var; + int avg; + + variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16, + &var, &avg, vp9_get16x16var_sse2, 16); + *sse = var; + return (var - (((int64_t)avg * avg) >> 9)); +} +unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, + int recon_stride, + unsigned int *sse) { + unsigned int var; + int avg; + + variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32, + &var, &avg, vp9_get16x16var_sse2, 16); + *sse = var; + return (var - (((int64_t)avg * avg) >> 9)); +} + +unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, + int recon_stride, + unsigned int *sse) { + unsigned int var; + int avg; + + variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64, + &var, &avg, vp9_get16x16var_sse2, 16); + *sse = var; + return (var - (((int64_t)avg * avg) >> 12)); +} + +unsigned int vp9_variance64x32_sse2(const uint8_t 
*src_ptr, + int source_stride, + const uint8_t *ref_ptr, + int recon_stride, + unsigned int *sse) { + unsigned int var; + int avg; + + variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32, + &var, &avg, vp9_get16x16var_sse2, 16); + *sse = var; + return (var - (((int64_t)avg * avg) >> 11)); +} + +unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, + int recon_stride, + unsigned int *sse) { + unsigned int var; + int avg; + + variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64, + &var, &avg, vp9_get16x16var_sse2, 16); + *sse = var; + return (var - (((int64_t)avg * avg) >> 11)); } unsigned int vp9_sub_pixel_variance4x4_wmt diff --git a/vp9/encoder/x86/vp9_variance_ssse3.c b/vp9/encoder/x86/vp9_variance_ssse3.c index f95a5423c..882acad78 100644 --- a/vp9/encoder/x86/vp9_variance_ssse3.c +++ b/vp9/encoder/x86/vp9_variance_ssse3.c @@ -15,15 +15,6 @@ #define HALFNDX 8 -extern unsigned int vp9_get16x16var_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); extern void vp9_half_horiz_vert_variance16x_h_sse2 ( const unsigned char *ref_ptr, |
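A closing note on the vp9_variance_sse2.c rewrite above: every new wrapper routes through the variance_sse2() helper, which tiles the block with an existing 8x8 or 16x16 SSE2 kernel, accumulates SSE and Sum, and then applies the identity var = SSE - Sum^2 / N with N = width * height; the per-size shifts in the return statements (>> 5 for 8x4, >> 8 for 16x16, >> 12 for 64x64, and so on) are simply log2(N). The sketch below shows only that final step and is not part of the patch.

    #include <stdint.h>

    /* var = SSE - Sum^2 / N, with the division done as a shift by log2(N);
     * assumes w and h are powers of two, as all VP9 block sizes are. */
    static unsigned int variance_from_sse_sum(unsigned int sse, int sum,
                                              int w, int h) {
      int shift = 0, n = w * h;
      while (n > 1) {
        n >>= 1;
        ++shift;
      }
      return sse - (unsigned int)(((int64_t)sum * sum) >> shift);
    }

Using a 64-bit intermediate for sum * sum matters once blocks reach 32x32, where the squared sum can exceed 32 bits, which is why the larger sizes above cast to int64_t while the sub-16x16 ones keep the unsigned int product.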