1 files changed, 513 insertions, 409 deletions
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index aa0557735..862e72f24 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -46,6 +46,12 @@
 /* Factor to weigh the rate for switchable interp filters */
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
 
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+
+#define I4X4_PRED 0x8000
+#define SPLITMV 0x10000
+
 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   {ZEROMV,    LAST_FRAME,   NONE},
   {DC_PRED,   INTRA_FRAME,  NONE},
@@ -81,7 +87,7 @@ const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   {SPLITMV,   GOLDEN_FRAME, NONE},
   {SPLITMV,   ALTREF_FRAME, NONE},
 
-  {I4X4_PRED,    INTRA_FRAME,  NONE},
+  {I4X4_PRED, INTRA_FRAME,  NONE},
 
   /* compound prediction modes */
   {ZEROMV,    LAST_FRAME,   GOLDEN_FRAME},
@@ -105,11 +111,31 @@ const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   {SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME},
 };
 
+#if CONFIG_BALANCED_COEFTREE
+static void fill_token_costs(vp9_coeff_count *c,
+                             vp9_coeff_count *cnoskip,
+                             vp9_coeff_probs_model *p,
+                             TX_SIZE tx_size) {  // tx_size is not read here
+  int i, j, k, l;  // block type, ref type, coef band, prev-coef context
+  for (i = 0; i < BLOCK_TYPES; i++)
+    for (j = 0; j < REF_TYPES; j++)
+      for (k = 0; k < COEF_BANDS; k++)
+        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
+          vp9_prob probs[ENTROPY_NODES];
+          vp9_model_to_full_probs(p[i][j][k][l], probs);
+          vp9_cost_tokens((int *)cnoskip[i][j][k][l], probs,
+                          vp9_coef_tree);  // cnoskip: unmodified probs
+          // Replace the eob node prob with a very small value (1) so that the
+          // costs in c approximately equal the cost without the eob node.
+          probs[1] = 1;
+          vp9_cost_tokens((int *)c[i][j][k][l], probs, vp9_coef_tree);
+        }
+}
+#else
 static void fill_token_costs(vp9_coeff_count *c,
                              vp9_coeff_probs_model *p,
                              TX_SIZE tx_size) {
   int i, j, k, l;
-
   for (i = 0; i < BLOCK_TYPES; i++)
     for (j = 0; j < REF_TYPES; j++)
       for (k = 0; k < COEF_BANDS; k++)
@@ -120,6 +146,7 @@ static void fill_token_costs(vp9_coeff_count *c,
                                vp9_coef_tree);
         }
 }
+#endif
 
 static int rd_iifactor[32] =  { 4, 4, 3, 2, 1, 0, 0, 0,
                                 0, 0, 0, 0, 0, 0, 0, 0,
@@ -210,6 +237,20 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
     }
   }
 
+#if CONFIG_BALANCED_COEFTREE
+  fill_token_costs(cpi->mb.token_costs[TX_4X4],
+                   cpi->mb.token_costs_noskip[TX_4X4],
+                   cpi->common.fc.coef_probs_4x4, TX_4X4);
+  fill_token_costs(cpi->mb.token_costs[TX_8X8],
+                   cpi->mb.token_costs_noskip[TX_8X8],
+                   cpi->common.fc.coef_probs_8x8, TX_8X8);
+  fill_token_costs(cpi->mb.token_costs[TX_16X16],
+                   cpi->mb.token_costs_noskip[TX_16X16],
+                   cpi->common.fc.coef_probs_16x16, TX_16X16);
+  fill_token_costs(cpi->mb.token_costs[TX_32X32],
+                   cpi->mb.token_costs_noskip[TX_32X32],
+                   cpi->common.fc.coef_probs_32x32, TX_32X32);
+#else
   fill_token_costs(cpi->mb.token_costs[TX_4X4],
                    cpi->common.fc.coef_probs_4x4, TX_4X4);
   fill_token_costs(cpi->mb.token_costs[TX_8X8],
@@ -218,6 +259,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
                    cpi->common.fc.coef_probs_16x16, TX_16X16);
   fill_token_costs(cpi->mb.token_costs[TX_32X32],
                    cpi->common.fc.coef_probs_32x32, TX_32X32);
+#endif
 
   for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
     vp9_cost_tokens(cpi->mb.partition_cost[i],
@@ -225,7 +267,6 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
                     vp9_partition_tree);
 
   /*rough estimate for costing*/
-  cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
   vp9_init_mode_costs(cpi);
 
   if (cpi->common.frame_type != KEY_FRAME) {
@@ -271,7 +312,13 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
   TX_TYPE tx_type = DCT_DCT;
 
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
+#if CONFIG_BALANCED_COEFTREE
+  unsigned int (*token_costs_noskip)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
+      mb->token_costs_noskip[tx_size][type][ref];
+#else
   vp9_prob coef_probs[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+#endif
+
   int seg_eob, default_eob;
   uint8_t token_cache[1024];
   const uint8_t * band_translate;
@@ -291,8 +338,10 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
           get_tx_type_4x4(xd, block) : DCT_DCT;
       above_ec = A[0] != 0;
       left_ec = L[0] != 0;
+#if !CONFIG_BALANCED_COEFTREE
       vp9_model_to_full_probs_sb(cm->fc.coef_probs_4x4[type][ref],
                                  coef_probs);
+#endif
       seg_eob = 16;
       scan = get_scan_4x4(tx_type);
       band_translate = vp9_coefband_trans_4x4;
@@ -307,8 +356,10 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
       above_ec = (A[0] + A[1]) != 0;
       left_ec = (L[0] + L[1]) != 0;
       scan = get_scan_8x8(tx_type);
+#if !CONFIG_BALANCED_COEFTREE
       vp9_model_to_full_probs_sb(cm->fc.coef_probs_8x8[type][ref],
                                  coef_probs);
+#endif
       seg_eob = 64;
       band_translate = vp9_coefband_trans_8x8plus;
       break;
@@ -320,8 +371,10 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
       TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
           get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
       scan = get_scan_16x16(tx_type);
+#if !CONFIG_BALANCED_COEFTREE
       vp9_model_to_full_probs_sb(cm->fc.coef_probs_16x16[type][ref],
                                  coef_probs);
+#endif
       seg_eob = 256;
       above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
       left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
@@ -330,8 +383,10 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
     }
     case TX_32X32:
       scan = vp9_default_scan_32x32;
+#if !CONFIG_BALANCED_COEFTREE
       vp9_model_to_full_probs_sb(cm->fc.coef_probs_32x32[type][ref],
                                  coef_probs);
+#endif
       seg_eob = 1024;
       above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
       left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
@@ -362,18 +417,30 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
       if (c)
         pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
 
+#if CONFIG_BALANCED_COEFTREE
+      if (!c || token_cache[scan[c - 1]])  // do not skip eob
+        cost += token_costs_noskip[band][pt][t] + vp9_dct_value_cost_ptr[v];
+      else
+        cost += token_costs[band][pt][t] + vp9_dct_value_cost_ptr[v];
+#else
       cost += token_costs[band][pt][t] + vp9_dct_value_cost_ptr[v];
-
       if (!c || token_cache[scan[c - 1]])
         cost += vp9_cost_bit(coef_probs[band][pt][0], 1);
-      token_cache[scan[c]] = t;
+#endif
+      token_cache[scan[c]] = vp9_pt_energy_class[t];
     }
     if (c < seg_eob) {
       if (c)
         pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
+#if CONFIG_BALANCED_COEFTREE
+      cost += mb->token_costs_noskip[tx_size][type][ref]
+          [get_coef_band(band_translate, c)]
+          [pt][DCT_EOB_TOKEN];
+#else
       cost += mb->token_costs[tx_size][type][ref]
           [get_coef_band(band_translate, c)]
           [pt][DCT_EOB_TOKEN];
+#endif
     }
   }
 
@@ -556,9 +623,25 @@ static void super_block_yrd(VP9_COMP *cpi,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   VP9_COMMON *const cm = &cpi->common;
   int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
   vp9_subtract_sby(x, bs);
 
+  if (cpi->speed > 4) {
+    if (bs >= BLOCK_SIZE_SB32X32) {
+      mbmi->txfm_size = TX_32X32;
+    } else if (bs >= BLOCK_SIZE_MB16X16) {
+      mbmi->txfm_size = TX_16X16;
+    } else if (bs >= BLOCK_SIZE_SB8X8) {
+      mbmi->txfm_size = TX_8X8;
+    } else {
+      mbmi->txfm_size = TX_4X4;
+    }
+    super_block_yrd_for_txfm(cm, x, rate, distortion, skip, bs,
+                             mbmi->txfm_size);
+    return;
+  }
   if (bs >= BLOCK_SIZE_SB32X32)
     super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
                              bs, TX_32X32);
@@ -611,11 +694,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
     int64_t this_rd;
     int ratey = 0;
 
-    xd->mode_info_context->bmi[ib].as_mode.first = mode;
-    if (cm->frame_type == KEY_FRAME)
-      rate = bmode_costs[mode];
-    else
-      rate = x->mbmode_cost[cm->frame_type][mode];
+    rate = bmode_costs[mode];
     distortion = 0;
 
     vpx_memcpy(tempa, ta, sizeof(ta));
@@ -653,9 +732,6 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         distortion += vp9_block_error(coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff,
                                                          block, 16), 16) >> 2;
 
-        vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, mode,
-                             dst, xd->plane[0].dst.stride);
-
         if (best_tx_type != DCT_DCT)
           vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
                                dst, xd->plane[0].dst.stride, best_tx_type);
@@ -726,16 +802,15 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
   int64_t total_rd = 0;
   ENTROPY_CONTEXT t_above[4], t_left[4];
   int *bmode_costs;
+  MODE_INFO *const mic = xd->mode_info_context;
 
   vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
   vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
 
-  xd->mode_info_context->mbmi.mode = I4X4_PRED;
-  bmode_costs = mb->inter_bmode_costs;
+  bmode_costs = mb->mbmode_cost;
 
   for (idy = 0; idy < 2; idy += bh) {
     for (idx = 0; idx < 2; idx += bw) {
-      MODE_INFO *const mic = xd->mode_info_context;
       const int mis = xd->mode_info_stride;
       MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
       int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry);
@@ -747,7 +822,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
         const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
                                      left_block_mode(mic, i) : DC_PRED;
 
-        bmode_costs  = mb->bmode_costs[A][L];
+        bmode_costs  = mb->y_mode_costs[A][L];
       }
 
       total_rd += rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
@@ -774,6 +849,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
   *Rate = cost;
   *rate_y = tot_rate_y;
   *Distortion = distortion;
+  xd->mode_info_context->mbmi.mode = mic->bmi[3].as_mode.first;
 
   return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
 }
@@ -785,12 +861,13 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                       int64_t txfm_cache[NB_TXFM_MODES]) {
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  MACROBLOCKD *xd = &x->e_mbd;
+  MACROBLOCKD *const xd = &x->e_mbd;
   int this_rate, this_rate_tokenonly;
   int this_distortion, s;
   int64_t best_rd = INT64_MAX, this_rd;
   TX_SIZE UNINITIALIZED_IS_SAFE(best_tx);
   int i;
+  int *bmode_costs = x->mbmode_cost;
 
   if (bsize < BLOCK_SIZE_SB8X8) {
     x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4;
@@ -805,17 +882,19 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
     int64_t local_txfm_cache[NB_TXFM_MODES];
     MODE_INFO *const mic = xd->mode_info_context;
     const int mis = xd->mode_info_stride;
-    const MB_PREDICTION_MODE A = above_block_mode(mic, 0, mis);
-    const MB_PREDICTION_MODE L = xd->left_available ?
-                                 left_block_mode(mic, 0) : DC_PRED;
-
-    int *bmode_costs  = x->bmode_costs[A][L];
+    if (cpi->common.frame_type == KEY_FRAME) {
+      const MB_PREDICTION_MODE A = above_block_mode(mic, 0, mis);
+      const MB_PREDICTION_MODE L = xd->left_available ?
+                                   left_block_mode(mic, 0) : DC_PRED;
 
+      bmode_costs = x->y_mode_costs[A][L];
+    }
     x->e_mbd.mode_info_context->mbmi.mode = mode;
     vp9_build_intra_predictors_sby_s(&x->e_mbd, bsize);
 
     super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
                     bsize, local_txfm_cache);
+
     this_rate = this_rate_tokenonly + bmode_costs[mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
@@ -925,10 +1004,10 @@ int vp9_cost_mv_ref(VP9_COMP *cpi,
     VP9_COMMON *pc = &cpi->common;
 
     vp9_prob p [VP9_MVREFS - 1];
-    assert(NEARESTMV <= m  &&  m <= SPLITMV);
+    assert(NEARESTMV <= m  &&  m <= NEWMV);
     vp9_mv_ref_probs(pc, p, mode_context);
-    return cost_token(vp9_mv_ref_tree, p,
-                      vp9_mv_ref_encoding_array - NEARESTMV + m);
+    return cost_token(vp9_sb_mv_ref_tree, p,
+                      vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
   } else
     return 0;
 }
@@ -938,19 +1017,18 @@ void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
   x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int;
 }
 
-static int labels2mode(MACROBLOCK *x,
-                       int const *labelings, int which_label,
+static int labels2mode(MACROBLOCK *x, int i,
                        MB_PREDICTION_MODE this_mode,
                        int_mv *this_mv, int_mv *this_second_mv,
                        int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
-                       int_mv seg_mvs[MAX_REF_FRAMES - 1],
+                       int_mv seg_mvs[MAX_REF_FRAMES],
                        int_mv *best_ref_mv,
                        int_mv *second_best_ref_mv,
                        int *mvjcost, int *mvcost[2], VP9_COMP *cpi) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mic = xd->mode_info_context;
   MB_MODE_INFO * mbmi = &mic->mbmi;
-  int i, cost = 0, thismvcost = 0;
+  int cost = 0, thismvcost = 0;
   int idx, idy;
   int bw = 1 << b_width_log2(mbmi->sb_type);
   int bh = 1 << b_height_log2(mbmi->sb_type);
@@ -958,72 +1036,61 @@ static int labels2mode(MACROBLOCK *x,
   /* We have to be careful retrieving previously-encoded motion vectors.
    Ones from this macroblock have to be pulled from the BLOCKD array
    as they have not yet made it to the bmi array in our MB_MODE_INFO. */
-  for (i = 0; i < 4; ++i) {
-    MB_PREDICTION_MODE m;
-
-    if (labelings[i] != which_label)
-      continue;
+  MB_PREDICTION_MODE m;
 
-    {
-      // the only time we should do costing for new motion vector or mode
-      // is when we are on a new label  (jbb May 08, 2007)
-      switch (m = this_mode) {
-        case NEWMV:
-          if (mbmi->second_ref_frame > 0) {
-            this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
-            this_second_mv->as_int =
-            seg_mvs[mbmi->second_ref_frame - 1].as_int;
-          }
-
-          thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
-                                        102, xd->allow_high_precision_mv);
-          if (mbmi->second_ref_frame > 0) {
-            thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
-                                          mvjcost, mvcost, 102,
-                                          xd->allow_high_precision_mv);
-          }
-          break;
-        case NEARESTMV:
-          this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame].as_int;
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int =
-                frame_mv[NEARESTMV][mbmi->second_ref_frame].as_int;
-          break;
-        case NEARMV:
-          this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame].as_int;
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int =
-                frame_mv[NEARMV][mbmi->second_ref_frame].as_int;
-          break;
-        case ZEROMV:
-          this_mv->as_int = 0;
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = 0;
-          break;
-        default:
-          break;
+  // Derive the mv(s) for block i from this_mode; only NEWMV carries an
+  // explicit mv bit cost.  (was per-label logic, jbb May 08, 2007)
+  switch (m = this_mode) {
+    case NEWMV:
+      this_mv->as_int = seg_mvs[mbmi->ref_frame].as_int;
+      thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
+                                    102, xd->allow_high_precision_mv);
+      if (mbmi->second_ref_frame > 0) {
+        this_second_mv->as_int = seg_mvs[mbmi->second_ref_frame].as_int;
+        thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
+                                      mvjcost, mvcost, 102,
+                                      xd->allow_high_precision_mv);
       }
+      break;
+    case NEARESTMV:
+      this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame].as_int;
+      if (mbmi->second_ref_frame > 0)
+        this_second_mv->as_int =
+            frame_mv[NEARESTMV][mbmi->second_ref_frame].as_int;
+      break;
+    case NEARMV:
+      this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame].as_int;
+      if (mbmi->second_ref_frame > 0)
+        this_second_mv->as_int =
+            frame_mv[NEARMV][mbmi->second_ref_frame].as_int;
+      break;
+    case ZEROMV:
+      this_mv->as_int = 0;
+      if (mbmi->second_ref_frame > 0)
+        this_second_mv->as_int = 0;
+      break;
+    default:
+      break;
+  }
 
-      cost = vp9_cost_mv_ref(cpi, this_mode,
-                             mbmi->mb_mode_context[mbmi->ref_frame]);
-    }
+  cost = vp9_cost_mv_ref(cpi, this_mode,
+                         mbmi->mb_mode_context[mbmi->ref_frame]);
 
-    mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame > 0)
-      mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
+  mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
+  if (mbmi->second_ref_frame > 0)
+    mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
 
-    x->partition_info->bmi[i].mode = m;
-    x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame > 0)
-      x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
-    for (idy = 0; idy < bh; ++idy) {
-      for (idx = 0; idx < bw; ++idx) {
-        vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
-                   &mic->bmi[i], sizeof(mic->bmi[i]));
-        vpx_memcpy(&x->partition_info->bmi[i + idy * 2 + idx],
-                   &x->partition_info->bmi[i],
-                   sizeof(x->partition_info->bmi[i]));
-      }
+  x->partition_info->bmi[i].mode = m;
+  x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
+  if (mbmi->second_ref_frame > 0)
+    x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
+  for (idy = 0; idy < bh; ++idy) {
+    for (idx = 0; idx < bw; ++idx) {
+      vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
+                 &mic->bmi[i], sizeof(mic->bmi[i]));
+      vpx_memcpy(&x->partition_info->bmi[i + idy * 2 + idx],
+                 &x->partition_info->bmi[i],
+                 sizeof(x->partition_info->bmi[i]));
     }
   }
 
@@ -1033,90 +1100,86 @@ static int labels2mode(MACROBLOCK *x,
 
 static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
                                        MACROBLOCK *x,
-                                       int const *labels,
-                                       int which_label,
+                                       int i,
                                        int *labelyrate,
                                        int *distortion,
                                        ENTROPY_CONTEXT *ta,
                                        ENTROPY_CONTEXT *tl) {
-  int i, k;
+  int k;
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
   int bwl = b_width_log2(bsize), bw = 1 << bwl;
   int bhl = b_height_log2(bsize), bh = 1 << bhl;
   int idx, idy;
+  const int src_stride = x->plane[0].src.stride;
+  uint8_t* const src =
+  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            x->plane[0].src.buf, src_stride);
+  int16_t* src_diff =
+  raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            x->plane[0].src_diff);
+  int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
+  uint8_t* const pre =
+  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            xd->plane[0].pre[0].buf,
+                            xd->plane[0].pre[0].stride);
+  uint8_t* const dst =
+  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            xd->plane[0].dst.buf,
+                            xd->plane[0].dst.stride);
+  int thisdistortion = 0;
+  int thisrate = 0;
 
   *labelyrate = 0;
   *distortion = 0;
-  for (i = 0; i < 4; i++) {
-    if (labels[i] == which_label) {
-      const int src_stride = x->plane[0].src.stride;
-      uint8_t* const src =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                                x->plane[0].src.buf, src_stride);
-      int16_t* src_diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
-                                x->plane[0].src_diff);
-      int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
-      uint8_t* const pre =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                                xd->plane[0].pre[0].buf,
-                                xd->plane[0].pre[0].stride);
-      uint8_t* const dst =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                                xd->plane[0].dst.buf,
-                                xd->plane[0].dst.stride);
-      int thisdistortion = 0;
-      int thisrate = 0;
-
-      vp9_build_inter_predictor(pre,
-                                xd->plane[0].pre[0].stride,
-                                dst,
-                                xd->plane[0].dst.stride,
-                                &xd->mode_info_context->bmi[i].as_mv[0],
-                                &xd->scale_factor[0],
-                                4 * bw, 4 * bh, 0 /* no avg */, &xd->subpix);
-
-      // TODO(debargha): Make this work properly with the
-      // implicit-compoundinter-weight experiment when implicit
-      // weighting for splitmv modes is turned on.
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        uint8_t* const second_pre =
-        raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                                  xd->plane[0].pre[1].buf,
-                                  xd->plane[0].pre[1].stride);
-        vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride,
-                                  dst, xd->plane[0].dst.stride,
-                                  &xd->mode_info_context->bmi[i].as_mv[1],
-                                  &xd->scale_factor[1], 4 * bw, 4 * bh, 1,
-                                  &xd->subpix);
-      }
 
-      vp9_subtract_block(4 * bh, 4 * bw, src_diff, 8,
-                         src, src_stride,
-                         dst, xd->plane[0].dst.stride);
+  vp9_build_inter_predictor(pre,
+                            xd->plane[0].pre[0].stride,
+                            dst,
+                            xd->plane[0].dst.stride,
+                            &xd->mode_info_context->bmi[i].as_mv[0],
+                            &xd->scale_factor[0],
+                            4 * bw, 4 * bh, 0 /* no avg */, &xd->subpix);
+
+  // TODO(debargha): Make this work properly with the
+  // implicit-compoundinter-weight experiment when implicit
+  // weighting for splitmv modes is turned on.
+  if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
+    uint8_t* const second_pre =
+    raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                              xd->plane[0].pre[1].buf,
+                              xd->plane[0].pre[1].stride);
+    vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride,
+                              dst, xd->plane[0].dst.stride,
+                              &xd->mode_info_context->bmi[i].as_mv[1],
+                              &xd->scale_factor[1], 4 * bw, 4 * bh, 1,
+                              &xd->subpix);
+  }
 
-      k = i;
-      for (idy = 0; idy < bh; ++idy) {
-        for (idx = 0; idx < bw; ++idx) {
-          k += (idy * 2 + idx);
-          src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
-                                               x->plane[0].src_diff);
-          coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, k);
-          x->fwd_txm4x4(src_diff, coeff, 16);
-          x->quantize_b_4x4(x, k, DCT_DCT, 16);
-          thisdistortion += vp9_block_error(coeff,
-                                            BLOCK_OFFSET(xd->plane[0].dqcoeff,
-                                                         k, 16), 16);
-          thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
-                                  ta + (k & 1),
-                                  tl + (k >> 1), TX_4X4, 16);
-        }
-      }
-      *distortion += thisdistortion;
-      *labelyrate += thisrate;
+  vp9_subtract_block(4 * bh, 4 * bw, src_diff, 8,
+                     src, src_stride,
+                     dst, xd->plane[0].dst.stride);
+
+  k = i;
+  for (idy = 0; idy < bh; ++idy) {
+    for (idx = 0; idx < bw; ++idx) {
+      k += (idy * 2 + idx);
+      src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
+                                           x->plane[0].src_diff);
+      coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, k);
+      x->fwd_txm4x4(src_diff, coeff, 16);
+      x->quantize_b_4x4(x, k, DCT_DCT, 16);
+      thisdistortion += vp9_block_error(coeff,
+                                        BLOCK_OFFSET(xd->plane[0].dqcoeff,
+                                                     k, 16), 16);
+      thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
+                              ta + (k & 1),
+                              tl + (k >> 1), TX_4X4, 16);
     }
   }
+  *distortion += thisdistortion;
+  *labelyrate += thisrate;
+
   *distortion >>= 2;
   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
 }
@@ -1188,11 +1251,45 @@ static enum BlockSize get_block_size(int bw, int bh) {
   return -1;
 }
 
+/* Point x's luma source and prediction buffers at raster sub-block i
+ * (BLOCK_SIZE_SB8X8 indexing); mi_buf_restore() puts saved copies back. */
+static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct buf_2d *const src = &x->plane[0].src;
+  struct buf_2d *const pre = xd->plane[0].pre;
+  src->buf = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                       src->buf, src->stride);
+  assert(((intptr_t)pre[0].buf & 0x7) == 0);
+  pre[0].buf = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                         pre[0].buf, pre[0].stride);
+  /* The rest of this file treats only second_ref_frame > 0 as a second
+   * reference; a bare truth test would also shift pre[1] for negatives. */
+  if (xd->mode_info_context->mbmi.second_ref_frame > 0)
+    pre[1].buf = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                           pre[1].buf, pre[1].stride);
+}
+
+/* Undo mi_buf_shift() from saved copies; pre[1] only if second ref > 0. */
+static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
+                                  struct buf_2d orig_pre[2]) {
+  x->plane[0].src = orig_src;
+  x->e_mbd.plane[0].pre[0] = orig_pre[0];
+  if (x->e_mbd.mode_info_context->mbmi.second_ref_frame > 0)
+    x->e_mbd.plane[0].pre[1] = orig_pre[1];
+}
+
+static void iterative_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE_TYPE bsize,
+                                    int_mv *frame_mv,
+                                    YV12_BUFFER_CONFIG **scaled_ref_frame,
+                                    int mi_row, int mi_col,
+                                    int_mv single_newmv[MAX_REF_FRAMES]);
+
 static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                     BEST_SEG_INFO *bsi,
-                                    int_mv seg_mvs[4][MAX_REF_FRAMES - 1]) {
+                                    int_mv seg_mvs[4][MAX_REF_FRAMES],
+                                    int mi_row, int mi_col) {
   int i, j;
-  static const int labels[4] = { 0, 1, 2, 3 };
   int br = 0, bd = 0;
   MB_PREDICTION_MODE this_mode;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
@@ -1208,7 +1305,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   int bhl = b_height_log2(bsize), bh = 1 << bhl;
   int idx, idy;
   vp9_variance_fn_ptr_t *v_fn_ptr;
-
+  YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
   ENTROPY_CONTEXT t_above[4], t_left[4];
   ENTROPY_CONTEXT t_above_b[4], t_left_b[4];
 
@@ -1255,18 +1352,21 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
         int distortion;
         int labelyrate;
         ENTROPY_CONTEXT t_above_s[4], t_left_s[4];
+        const struct buf_2d orig_src = x->plane[0].src;
+        struct buf_2d orig_pre[2];
+
+        vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre));
 
         vpx_memcpy(t_above_s, t_above, sizeof(t_above_s));
         vpx_memcpy(t_left_s, t_left, sizeof(t_left_s));
 
         // motion search for newmv (single predictor case only)
         if (mbmi->second_ref_frame <= 0 && this_mode == NEWMV) {
-          int sseshift, n;
           int step_param = 0;
           int further_steps;
           int thissme, bestsme = INT_MAX;
-          const struct buf_2d orig_src = x->plane[0].src;
-          const struct buf_2d orig_pre = x->e_mbd.plane[0].pre[0];
+          int sadpb = x->sadperbit4;
+          int_mv mvp_full;
 
           /* Is the best so far sufficiently good that we cant justify doing
            * and new motion search. */
@@ -1287,55 +1387,35 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
 
           further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
 
-          {
-            int sadpb = x->sadperbit4;
-            int_mv mvp_full;
-
-            mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
-            mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
-
-            // find first label
-            n = i;
-
-            // adjust src pointer for this segment
-            x->plane[0].src.buf =
-            raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
-                                      x->plane[0].src.buf,
-                                      x->plane[0].src.stride);
-            assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0x7) == 0);
-            x->e_mbd.plane[0].pre[0].buf =
-            raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
-                                      x->e_mbd.plane[0].pre[0].buf,
-                                      x->e_mbd.plane[0].pre[0].stride);
-
-            bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
-                                             sadpb, further_steps, 0, v_fn_ptr,
-                                             bsi->ref_mv, &mode_mv[NEWMV]);
-
-            sseshift = 0;
-
-            // Should we do a full search (best quality only)
-            if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
-              /* Check if mvp_full is within the range. */
-              clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
-                       x->mv_row_min, x->mv_row_max);
-
-              thissme = cpi->full_search_sad(x, &mvp_full,
-                                             sadpb, 16, v_fn_ptr,
-                                             x->nmvjointcost, x->mvcost,
-                                             bsi->ref_mv,
-                                             n);
-
-              if (thissme < bestsme) {
-                bestsme = thissme;
-                mode_mv[NEWMV].as_int =
-                x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int;
-              } else {
-                /* The full search result is actually worse so re-instate the
-                 * previous best vector */
-                x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int =
-                mode_mv[NEWMV].as_int;
-              }
+          mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
+          mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
+
+          // adjust src pointer for this block
+          mi_buf_shift(x, i);
+          bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
+                                           sadpb, further_steps, 0, v_fn_ptr,
+                                           bsi->ref_mv, &mode_mv[NEWMV]);
+
+          // Should we do a full search (best quality only)
+          if (cpi->compressor_speed == 0) {
+            /* Check if mvp_full is within the range. */
+            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
+                     x->mv_row_min, x->mv_row_max);
+
+            thissme = cpi->full_search_sad(x, &mvp_full,
+                                           sadpb, 16, v_fn_ptr,
+                                           x->nmvjointcost, x->mvcost,
+                                           bsi->ref_mv, i);
+
+            if (thissme < bestsme) {
+              bestsme = thissme;
+              mode_mv[NEWMV].as_int =
+                  x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int;
+            } else {
+              /* The full search result is actually worse so re-instate the
+               * previous best vector */
+              x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int =
+                  mode_mv[NEWMV].as_int;
             }
           }
 
@@ -1348,23 +1428,32 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                          &distortion, &sse);
 
             // safe motion search result for use in compound prediction
-            seg_mvs[i][mbmi->ref_frame - 1].as_int = mode_mv[NEWMV].as_int;
+            seg_mvs[i][mbmi->ref_frame].as_int = mode_mv[NEWMV].as_int;
           }
 
           // restore src pointers
-          x->plane[0].src = orig_src;
-          x->e_mbd.plane[0].pre[0] = orig_pre;
+          mi_buf_restore(x, orig_src, orig_pre);
         } else if (mbmi->second_ref_frame > 0 && this_mode == NEWMV) {
-          /* NEW4X4 */
-          /* motion search not completed? Then skip newmv for this block with
-           * comppred */
-          if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
-              seg_mvs[i][mbmi->ref_frame        - 1].as_int == INVALID_MV) {
+          if (seg_mvs[i][mbmi->second_ref_frame].as_int == INVALID_MV ||
+              seg_mvs[i][mbmi->ref_frame       ].as_int == INVALID_MV)
             continue;
+
+          // adjust src pointers
+          mi_buf_shift(x, i);
+          if (cpi->sf.comp_inter_joint_search) {
+            iterative_motion_search(cpi, x, bsize, frame_mv[this_mode],
+                                    scaled_ref_frame,
+                                    mi_row, mi_col, seg_mvs[i]);
+            seg_mvs[i][mbmi->ref_frame].as_int =
+                frame_mv[this_mode][mbmi->ref_frame].as_int;
+            seg_mvs[i][mbmi->second_ref_frame].as_int =
+                frame_mv[this_mode][mbmi->second_ref_frame].as_int;
           }
+          // restore src pointers
+          mi_buf_restore(x, orig_src, orig_pre);
         }
 
-        rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
+        rate = labels2mode(x, i, this_mode, &mode_mv[this_mode],
                            &second_mode_mv[this_mode], frame_mv, seg_mvs[i],
                            bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
                            x->mvcost, cpi);
@@ -1381,7 +1470,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
           continue;
 
         this_rd = encode_inter_mb_segment(&cpi->common,
-                                          x, labels, i, &labelyrate,
+                                          x, i, &labelyrate,
                                           &distortion, t_above_s, t_left_s);
         this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
         rate += labelyrate;
@@ -1392,10 +1481,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
           bestlabelyrate = labelyrate;
           mode_selected = this_mode;
           best_label_rd = this_rd;
-          for (j = 0; j < 4; j++)
-            if (labels[j] == i)
-              best_eobs[j] = x->e_mbd.plane[0].eobs[j];
-
+          best_eobs[i] = x->e_mbd.plane[0].eobs[i];
           vpx_memcpy(t_above_b, t_above_s, sizeof(t_above_s));
           vpx_memcpy(t_left_b, t_left_s, sizeof(t_left_s));
         }
@@ -1404,7 +1490,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
       vpx_memcpy(t_above, t_above_b, sizeof(t_above));
       vpx_memcpy(t_left, t_left_b, sizeof(t_left));
 
-      labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
+      labels2mode(x, i, mode_selected, &mode_mv[mode_selected],
                   &second_mode_mv[mode_selected], frame_mv, seg_mvs[i],
                   bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
                   x->mvcost, cpi);
@@ -1443,12 +1529,6 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   }
 }
 
-static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x,
-                             BEST_SEG_INFO *bsi,
-                             int_mv seg_mvs[4][MAX_REF_FRAMES - 1]) {
-  rd_check_segment_txsize(cpi, x, bsi, seg_mvs);
-}
-
 static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
                                        int_mv *best_ref_mv,
                                        int_mv *second_best_ref_mv,
@@ -1457,7 +1537,8 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
                                        int *returnyrate,
                                        int *returndistortion,
                                        int *skippable, int mvthresh,
-                                       int_mv seg_mvs[4][MAX_REF_FRAMES - 1]) {
+                                       int_mv seg_mvs[4][MAX_REF_FRAMES],
+                                       int mi_row, int mi_col) {
   int i;
   BEST_SEG_INFO bsi;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
@@ -1473,7 +1554,7 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
   for (i = 0; i < 4; i++)
     bsi.modes[i] = ZEROMV;
 
-  rd_check_segment(cpi, x, &bsi, seg_mvs);
+  rd_check_segment_txsize(cpi, x, &bsi, seg_mvs, mi_row, mi_col);
 
   /* set it to the best */
   for (i = 0; i < 4; i++) {
@@ -1504,6 +1585,7 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
   *returndistortion = bsi.d;
   *returnyrate = bsi.segment_yrate;
   *skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_SB8X8);
+  mbmi->mode = bsi.modes[3];
 
   return (int)(bsi.segment_rd);
 }
@@ -1878,6 +1960,154 @@ static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) {
   return SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
 }
 
+static void iterative_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE_TYPE bsize,
+                                    int_mv *frame_mv,  // refined in place
+                                    YV12_BUFFER_CONFIG **scaled_ref_frame,
+                                    int mi_row, int mi_col,
+                                    int_mv single_newmv[MAX_REF_FRAMES]) {
+  int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  int refs[2] = { mbmi->ref_frame,
+                  (mbmi->second_ref_frame < 0 ? 0 : mbmi->second_ref_frame) };
+  int_mv ref_mv[2];
+  const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
+  int ite;  // joint-search pass counter
+  // pw x ph prediction from the reference not currently being searched.
+  uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
+  // NOTE(review): second_pred is used below without a NULL check.
+  // Joint search: refine each compound MV in turn against the other's pred.
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+  struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
+  struct buf_2d scaled_first_yv12;  // pre[0] as set up before the loop
+  int last_besterr[2] = {INT_MAX, INT_MAX};  // best error so far, per ref
+  // MV predictors: first candidate in each reference's ref_mvs list.
+  ref_mv[0] = mbmi->ref_mvs[refs[0]][0];
+  ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
+
+  if (scaled_ref_frame[0]) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+    setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
+                     NULL, NULL);
+  }
+
+  if (scaled_ref_frame[1]) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_second_yv12[i] = xd->plane[i].pre[1];
+    // NOTE(review): same argument position as the ref 0 call; verify pre[1].
+    setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
+                     NULL, NULL);
+  }
+
+  xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
+                                          mi_row, mi_col);
+  xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
+                                          mi_row, mi_col);
+  scaled_first_yv12 = xd->plane[0].pre[0];
+
+  // Initialize mv using single prediction mode result.
+  frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+  frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+  // Run up to four passes, alternating between the two reference frames,
+  // and break out of the loop as soon as a pass fails to find a better mv.
+  for (ite = 0; ite < 4; ite++) {
+    struct buf_2d ref_yv12[2];
+    int bestsme = INT_MAX;
+    int sadpb = x->sadperbit16;
+    int_mv tmp_mv;
+    int search_range = 3;
+    // Saved so the MV search limits can be restored after this pass.
+    int tmp_col_min = x->mv_col_min;
+    int tmp_col_max = x->mv_col_max;
+    int tmp_row_min = x->mv_row_min;
+    int tmp_row_max = x->mv_row_max;
+    int id = ite % 2;  // reference searched this pass (0 or 1)
+
+    // Initialized here because of compiler problem in Visual Studio.
+    ref_yv12[0] = xd->plane[0].pre[0];
+    ref_yv12[1] = xd->plane[0].pre[1];
+
+    // Build the prediction from the other reference (!id) into second_pred.
+    vp9_build_inter_predictor(ref_yv12[!id].buf,
+                              ref_yv12[!id].stride,
+                              second_pred, pw,
+                              &frame_mv[refs[!id]],
+                              &xd->scale_factor[!id],
+                              pw, ph, 0,
+                              &xd->subpix);
+
+    // Search reference id; for id == 1, temporarily route it through pre[0].
+    if (id)
+      xd->plane[0].pre[0] = ref_yv12[id];
+    vp9_clamp_mv_min_max(x, &ref_mv[id]);
+
+    // Start from the current best mv for this reference.
+    tmp_mv.as_int = frame_mv[refs[id]].as_int;
+    // Reduce to full-pixel units (>> 3) for the full-pixel search below.
+    tmp_mv.as_mv.col >>= 3;
+    tmp_mv.as_mv.row >>= 3;
+
+    // Small-range full-pixel motion search
+    bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
+                                       search_range,
+                                       &cpi->fn_ptr[block_size],
+                                       x->nmvjointcost, x->mvcost,
+                                       &ref_mv[id], second_pred,
+                                       pw, ph);
+    // Put back the MV limits saved at the top of this pass.
+    x->mv_col_min = tmp_col_min;
+    x->mv_col_max = tmp_col_max;
+    x->mv_row_min = tmp_row_min;
+    x->mv_row_max = tmp_row_max;
+
+    if (bestsme < INT_MAX) {
+      int dis; /* TODO: use dis in distortion calculation later. */
+      unsigned int sse;
+      // Sub-pixel refinement of tmp_mv against the same second_pred.
+      bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv,
+                                             &ref_mv[id],
+                                             x->errorperbit,
+                                             &cpi->fn_ptr[block_size],
+                                             x->nmvjointcost, x->mvcost,
+                                             &dis, &sse, second_pred,
+                                             pw, ph);
+    }
+    // Undo the pre[0] substitution made for id == 1.
+    if (id)
+      xd->plane[0].pre[0] = scaled_first_yv12;
+
+    if (bestsme < last_besterr[id]) {
+      frame_mv[refs[id]].as_int = tmp_mv.as_int;
+      last_besterr[id] = bestsme;
+    } else {
+      break;  // no improvement for this reference: stop iterating
+    }
+  }
+
+  // Restore the predictor planes backed up before the scaling swap.
+  if (scaled_ref_frame[0]) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+
+  if (scaled_ref_frame[1]) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[1] = backup_second_yv12[i];
+  }
+
+  vpx_free(second_pred);
+}
+
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE_TYPE bsize,
                                  int64_t txfm_cache[],
@@ -1920,145 +2150,13 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
 
       if (is_comp_pred) {
-        if (cpi->sf.comp_inter_joint_serach) {
-          int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
-          int ite;
-          // Prediction buffer from second frame.
-          uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
-
-          // Do joint motion search in compound mode to get more accurate mv.
-          struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
-          struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
-          struct buf_2d scaled_first_yv12;
-          int last_besterr[2] = {INT_MAX, INT_MAX};
-
-          if (scaled_ref_frame[0]) {
-            int i;
-
-            // Swap out the reference frame for a version that's been scaled to
-            // match the resolution of the current frame, allowing the existing
-            // motion search code to be used without additional modifications.
-            for (i = 0; i < MAX_MB_PLANE; i++)
-              backup_yv12[i] = xd->plane[i].pre[0];
-
-            setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
-                             NULL, NULL);
-          }
-
-          if (scaled_ref_frame[1]) {
-            int i;
-
-            for (i = 0; i < MAX_MB_PLANE; i++)
-              backup_second_yv12[i] = xd->plane[i].pre[1];
-
-            setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
-                             NULL, NULL);
-          }
-          xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
-                                                  mi_row, mi_col);
-          xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
-                                                  mi_row, mi_col);
-
-          scaled_first_yv12 = xd->plane[0].pre[0];
-
-          // Initialize mv using single prediction mode result.
-          frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
-          frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
-
-          // Allow joint search multiple times iteratively for each ref frame
-          // and break out the search loop if it couldn't find better mv.
-          for (ite = 0; ite < 4; ite++) {
-            struct buf_2d ref_yv12[2];
-            int bestsme = INT_MAX;
-            int sadpb = x->sadperbit16;
-            int_mv tmp_mv;
-            int search_range = 3;
-
-            int tmp_col_min = x->mv_col_min;
-            int tmp_col_max = x->mv_col_max;
-            int tmp_row_min = x->mv_row_min;
-            int tmp_row_max = x->mv_row_max;
-            int id = ite % 2;
-
-            // Initialized here because of compiler problem in Visual Studio.
-            ref_yv12[0] = xd->plane[0].pre[0];
-            ref_yv12[1] = xd->plane[0].pre[1];
-
-            // Get pred block from second frame.
-            vp9_build_inter_predictor(ref_yv12[!id].buf,
-                                      ref_yv12[!id].stride,
-                                      second_pred, pw,
-                                      &frame_mv[refs[!id]],
-                                      &xd->scale_factor[!id],
-                                      pw, ph, 0,
-                                      &xd->subpix);
-
-            // Compound motion search on first ref frame.
-            if (id)
-              xd->plane[0].pre[0] = ref_yv12[id];
-            vp9_clamp_mv_min_max(x, &ref_mv[id]);
-
-            // Use mv result from single mode as mvp.
-            tmp_mv.as_int = frame_mv[refs[id]].as_int;
-
-            tmp_mv.as_mv.col >>= 3;
-            tmp_mv.as_mv.row >>= 3;
-
-            // Small-range full-pixel motion search
-            bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
-                                               search_range,
-                                               &cpi->fn_ptr[block_size],
-                                               x->nmvjointcost, x->mvcost,
-                                               &ref_mv[id], second_pred,
-                                               pw, ph);
-
-            x->mv_col_min = tmp_col_min;
-            x->mv_col_max = tmp_col_max;
-            x->mv_row_min = tmp_row_min;
-            x->mv_row_max = tmp_row_max;
-
-            if (bestsme < INT_MAX) {
-              int dis; /* TODO: use dis in distortion calculation later. */
-              unsigned int sse;
-
-              bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv,
-                                                     &ref_mv[id],
-                                                     x->errorperbit,
-                                                     &cpi->fn_ptr[block_size],
-                                                     x->nmvjointcost, x->mvcost,
-                                                     &dis, &sse, second_pred,
-                                                     pw, ph);
-            }
-
-            if (id)
-              xd->plane[0].pre[0] = scaled_first_yv12;
-
-            if (bestsme < last_besterr[id]) {
-              frame_mv[refs[id]].as_int =
-                  xd->mode_info_context->bmi[0].as_mv[1].as_int = tmp_mv.as_int;
-              last_besterr[id] = bestsme;
-            } else {
-              break;
-            }
-          }
-
-          // restore the predictor
-          if (scaled_ref_frame[0]) {
-            int i;
-
-            for (i = 0; i < MAX_MB_PLANE; i++)
-              xd->plane[i].pre[0] = backup_yv12[i];
-          }
+        // Initialize mv using single prediction mode result.
+        frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+        frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
 
-          if (scaled_ref_frame[1]) {
-            int i;
-
-            for (i = 0; i < MAX_MB_PLANE; i++)
-              xd->plane[i].pre[1] = backup_second_yv12[i];
-          }
-
-          vpx_free(second_pred);
-        }
+        if (cpi->sf.comp_inter_joint_search)
+          iterative_motion_search(cpi, x, bsize, frame_mv, scaled_ref_frame,
+                                  mi_row, mi_col, single_newmv);
 
         if (frame_mv[refs[0]].as_int == INVALID_MV ||
             frame_mv[refs[1]].as_int == INVALID_MV)
@@ -2134,8 +2232,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                        x->nmvjointcost, x->mvcost,
                                        &dis, &sse);
         }
-        frame_mv[refs[0]].as_int =
-          xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+        frame_mv[refs[0]].as_int = tmp_mv.as_int;
         single_newmv[refs[0]].as_int = tmp_mv.as_int;
 
         // Add the new motion vector cost to our rolling cost variable
@@ -2191,7 +2288,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                  (mbmi->mv[1].as_mv.col & 15) == 0;
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
-  if (1) {
+  if (cpi->speed > 4) {
+    *best_filter = EIGHTTAP;
+  } else {
     int i, newbest;
     int tmp_rate_sum = 0, tmp_dist_sum = 0;
     for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
@@ -2328,6 +2427,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     // Y cost and distortion
     super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y,
                     bsize, txfm_cache);
+
     *rate2 += *rate_y;
     *distortion += *distortion_y;
 
@@ -2393,16 +2493,14 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     *returnrate = rate4x4_y + rate_uv +
         vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
     *returndist = dist4x4_y + (dist_uv >> 2);
-    for (i = 0; i < NB_TXFM_MODES; i++) {
-      ctx->txfm_rd_diff[i] = MIN(err4x4, err - txfm_cache[i]);
-    }
+    vpx_memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
     xd->mode_info_context->mbmi.txfm_size = TX_4X4;
   } else {
     *returnrate = rate_y + rate_uv +
         vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
     *returndist = dist_y + (dist_uv >> 2);
     for (i = 0; i < NB_TXFM_MODES; i++) {
-      ctx->txfm_rd_diff[i] = MIN(err4x4, err - txfm_cache[i]);
+      ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->txfm_mode];
     }
     xd->mode_info_context->mbmi.txfm_size = txfm_size;
     xd->mode_info_context->mbmi.mode = mode;
@@ -2457,14 +2555,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
   int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
                                              cpi->common.y_dc_delta_q);
-  int_mv seg_mvs[4][MAX_REF_FRAMES - 1];
+  int_mv seg_mvs[4][MAX_REF_FRAMES];
   union b_mode_info best_bmodes[4];
   PARTITION_INFO best_partition;
 
   for (i = 0; i < 4; i++) {
     int j;
 
-    for (j = 0; j < MAX_REF_FRAMES - 1; j++)
+    for (j = 0; j < MAX_REF_FRAMES; j++)
       seg_mvs[i][j].as_int = INVALID_MV;
   }
   // Everywhere the flag is set the error is much higher than its neighbors.
@@ -2563,11 +2661,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     this_mode = vp9_mode_order[mode_index].mode;
     ref_frame = vp9_mode_order[mode_index].ref_frame;
 
-    if (!(ref_frame == INTRA_FRAME
-        || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
-      continue;
-    }
-
     if (cpi->speed > 0 && bsize >= BLOCK_SIZE_SB8X8) {
       if (!(ref_frame_mask & (1 << ref_frame))) {
         continue;
@@ -2585,6 +2678,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     mbmi->ref_frame = ref_frame;
     mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
 
+    if (!(ref_frame == INTRA_FRAME
+        || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
+      continue;
+    }
+    if (!(mbmi->second_ref_frame == NONE
+        || (cpi->ref_frame_flags & flag_list[mbmi->second_ref_frame]))) {
+      continue;
+    }
+
     // TODO(jingning, jkoleszar): scaling reference frame not supported for
     // SPLITMV.
     if (mbmi->ref_frame > 0 &&
@@ -2680,8 +2782,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     if (this_mode == I4X4_PRED) {
       int rate;
 
-      // Note the rate value returned here includes the cost of coding
-      // the I4X4_PRED mode : x->mbmode_cost[xd->frame_type][I4X4_PRED];
       mbmi->txfm_size = TX_4X4;
       rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
                                 &distortion_y, INT64_MAX);
@@ -2716,7 +2816,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       skippable = skippable && skip_uv[uv_tx];
       mbmi->uv_mode = mode_uv[uv_tx];
 
-      rate2 = rate_y + x->mbmode_cost[cm->frame_type][mbmi->mode] + rate_uv;
+      rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv;
       if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
@@ -2755,7 +2855,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                              second_ref, INT64_MAX,
                                              &rate, &rate_y, &distortion,
                                              &skippable,
-                                             (int)this_rd_thresh, seg_mvs);
+                                             (int)this_rd_thresh, seg_mvs,
+                                             mi_row, mi_col);
         if (cpi->common.mcomp_filter_type == SWITCHABLE) {
           const int rs = get_switchable_rate(cm, x);
           tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0);
@@ -2794,7 +2895,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                              second_ref, INT64_MAX,
                                              &rate, &rate_y, &distortion,
                                              &skippable,
-                                             (int)this_rd_thresh, seg_mvs);
+                                             (int)this_rd_thresh, seg_mvs,
+                                             mi_row, mi_col);
       } else {
         if (cpi->common.mcomp_filter_type == SWITCHABLE) {
           int rs = get_switchable_rate(cm, x);
@@ -2843,7 +2945,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
       compmode_cost =
           vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
-      mbmi->mode = this_mode;
     } else {
       YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
       int fb = get_ref_frame_idx(cpi, mbmi->ref_frame);
@@ -2938,14 +3039,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       best_mode = this_mode;
     }
 
-    // Store the respective mode distortions for later use.
-    if (mode_distortions[this_mode] == -1
-        || distortion2 < mode_distortions[this_mode]) {
-      mode_distortions[this_mode] = distortion2;
-    }
-    if (frame_distortions[mbmi->ref_frame] == -1
-        || distortion2 < frame_distortions[mbmi->ref_frame]) {
-      frame_distortions[mbmi->ref_frame] = distortion2;
+    if (this_mode != I4X4_PRED && this_mode != SPLITMV) {
+      // Store the respective mode distortions for later use.
+      if (mode_distortions[this_mode] == -1
+          || distortion2 < mode_distortions[this_mode]) {
+        mode_distortions[this_mode] = distortion2;
+      }
+      if (frame_distortions[mbmi->ref_frame] == -1
+          || distortion2 < frame_distortions[mbmi->ref_frame]) {
+        frame_distortions[mbmi->ref_frame] = distortion2;
+      }
     }
 
     // Did this mode help.. i.e. is it the new best mode
@@ -2954,7 +3057,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         // Note index of best mode so far
         best_mode_index = mode_index;
 
-        if (this_mode <= I4X4_PRED) {
+        if (ref_frame == INTRA_FRAME) {
           /* required for left and above block mv */
           mbmi->mv[0].as_int = 0;
         }
@@ -3052,8 +3155,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   // Flag all modes that have a distortion thats > 2x the best we found at
   // this level.
   for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
-    if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV
-        || mode_index == SPLITMV)
+    if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV)
       continue;
 
     if (mode_distortions[mode_index] > 2 * *returndistortion) {
@@ -3077,7 +3179,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
   assert((cm->mcomp_filter_type == SWITCHABLE) ||
          (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
-         (best_mbmode.mode <= I4X4_PRED));
+         (best_mbmode.ref_frame == INTRA_FRAME));
 
   // Accumulate filter usage stats
   // TODO(agrange): Use RD criteria to select interpolation filter mode.
@@ -3129,13 +3231,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
   // macroblock modes
   vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
-  if (best_mbmode.mode == I4X4_PRED) {
+  if (best_mbmode.ref_frame == INTRA_FRAME &&
+      best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
     for (i = 0; i < 4; i++) {
       xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
     }
   }
 
-  if (best_mbmode.mode == SPLITMV) {
+  if (best_mbmode.ref_frame != INTRA_FRAME &&
+      best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
     for (i = 0; i < 4; i++)
       xd->mode_info_context->bmi[i].as_mv[0].as_int =
           best_bmodes[i].as_mv[0].as_int;