1 files changed, 463 insertions, 439 deletions
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index c82a87d69..97a38dd8b 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -60,10 +60,8 @@ extern void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x);
 
 #define INVALID_MV 0x80008000
 
-#if CONFIG_SWITCHABLE_INTERP
 /* Factor to weigh the rate for switchable interp filters */
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
-#endif
 
 static const int auto_speed_thresh[17] = {
   1000,
@@ -355,37 +353,31 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int QIndex) {
     cpi->mb.token_costs[TX_4X4],
     (const vp8_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs,
     BLOCK_TYPES);
-#if CONFIG_HYBRIDTRANSFORM
   fill_token_costs(
     cpi->mb.hybrid_token_costs[TX_4X4],
     (const vp8_prob( *)[8][PREV_COEF_CONTEXTS][11])
     cpi->common.fc.hybrid_coef_probs,
     BLOCK_TYPES);
-#endif
 
   fill_token_costs(
     cpi->mb.token_costs[TX_8X8],
     (const vp8_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_8x8,
     BLOCK_TYPES_8X8);
-#if CONFIG_HYBRIDTRANSFORM8X8
   fill_token_costs(
     cpi->mb.hybrid_token_costs[TX_8X8],
     (const vp8_prob( *)[8][PREV_COEF_CONTEXTS][11])
     cpi->common.fc.hybrid_coef_probs_8x8,
     BLOCK_TYPES_8X8);
-#endif
 
   fill_token_costs(
     cpi->mb.token_costs[TX_16X16],
     (const vp8_prob(*)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_16x16,
     BLOCK_TYPES_16X16);
-#if CONFIG_HYBRIDTRANSFORM16X16
   fill_token_costs(
     cpi->mb.hybrid_token_costs[TX_16X16],
     (const vp8_prob(*)[8][PREV_COEF_CONTEXTS][11])
     cpi->common.fc.hybrid_coef_probs_16x16,
     BLOCK_TYPES_16X16);
-#endif
 
   /*rough estimate for costing*/
   cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
@@ -393,14 +385,12 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int QIndex) {
 
   if (cpi->common.frame_type != KEY_FRAME)
   {
-#if CONFIG_NEWMVENTROPY
     vp8_build_nmv_cost_table(
         cpi->mb.nmvjointcost,
         cpi->mb.e_mbd.allow_high_precision_mv ?
         cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
         &cpi->common.fc.nmvc,
         cpi->mb.e_mbd.allow_high_precision_mv, 1, 1);
-#endif
   }
 }
 
@@ -409,19 +399,6 @@ void vp8_auto_select_speed(VP8_COMP *cpi) {
 
   milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;
 
-#if 0
-
-  if (0) {
-    FILE *f;
-
-    f = fopen("speed.stt", "a");
-    fprintf(f, " %8ld %10ld %10ld %10ld\n",
-            cpi->common.current_video_frame, cpi->Speed, milliseconds_for_compress, cpi->avg_pick_mode_time);
-    fclose(f);
-  }
-
-#endif
-
   /*
   // this is done during parameter valid check
   if( cpi->oxcf.cpu_used > 16)
@@ -520,7 +497,7 @@ int vp8_mbuverror_c(MACROBLOCK *mb) {
   return error;
 }
 
-int VP8_UVSSE(MACROBLOCK *x, const vp8_variance_rtcd_vtable_t *rtcd) {
+int vp8_uvsse(MACROBLOCK *x) {
   unsigned char *uptr, *vptr;
   unsigned char *upred_ptr = (*(x->block[16].base_src) + x->block[16].src);
   unsigned char *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src);
@@ -551,16 +528,14 @@ int VP8_UVSSE(MACROBLOCK *x, const vp8_variance_rtcd_vtable_t *rtcd) {
   vptr = x->e_mbd.pre.v_buffer + offset;
 
   if ((mv_row | mv_col) & 7) {
-    VARIANCE_INVOKE(rtcd, subpixvar8x8)(uptr, pre_stride,
-                                        (mv_col & 7) << 1, (mv_row & 7) << 1, upred_ptr, uv_stride, &sse2);
-    VARIANCE_INVOKE(rtcd, subpixvar8x8)(vptr, pre_stride,
-                                        (mv_col & 7) << 1, (mv_row & 7) << 1, vpred_ptr, uv_stride, &sse1);
+    vp8_sub_pixel_variance8x8(uptr, pre_stride, (mv_col & 7) << 1,
+                              (mv_row & 7) << 1, upred_ptr, uv_stride, &sse2);
+    vp8_sub_pixel_variance8x8(vptr, pre_stride, (mv_col & 7) << 1,
+                              (mv_row & 7) << 1, vpred_ptr, uv_stride, &sse1);
     sse2 += sse1;
   } else {
-    VARIANCE_INVOKE(rtcd, var8x8)(uptr, pre_stride,
-                                  upred_ptr, uv_stride, &sse2);
-    VARIANCE_INVOKE(rtcd, var8x8)(vptr, pre_stride,
-                                  vpred_ptr, uv_stride, &sse1);
+    vp8_variance8x8(uptr, pre_stride, upred_ptr, uv_stride, &sse2);
+    vp8_variance8x8(vptr, pre_stride, vpred_ptr, uv_stride, &sse1);
     sse2 += sse1;
   }
   return sse2;
@@ -607,9 +582,7 @@ static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type,
   short *qcoeff_ptr = b->qcoeff;
   MACROBLOCKD *xd = &mb->e_mbd;
   MB_MODE_INFO *mbmi = &mb->e_mbd.mode_info_context->mbmi;
-#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM16X16
   TX_TYPE tx_type = DCT_DCT;
-#endif
   int segment_id = mbmi->segment_id;
 
   switch (tx_size) {
@@ -617,55 +590,47 @@ static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type,
       scan = vp8_default_zig_zag1d;
       band = vp8_coef_bands;
       default_eob = 16;
-#if CONFIG_HYBRIDTRANSFORM
-      if (type == PLANE_TYPE_Y_WITH_DC &&
-          mb->q_index < ACTIVE_HT &&
-          mbmi->mode == B_PRED) {
-        tx_type = b->bmi.as_mode.tx_type;
-        switch (tx_type) {
-          case ADST_DCT:
-            scan = vp8_row_scan;
-            break;
-
-          case DCT_ADST:
-            scan = vp8_col_scan;
-            break;
-
-          default:
-            scan = vp8_default_zig_zag1d;
-            break;
-        }
+      if (type == PLANE_TYPE_Y_WITH_DC) {
+        tx_type = get_tx_type_4x4(xd, b);
+        if (tx_type != DCT_DCT) {
+          switch (tx_type) {
+            case ADST_DCT:
+              scan = vp8_row_scan;
+              break;
+
+            case DCT_ADST:
+              scan = vp8_col_scan;
+              break;
 
+            default:
+              scan = vp8_default_zig_zag1d;
+              break;
+          }
+        }
       }
-#endif
+
       break;
     case TX_8X8:
       scan = vp8_default_zig_zag1d_8x8;
       band = vp8_coef_bands_8x8;
       default_eob = 64;
-#if CONFIG_HYBRIDTRANSFORM8X8
-      {
+      if (type == PLANE_TYPE_Y_WITH_DC) {
         BLOCKD *bb;
         int ib = (b - xd->block);
         if (ib < 16) {
           ib = (ib & 8) + ((ib & 4) >> 1);
           bb = xd->block + ib;
-          if (mbmi->mode == I8X8_PRED)
-            tx_type = bb->bmi.as_mode.tx_type;
+          tx_type = get_tx_type_8x8(xd, bb);
         }
       }
-#endif
       break;
     case TX_16X16:
       scan = vp8_default_zig_zag1d_16x16;
       band = vp8_coef_bands_16x16;
       default_eob = 256;
-#if CONFIG_HYBRIDTRANSFORM16X16
-      if (type == PLANE_TYPE_Y_WITH_DC &&
-          mbmi->mode < I8X8_PRED &&
-          mb->q_index < ACTIVE_HT16)
-          tx_type = b->bmi.as_mode.tx_type;
-#endif
+      if (type == PLANE_TYPE_Y_WITH_DC) {
+        tx_type = get_tx_type_16x16(xd, b);
+      }
       break;
     default:
       break;
@@ -675,11 +640,8 @@ static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type,
   else
     seg_eob = default_eob;
 
-  //mbmi->mode = mode;
-
   VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
 
-#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM16X16
   if (tx_type != DCT_DCT) {
     for (; c < eob; c++) {
       int v = qcoeff_ptr[scan[c]];
@@ -691,9 +653,7 @@ static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type,
     if (c < seg_eob)
       cost += mb->hybrid_token_costs[tx_size][type][band[c]]
           [pt][DCT_EOB_TOKEN];
-  } else
-#endif
-  {
+  } else {
     for (; c < eob; c++) {
       int v = qcoeff_ptr[scan[c]];
       int t = vp8_dct_value_tokens_ptr[v].Token;
@@ -871,6 +831,10 @@ static int vp8_rdcost_mby_16x16(MACROBLOCK *mb) {
 static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
                                   const VP8_ENCODER_RTCD *rtcd, int *skippable) {
   int d;
+  MACROBLOCKD *xd = &mb->e_mbd;
+  BLOCKD *b  = &mb->e_mbd.block[0];
+  BLOCK  *be = &mb->block[0];
+  TX_TYPE tx_type;
 
   ENCODEMB_INVOKE(&rtcd->encodemb, submby)(
     mb->src_diff,
@@ -878,27 +842,18 @@ static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
     mb->e_mbd.predictor,
     mb->block[0].src_stride);
 
-#if CONFIG_HYBRIDTRANSFORM16X16
-  if ((mb->e_mbd.mode_info_context->mbmi.mode < I8X8_PRED) &&
-      (mb->q_index < ACTIVE_HT16)) {
-    BLOCKD *b  = &mb->e_mbd.block[0];
-    BLOCK  *be = &mb->block[0];
-    txfm_map(b, pred_mode_conv(mb->e_mbd.mode_info_context->mbmi.mode));
-    vp8_fht_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type, 16);
+  tx_type = get_tx_type_16x16(xd, b);
+  if (tx_type != DCT_DCT) {
+    vp8_fht_c(be->src_diff, be->coeff, 32, tx_type, 16);
   } else
     vp8_transform_mby_16x16(mb);
-#else
-  vp8_transform_mby_16x16(mb);
-#endif
 
   vp8_quantize_mby_16x16(mb);
-#if CONFIG_HYBRIDTRANSFORM16X16
   // TODO(jingning) is it possible to quickly determine whether to force
   //                trailing coefficients to be zero, instead of running trellis
   //                optimization in the rate-distortion optimization loop?
   if (mb->e_mbd.mode_info_context->mbmi.mode < I8X8_PRED)
     vp8_optimize_mby_16x16(mb, rtcd);
-#endif
 
   d = ENCODEMB_INVOKE(&rtcd->encodemb, mberr)(mb, 0);
 
@@ -914,8 +869,6 @@ static void macro_block_yrd(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
   VP8_COMMON *cm = &cpi->common;
   MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
 
-#if CONFIG_TX_SELECT
-
   MACROBLOCKD *xd = &x->e_mbd;
   int can_skip = cm->mb_no_coeff_skip;
   vp8_prob skip_prob = can_skip ? get_pred_prob(cm, xd, PRED_MBSKIP) : 128;
@@ -1023,25 +976,6 @@ static void macro_block_yrd(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
   else
     txfm_cache[TX_MODE_SELECT] = rd4x4s < rd8x8s ? rd4x4s : rd8x8s;
 
-#else /* CONFIG_TX_SELECT */
-
-  switch (cpi->common.txfm_mode) {
-    case ALLOW_16X16:
-      macro_block_yrd_16x16(x, rate, distortion, IF_RTCD(&cpi->rtcd), skippable);
-      mbmi->txfm_size = TX_16X16;
-      break;
-    case ALLOW_8X8:
-      macro_block_yrd_8x8(x, rate, distortion, IF_RTCD(&cpi->rtcd), skippable);
-      mbmi->txfm_size = TX_8X8;
-      break;
-    default:
-    case ONLY_4X4:
-      macro_block_yrd_4x4(x, rate, distortion, IF_RTCD(&cpi->rtcd), skippable);
-      mbmi->txfm_size = TX_4X4;
-      break;
-  }
-
-#endif /* CONFIG_TX_SELECT */
 }
 
 static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
@@ -1145,12 +1079,7 @@ static int64_t rd_pick_intra4x4block(VP8_COMP *cpi, MACROBLOCK *x, BLOCK *be,
                                      int *bestrate, int *bestratey,
                                      int *bestdistortion) {
   B_PREDICTION_MODE mode;
-
-#if CONFIG_HYBRIDTRANSFORM
-  int QIndex = x->q_index;
-  int active_ht = (QIndex < ACTIVE_HT);
-  TX_TYPE best_tx_type;
-#endif
+  MACROBLOCKD *xd = &x->e_mbd;
 
 #if CONFIG_COMP_INTRA_PRED
   B_PREDICTION_MODE mode2;
@@ -1161,6 +1090,8 @@ static int64_t rd_pick_intra4x4block(VP8_COMP *cpi, MACROBLOCK *x, BLOCK *be,
 
   ENTROPY_CONTEXT ta = *a, tempa = *a;
   ENTROPY_CONTEXT tl = *l, templ = *l;
+  TX_TYPE tx_type = DCT_DCT;
+  TX_TYPE best_tx_type = DCT_DCT;
   /*
    * The predictor buffer is a 2d buffer with a stride of 16.  Create
    * a temp buffer that meets the stride requirements, but we are only
@@ -1177,11 +1108,6 @@ static int64_t rd_pick_intra4x4block(VP8_COMP *cpi, MACROBLOCK *x, BLOCK *be,
       int64_t this_rd;
       int ratey;
 
-      // TODO Temporarily ignore modes that need the above-right data. SB
-      // encoding means this data is not available for the bottom right MB
-      // Do we need to do this for mode2 also?
-      if (mode == B_LD_PRED || mode == B_VL_PRED)
-        continue;
       b->bmi.as_mode.first = mode;
       rate = bmode_costs[mode];
 
@@ -1197,48 +1123,42 @@ static int64_t rd_pick_intra4x4block(VP8_COMP *cpi, MACROBLOCK *x, BLOCK *be,
 #endif
       ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), subb)(be, b, 16);
 
-#if CONFIG_HYBRIDTRANSFORM
-      if (active_ht) {
-        txfm_map(b, mode);
-        vp8_fht_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type, 4);
-        vp8_ht_quantize_b_4x4(be, b);
+      b->bmi.as_mode.first = mode;
+      tx_type = get_tx_type_4x4(xd, b);
+      if (tx_type != DCT_DCT) {
+        vp8_fht_c(be->src_diff, be->coeff, 32, tx_type, 4);
+        vp8_ht_quantize_b_4x4(be, b, tx_type);
       } else {
         x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
         x->quantize_b_4x4(be, b);
       }
-#else
-        x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
-        x->quantize_b_4x4(be, b);
-#endif
 
-        tempa = ta;
-        templ = tl;
+      tempa = ta;
+      templ = tl;
 
-        ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
-        rate += ratey;
-        distortion = ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), berr)(
-            be->coeff, b->dqcoeff, 16) >> 2;
+      ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
+      rate += ratey;
+      distortion = ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), berr)(
+          be->coeff, b->dqcoeff, 16) >> 2;
 
-        this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
-        if (this_rd < best_rd) {
-          *bestrate = rate;
-          *bestratey = ratey;
-          *bestdistortion = distortion;
-          best_rd = this_rd;
-          *best_mode = mode;
-#if CONFIG_HYBRIDTRANSFORM
-          best_tx_type = b->bmi.as_mode.tx_type ;
-#endif
+      if (this_rd < best_rd) {
+        *bestrate = rate;
+        *bestratey = ratey;
+        *bestdistortion = distortion;
+        best_rd = this_rd;
+        *best_mode = mode;
+        best_tx_type = tx_type;
 
 #if CONFIG_COMP_INTRA_PRED
-          *best_second_mode = mode2;
+        *best_second_mode = mode2;
 #endif
-          *a = tempa;
-          *l = templ;
-          copy_predictor(best_predictor, b->predictor);
-          vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
-        }
+        *a = tempa;
+        *l = templ;
+        copy_predictor(best_predictor, b->predictor);
+        vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
+      }
 #if CONFIG_COMP_INTRA_PRED
     }
 #endif
@@ -1248,18 +1168,12 @@ static int64_t rd_pick_intra4x4block(VP8_COMP *cpi, MACROBLOCK *x, BLOCK *be,
   b->bmi.as_mode.second = (B_PREDICTION_MODE)(*best_second_mode);
 #endif
 
-#if CONFIG_HYBRIDTRANSFORM
-  b->bmi.as_mode.tx_type = best_tx_type;
-
   // inverse transform
-  if (active_ht)
-    vp8_ihtllm_c(best_dqcoeff, b->diff, 32, b->bmi.as_mode.tx_type, 4);
+  if (best_tx_type != DCT_DCT)
+    vp8_ihtllm_c(best_dqcoeff, b->diff, 32, best_tx_type, 4);
   else
-    IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff,
-                                                                b->diff, 32);
-#else
-  IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff, b->diff, 32);
-#endif
+    IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(
+        best_dqcoeff, b->diff, 32);
 
   vp8_recon_b(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
 
@@ -1295,12 +1209,11 @@ static int64_t rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rat
     tl = (ENTROPY_CONTEXT *)&t_left;
   }
 
-  // TODO(agrange)
-  // vp8_intra_prediction_down_copy(xd);
-
   xd->mode_info_context->mbmi.mode = B_PRED;
   bmode_costs = mb->inter_bmode_costs;
 
+  vp8_intra_prediction_down_copy(xd);
+
   for (i = 0; i < 16; i++) {
     MODE_INFO *const mic = xd->mode_info_context;
     const int mis = xd->mode_info_stride;
@@ -1413,14 +1326,9 @@ static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
   int64_t this_rd;
   MACROBLOCKD *xd = &x->e_mbd;
 
-#if CONFIG_HYBRIDTRANSFORM16X16
-  int best_txtype, rd_txtype;
-#endif
-#if CONFIG_TX_SELECT
   int i;
   for (i = 0; i < NB_TXFM_MODES; i++)
     txfm_cache[i] = INT64_MAX;
-#endif
 
   // Y Search for 16x16 intra prediction mode
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
@@ -1449,9 +1357,6 @@ static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
 
       this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
-#if CONFIG_HYBRIDTRANSFORM16X16
-      rd_txtype = x->e_mbd.block[0].bmi.as_mode.tx_type;
-#endif
 
       if (this_rd < best_rd) {
         mode_selected = mode;
@@ -1463,13 +1368,9 @@ static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
         *Rate = rate;
         *rate_y = ratey;
         *Distortion = distortion;
-#if CONFIG_HYBRIDTRANSFORM16X16
-        best_txtype = rd_txtype;
-#endif
         *skippable = skip;
       }
 
-#if CONFIG_TX_SELECT
       for (i = 0; i < NB_TXFM_MODES; i++) {
         int64_t adj_rd = this_rd + local_txfm_cache[i] -
                           local_txfm_cache[cpi->common.txfm_mode];
@@ -1477,7 +1378,6 @@ static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
           txfm_cache[i] = adj_rd;
         }
       }
-#endif
 
 #if CONFIG_COMP_INTRA_PRED
     }
@@ -1486,9 +1386,6 @@ static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
 
   mbmi->txfm_size = txfm_size;
   mbmi->mode = mode_selected;
-#if CONFIG_HYBRIDTRANSFORM16X16
-  x->e_mbd.block[0].bmi.as_mode.tx_type = best_txtype;
-#endif
 
 #if CONFIG_COMP_INTRA_PRED
   mbmi->second_mode = mode2_selected;
@@ -1539,6 +1436,7 @@ static int64_t rd_pick_intra8x8block(VP8_COMP *cpi, MACROBLOCK *x, int ib,
 
       // FIXME rate for compound mode and second intrapred mode
       rate = mode_costs[mode];
+      b->bmi.as_mode.first = mode;
 
 #if CONFIG_COMP_INTRA_PRED
       if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
@@ -1554,21 +1452,18 @@ static int64_t rd_pick_intra8x8block(VP8_COMP *cpi, MACROBLOCK *x, int ib,
       vp8_subtract_4b_c(be, b, 16);
 
       if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-#if CONFIG_HYBRIDTRANSFORM8X8
-        txfm_map(b, pred_mode_conv(mode));
-        vp8_fht_c(be->src_diff, (x->block + idx)->coeff, 32,
-                  b->bmi.as_mode.tx_type, 8);
-
-#else
-        x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
-#endif
+        TX_TYPE tx_type = get_tx_type_8x8(xd, b);
+        if (tx_type != DCT_DCT)
+          vp8_fht_c(be->src_diff, (x->block + idx)->coeff, 32, tx_type, 8);
+        else
+          x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
         x->quantize_b_8x8(x->block + idx, xd->block + idx);
 
         // compute quantization mse of 8x8 block
         distortion = vp8_block_error_c((x->block + idx)->coeff,
                                        (xd->block + idx)->dqcoeff, 64);
-        ta0 = *(a + vp8_block2above_8x8[idx]);
-        tl0 = *(l + vp8_block2left_8x8 [idx]);
+        ta0 = a[vp8_block2above_8x8[idx]];
+        tl0 = l[vp8_block2left_8x8[idx]];
 
         rate_t = cost_coeffs(x, xd->block + idx, PLANE_TYPE_Y_WITH_DC,
                              &ta0, &tl0, TX_8X8);
@@ -1594,10 +1489,10 @@ static int64_t rd_pick_intra8x8block(VP8_COMP *cpi, MACROBLOCK *x, int ib,
         distortion += vp8_block_error_c((x->block + ib + 5)->coeff,
                                         (xd->block + ib + 5)->dqcoeff, 16);
 
-        ta0 = *(a + vp8_block2above[ib]);
-        ta1 = *(a + vp8_block2above[ib + 1]);
-        tl0 = *(l + vp8_block2above[ib]);
-        tl1 = *(l + vp8_block2above[ib + 4]);
+        ta0 = a[vp8_block2above[ib]];
+        ta1 = a[vp8_block2above[ib + 1]];
+        tl0 = l[vp8_block2left[ib]];
+        tl1 = l[vp8_block2left[ib + 4]];
         rate_t = cost_coeffs(x, xd->block + ib, PLANE_TYPE_Y_WITH_DC,
                              &ta0, &tl0, TX_4X4);
         rate_t += cost_coeffs(x, xd->block + ib + 1, PLANE_TYPE_Y_WITH_DC,
@@ -1639,15 +1534,15 @@ static int64_t rd_pick_intra8x8block(VP8_COMP *cpi, MACROBLOCK *x, int ib,
   vp8_encode_intra8x8(IF_RTCD(&cpi->rtcd), x, ib);
 
   if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-    *(a + vp8_block2above_8x8[idx])     = besta0;
-    *(a + vp8_block2above_8x8[idx] + 1) = besta1;
-    *(l + vp8_block2left_8x8 [idx])     = bestl0;
-    *(l + vp8_block2left_8x8 [idx] + 1) = bestl1;
+    a[vp8_block2above_8x8[idx]]     = besta0;
+    a[vp8_block2above_8x8[idx] + 1] = besta1;
+    l[vp8_block2left_8x8[idx]]      = bestl0;
+    l[vp8_block2left_8x8[idx] + 1]  = bestl1;
   } else {
-    *(a + vp8_block2above[ib])     = besta0;
-    *(a + vp8_block2above[ib + 1]) = besta1;
-    *(l + vp8_block2above[ib])     = bestl0;
-    *(l + vp8_block2above[ib + 4]) = bestl1;
+    a[vp8_block2above[ib]]     = besta0;
+    a[vp8_block2above[ib + 1]] = besta1;
+    l[vp8_block2left[ib]]      = bestl0;
+    l[vp8_block2left[ib + 4]]  = bestl1;
   }
 
   return best_rd;
@@ -1839,7 +1734,7 @@ static int64_t rd_inter16x16_uv_8x8(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
 
 
 static int64_t rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
-                              int *distortion, int fullpixel) {
+                              int *distortion, int *skippable, int fullpixel) {
   vp8_build_inter4x4_predictors_mbuv(&x->e_mbd);
   ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
                                                          x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
@@ -1849,6 +1744,7 @@ static int64_t rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
 
   *rate       = rd_cost_mbuv(x);
   *distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
+  *skippable  = mbuv_is_skippable_4x4(&x->e_mbd);
 
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
@@ -2105,7 +2001,7 @@ static int labels2mode(
   int_mv *best_ref_mv,
   int_mv *second_best_ref_mv,
   DEC_MVCOSTS) {
-  MACROBLOCKD *const xd = & x->e_mbd;
+  MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mic = xd->mode_info_context;
   MB_MODE_INFO * mbmi = &mic->mbmi;
   const int mis = xd->mode_info_stride;
@@ -2199,30 +2095,19 @@ static int labels2mode(
   return cost;
 }
 
-static int rdcost_mbsegment_y(MACROBLOCK *mb, const int *labels,
-                              int which_label, ENTROPY_CONTEXT *ta,
-                              ENTROPY_CONTEXT *tl) {
-  int b, cost = 0;
-  MACROBLOCKD *xd = &mb->e_mbd;
-
-  for (b = 0; b < 16; b++)
-    if (labels[ b] == which_label)
-      cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_WITH_DC,
-                          ta + vp8_block2above[b],
-                          tl + vp8_block2left[b], TX_4X4);
-
-  return cost;
-
-}
-
-static unsigned int vp8_encode_inter_mb_segment(MACROBLOCK *x,
-                                                int const *labels,
-                                                int which_label,
-                                                const VP8_ENCODER_RTCD *rtcd) {
+static int64_t encode_inter_mb_segment(MACROBLOCK *x,
+                                       int const *labels,
+                                       int which_label,
+                                       int *labelyrate,
+                                       int *distortion,
+                                       ENTROPY_CONTEXT *ta,
+                                       ENTROPY_CONTEXT *tl,
+                                       const VP8_ENCODER_RTCD *rtcd) {
   int i;
-  unsigned int distortion = 0;
   MACROBLOCKD *xd = &x->e_mbd;
 
+  *labelyrate = 0;
+  *distortion = 0;
   for (i = 0; i < 16; i++) {
     if (labels[i] == which_label) {
       BLOCKD *bd = &x->e_mbd.block[i];
@@ -2234,18 +2119,118 @@ static unsigned int vp8_encode_inter_mb_segment(MACROBLOCK *x,
         vp8_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg);
       ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, bd, 16);
       x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
-
-      // set to 0 no way to account for 2nd order DC so discount
-      // be->coeff[0] = 0;
       x->quantize_b_4x4(be, bd);
-      thisdistortion = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(
-                         be->coeff, bd->dqcoeff, 16) / 4;
-      distortion += thisdistortion;
+      thisdistortion = vp8_block_error_c(be->coeff, bd->dqcoeff, 16);
+      *distortion += thisdistortion;
+      *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
+                                 ta + vp8_block2above[i],
+                                 tl + vp8_block2left[i], TX_4X4);
     }
   }
-  return distortion;
+  *distortion >>= 2;
+  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
 }
 
+/* RD-code the 8x8 luma blocks of the MB whose label equals which_label.
+ * Each matching block gets its inter predictor built, its residual
+ * transformed and quantized at the MB's current txfm_size (TX_4X4, else 8x8),
+ * and its coefficient rate and block error summed into *labelyrate and
+ * *distortion (the error sum is scaled by >> 2).  ta/tl are the above/left
+ * entropy contexts handed to cost_coeffs().  rtcd is not used here.
+ *
+ * If otherrd is non-NULL the blocks are also coded with the transform size
+ * that is NOT selected, on private copies of ta/tl, and that RD cost is
+ * stored in *otherrd.  Returns the RD cost for the selected transform size. */
+static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
+                                           int const *labels, int which_label,
+                                           int *labelyrate, int *distortion,
+                                           int64_t *otherrd,
+                                           ENTROPY_CONTEXT *ta,
+                                           ENTROPY_CONTEXT *tl,
+                                           const VP8_ENCODER_RTCD *rtcd) {
+  int i, j;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int iblock[4] = { 0, 1, 4, 5 };
+  int othercost = 0, otherdist = 0;
+  ENTROPY_CONTEXT_PLANES tac, tlc;
+  ENTROPY_CONTEXT *tacp = (ENTROPY_CONTEXT *) &tac,
+                  *tlcp = (ENTROPY_CONTEXT *) &tlc;
+
+  if (otherrd) {
+    memcpy(&tac, ta, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&tlc, tl, sizeof(ENTROPY_CONTEXT_PLANES));
+  }
+  *distortion = 0;
+  *labelyrate = 0;
+  for (i = 0; i < 4; i++) {
+    int ib = vp8_i8x8_block[i];
+    if (labels[ib] == which_label) {
+      int idx = (ib & 8) + ((ib & 2) << 1);
+      BLOCKD *bd = &xd->block[ib], *bd2 = &xd->block[idx];
+      BLOCK *be = &x->block[ib], *be2 = &x->block[idx];
+
+      vp8_build_inter_predictors4b(xd, bd, 16);
+      if (xd->mode_info_context->mbmi.second_ref_frame)
+        vp8_build_2nd_inter_predictors4b(xd, bd, 16);
+      vp8_subtract_4b_c(be, bd, 16);
+
+      if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
+        if (otherrd) {
+          x->vp8_short_fdct8x8(be->src_diff, be2->coeff, 32);
+          x->quantize_b_8x8(be2, bd2);
+          otherdist += vp8_block_error_c(be2->coeff, bd2->dqcoeff, 64);
+          othercost += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
+                                   tacp + vp8_block2above_8x8[idx],
+                                   tlcp + vp8_block2left_8x8[idx], TX_8X8);
+        }
+        for (j = 0; j < 4; j += 2) {
+          const int blk = ib + iblock[j];  /* first block of a 4x4 pair */
+          bd = &xd->block[blk];
+          be = &x->block[blk];
+          x->vp8_short_fdct8x4(be->src_diff, be->coeff, 32);
+          x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1);
+          *distortion += vp8_block_error_c(be->coeff, bd->dqcoeff, 32);
+          *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
+                                     ta + vp8_block2above[blk],
+                                     tl + vp8_block2left[blk], TX_4X4);
+          *labelyrate += cost_coeffs(x, bd + 1, PLANE_TYPE_Y_WITH_DC,
+                                     ta + vp8_block2above[blk + 1],
+                                     tl + vp8_block2left[blk], TX_4X4);
+        }
+      } else /* 8x8 */ {
+        if (otherrd) {
+          for (j = 0; j < 4; j += 2) {
+            const int blk = ib + iblock[j];
+            BLOCKD *bd3 = &xd->block[blk];
+            BLOCK *be3 = &x->block[blk];
+            x->vp8_short_fdct8x4(be3->src_diff, be3->coeff, 32);
+            x->quantize_b_4x4_pair(be3, be3 + 1, bd3, bd3 + 1);
+            otherdist += vp8_block_error_c(be3->coeff, bd3->dqcoeff, 32);
+            othercost += cost_coeffs(x, bd3, PLANE_TYPE_Y_WITH_DC,
+                                     tacp + vp8_block2above[blk],
+                                     tlcp + vp8_block2left[blk], TX_4X4);
+            othercost += cost_coeffs(x, bd3 + 1, PLANE_TYPE_Y_WITH_DC,
+                                     tacp + vp8_block2above[blk + 1],
+                                     tlcp + vp8_block2left[blk], TX_4X4);
+          }
+        }
+        x->vp8_short_fdct8x8(be->src_diff, be2->coeff, 32);
+        x->quantize_b_8x8(be2, bd2);
+        *distortion += vp8_block_error_c(be2->coeff, bd2->dqcoeff, 64);
+        *labelyrate += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
+                                   ta + vp8_block2above_8x8[idx],
+                                   tl + vp8_block2left_8x8[idx], TX_8X8);
+      }
+    }
+  }
+  *distortion >>= 2;
+  if (otherrd) {
+    /* Scale the distortion, not the rate, exactly as *distortion above. */
+    otherdist >>= 2;
+    *otherrd = RDCOST(x->rdmult, x->rddiv, othercost, otherdist);
+  }
+  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
+}
 
 static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
 
@@ -2255,13 +2240,14 @@ typedef struct {
   int_mv mvp;
 
   int64_t segment_rd;
-  int segment_num;
+  SPLITMV_PARTITIONING_TYPE segment_num;
+  TX_SIZE txfm_size;
   int r;
   int d;
   int segment_yrate;
   B_PREDICTION_MODE modes[16];
   int_mv mvs[16], second_mvs[16];
-  unsigned char eobs[16];
+  int eobs[16];
 
   int mvthresh;
   int *mdcounts;
@@ -2281,21 +2267,27 @@ int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
   return r;
 }
 
-static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
-                             BEST_SEG_INFO *bsi, unsigned int segmentation,
-                             int_mv seg_mvs[16 /* n_blocks */][MAX_REF_FRAMES - 1]) {
-  int i;
+static void rd_check_segment_txsize(VP8_COMP *cpi, MACROBLOCK *x,
+                                    BEST_SEG_INFO *bsi,
+                                    SPLITMV_PARTITIONING_TYPE segmentation,
+                                    TX_SIZE tx_size, int64_t *otherrds,
+                                    int64_t *rds, int *completed,
+                                    /* 16 = n_blocks */
+                                    int_mv seg_mvs[16 /* n_blocks */]
+                                                  [MAX_REF_FRAMES - 1]) {
+  int i, j;
   int const *labels;
   int br = 0, bd = 0;
   B_PREDICTION_MODE this_mode;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
 
   int label_count;
-  int64_t this_segment_rd = 0;
+  int64_t this_segment_rd = 0, other_segment_rd;
   int label_mv_thresh;
   int rate = 0;
   int sbr = 0, sbd = 0;
   int segmentyrate = 0;
+  int best_eobs[16] = { 0 };
 
   vp8_variance_fn_ptr_t *v_fn_ptr;
 
@@ -2323,20 +2315,23 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
   label_mv_thresh = 1 * bsi->mvthresh / label_count;
 
   // Segmentation method overheads
-  rate = vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + segmentation);
+  rate = vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs,
+                        vp8_mbsplit_encodings + segmentation);
   rate += vp8_cost_mv_ref(cpi, SPLITMV, bsi->mdcounts);
   this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
   br += rate;
+  other_segment_rd = this_segment_rd;
 
-  for (i = 0; i < label_count; i++) {
+  mbmi->txfm_size = tx_size;
+  for (i = 0; i < label_count && this_segment_rd < bsi->segment_rd; i++) {
     int_mv mode_mv[B_MODE_COUNT], second_mode_mv[B_MODE_COUNT];
-    int64_t best_label_rd = INT64_MAX;
+    int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
     B_PREDICTION_MODE mode_selected = ZERO4X4;
     int bestlabelyrate = 0;
 
     // search for the best motion vector on this segment
     for (this_mode = LEFT4X4; this_mode <= NEW4X4; this_mode ++) {
-      int64_t this_rd;
+      int64_t this_rd, other_rd;
       int distortion;
       int labelyrate;
       ENTROPY_CONTEXT_PLANES t_above_s, t_left_s;
@@ -2358,21 +2353,23 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
         BLOCK *c;
         BLOCKD *e;
 
-        // Is the best so far sufficiently good that we cant justify doing and new motion search.
+        /* Is the best so far sufficiently good that we can't justify doing
+         * a new motion search? */
         if (best_label_rd < label_mv_thresh)
           break;
 
         if (cpi->compressor_speed) {
-          if (segmentation == BLOCK_8X16 || segmentation == BLOCK_16X8) {
+          if (segmentation == PARTITIONING_8X16 ||
+              segmentation == PARTITIONING_16X8) {
             bsi->mvp.as_int = bsi->sv_mvp[i].as_int;
-            if (i == 1 && segmentation == BLOCK_16X8)
+            if (i == 1 && segmentation == PARTITIONING_16X8)
               bsi->mvp.as_int = bsi->sv_mvp[2].as_int;
 
             step_param = bsi->sv_istep[i];
           }
 
           // use previous block's result as next block's MV predictor.
-          if (segmentation == BLOCK_4X4 && i > 0) {
+          if (segmentation == PARTITIONING_4X4 && i > 0) {
             bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv.first.as_int;
             if (i == 4 || i == 8 || i == 12)
               bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv.first.as_int;
@@ -2404,7 +2401,8 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
           // Should we do a full search (best quality only)
           if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
             /* Check if mvp_full is within the range. */
-            vp8_clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+            vp8_clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
+                         x->mv_row_min, x->mv_row_max);
 
             thissme = cpi->full_search_sad(x, c, e, &mvp_full,
                                            sadpb, 16, v_fn_ptr,
@@ -2414,7 +2412,8 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
               bestsme = thissme;
               mode_mv[NEW4X4].as_int = e->bmi.as_mv.first.as_int;
             } else {
-              // The full search result is actually worse so re-instate the previous best vector
+              /* The full search result is actually worse so re-instate the
+               * previous best vector */
               e->bmi.as_mv.first.as_int = mode_mv[NEW4X4].as_int;
             }
           }
@@ -2424,15 +2423,16 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
           int distortion;
           unsigned int sse;
           cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
-                                       bsi->ref_mv, x->errorperbit, v_fn_ptr, XMVCOST,
-                                       &distortion, &sse);
+                                       bsi->ref_mv, x->errorperbit, v_fn_ptr,
+                                       XMVCOST, &distortion, &sse);
 
           // safe motion search result for use in compound prediction
           seg_mvs[i][mbmi->ref_frame - 1].as_int = mode_mv[NEW4X4].as_int;
         }
       } /* NEW4X4 */
       else if (mbmi->second_ref_frame && this_mode == NEW4X4) {
-        // motion search not completed? Then skip newmv for this block with comppred
+        /* motion search not completed? Then skip newmv for this block with
+         * comppred */
         if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
             seg_mvs[i][mbmi->ref_frame        - 1].as_int == INVALID_MV) {
           continue;
@@ -2454,21 +2454,39 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
           mv_check_bounds(x, &second_mode_mv[this_mode]))
         continue;
 
-      distortion = vp8_encode_inter_mb_segment(
-                     x, labels, i,
-                     IF_RTCD(&cpi->rtcd));
-
-      labelyrate = rdcost_mbsegment_y(x, labels, i, ta_s, tl_s);
+      if (segmentation == PARTITIONING_4X4) {
+        this_rd = encode_inter_mb_segment(x, labels, i, &labelyrate,
+                                          &distortion,
+                                          ta_s, tl_s, IF_RTCD(&cpi->rtcd));
+        other_rd = this_rd;
+      } else {
+        this_rd = encode_inter_mb_segment_8x8(x, labels, i, &labelyrate,
+                                              &distortion, &other_rd,
+                                              ta_s, tl_s, IF_RTCD(&cpi->rtcd));
+      }
+      this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
       rate += labelyrate;
 
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
       if (this_rd < best_label_rd) {
         sbr = rate;
         sbd = distortion;
         bestlabelyrate = labelyrate;
         mode_selected = this_mode;
         best_label_rd = this_rd;
+        if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_4X4) {
+          for (j = 0; j < 16; j++)
+            if (labels[j] == i)
+              best_eobs[j] = x->e_mbd.block[j].eob;
+        } else {
+          for (j = 0; j < 4; j++) {
+            int ib = vp8_i8x8_block[j], idx = j * 4;
+
+            if (labels[ib] == i)
+              best_eobs[idx] = x->e_mbd.block[idx].eob;
+          }
+        }
+        if (other_rd < best_other_rd)
+          best_other_rd = other_rd;
 
         vpx_memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
         vpx_memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
@@ -2480,18 +2498,18 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
     vpx_memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
 
     labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
-                &second_mode_mv[mode_selected], seg_mvs[i], bsi->ref_mv, bsi->second_ref_mv, XMVCOST);
+                &second_mode_mv[mode_selected], seg_mvs[i],
+                bsi->ref_mv, bsi->second_ref_mv, XMVCOST);
 
     br += sbr;
     bd += sbd;
     segmentyrate += bestlabelyrate;
     this_segment_rd += best_label_rd;
-
-    if (this_segment_rd >= bsi->segment_rd) {
-      break;
-    }
-
-
+    other_segment_rd += best_other_rd;
+    if (rds)
+      rds[i] = this_segment_rd;        /* running total after label i */
+    if (otherrds)
+      otherrds[i] = other_segment_rd;  /* likewise, built from other_rd */
   } /* for each label */
 
   if (this_segment_rd < bsi->segment_rd) {
@@ -2500,6 +2518,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
     bsi->segment_yrate = segmentyrate;
     bsi->segment_rd = this_segment_rd;
     bsi->segment_num = segmentation;
+    bsi->txfm_size = mbmi->txfm_size;
 
     // store everything needed to come back to this!!
     for (i = 0; i < 16; i++) {
@@ -2509,7 +2528,106 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
       if (mbmi->second_ref_frame)
         bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
       bsi->modes[i] = x->partition_info->bmi[i].mode;
-      bsi->eobs[i] = bd->eob;
+      bsi->eobs[i] = best_eobs[i];
+    }
+  }
+
+  /* i was reused by the 16-block store loop if this partitioning won. */
+  if (completed)
+    *completed = i < label_count ? i : label_count;
+}
+
+/* Check one partitioning.  Runs rd_check_segment_txsize() for the transform
+ * size(s) selected by cpi->common.txfm_mode (always TX_4X4 for
+ * PARTITIONING_4X4), leaves the best result in *bsi, and, when all c labels
+ * were searched (n == c), lowers the per-transform-mode RD minima kept in
+ * txfm_cache[]. */
+static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
+                             BEST_SEG_INFO *bsi, unsigned int segmentation,
+                             /* 16 = n_blocks */
+                             int_mv seg_mvs[16][MAX_REF_FRAMES - 1],
+                             int64_t txfm_cache[NB_TXFM_MODES]) {
+  int i, n, c = vp8_mbsplit_count[segmentation];
+
+  if (segmentation == PARTITIONING_4X4) {
+    int64_t rd[16];
+
+    rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, NULL,
+                            rd, &n, seg_mvs);
+    if (n == c) {
+      for (i = 0; i < NB_TXFM_MODES; i++) {
+        if (rd[c - 1] < txfm_cache[i])
+          txfm_cache[i] = rd[c - 1];
+      }
+    }
+  } else {
+    int64_t diff = 0, base_rd = 0;
+    int cost4x4 = vp8_cost_bit(cpi->common.prob_tx[0], 0);
+    int cost8x8 = vp8_cost_bit(cpi->common.prob_tx[0], 1);
+
+    if (cpi->common.txfm_mode == TX_MODE_SELECT) {
+      int64_t rd4x4[4], rd8x8[4];
+      int n4x4, n8x8, nmin;
+      BEST_SEG_INFO bsi4x4, bsi8x8;
+
+      /* factor in cost of cost4x4/8x8 in decision */
+      vpx_memcpy(&bsi4x4, bsi, sizeof(*bsi));
+      vpx_memcpy(&bsi8x8, bsi, sizeof(*bsi));
+      rd_check_segment_txsize(cpi, x, &bsi4x4, segmentation,
+                              TX_4X4, NULL, rd4x4, &n4x4, seg_mvs);
+      rd_check_segment_txsize(cpi, x, &bsi8x8, segmentation,
+                              TX_8X8, NULL, rd8x8, &n8x8, seg_mvs);
+      if (bsi4x4.segment_num == segmentation) {
+        bsi4x4.segment_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
+        if (bsi4x4.segment_rd < bsi->segment_rd)
+          vpx_memcpy(bsi, &bsi4x4, sizeof(*bsi));
+      }
+      if (bsi8x8.segment_num == segmentation) {
+        bsi8x8.segment_rd += RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
+        if (bsi8x8.segment_rd < bsi->segment_rd)
+          vpx_memcpy(bsi, &bsi8x8, sizeof(*bsi));
+      }
+      n = n4x4 > n8x8 ? n4x4 : n8x8;
+      if (n == c) {
+        nmin = n4x4 < n8x8 ? n4x4 : n8x8;
+        diff = rd8x8[nmin - 1] - rd4x4[nmin - 1];
+        base_rd = (n == n4x4) ? rd4x4[c - 1] : rd8x8[c - 1] - diff;
+      }
+    } else {
+      int64_t rd[4], otherrd[4];
+
+      if (cpi->common.txfm_mode == ONLY_4X4) {
+        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, otherrd,
+                                rd, &n, seg_mvs);
+        if (n == c) {
+          base_rd = rd[c - 1];
+          diff = otherrd[c - 1] - rd[c - 1];
+        }
+      } else /* use 8x8 transform */ {
+        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_8X8, otherrd,
+                                rd, &n, seg_mvs);
+        if (n == c) {
+          diff = rd[c - 1] - otherrd[c - 1];
+          base_rd = otherrd[c - 1];
+        }
+      }
+    }
+
+    if (n == c) {
+      if (base_rd < txfm_cache[ONLY_4X4]) {
+        txfm_cache[ONLY_4X4] = base_rd;
+      }
+      if (base_rd + diff < txfm_cache[ALLOW_8X8]) {
+        txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] = base_rd + diff;
+      }
+      if (diff < 0) {
+        base_rd += diff + RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
+      } else {
+        base_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
+      }
+      if (base_rd < txfm_cache[TX_MODE_SELECT]) {
+        txfm_cache[TX_MODE_SELECT] = base_rd;
+      }
     }
   }
 }
@@ -2527,17 +2645,26 @@ void vp8_cal_step_param(int sr, int *sp) {
   *sp = MAX_MVSEARCH_STEPS - 1 - step;
 }
 
-static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
-                                           int_mv *best_ref_mv, int_mv *second_best_ref_mv, int64_t best_rd,
-                                           int *mdcounts, int *returntotrate,
-                                           int *returnyrate, int *returndistortion,
-                                           int mvthresh,
-                                           int_mv seg_mvs[BLOCK_MAX_SEGMENTS - 1][16 /* n_blocks */][MAX_REF_FRAMES - 1]) {
+static int rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
+                                       int_mv *best_ref_mv,
+                                       int_mv *second_best_ref_mv,
+                                       int64_t best_rd,
+                                       int *mdcounts,
+                                       int *returntotrate,
+                                       int *returnyrate,
+                                       int *returndistortion,
+                                       int *skippable, int mvthresh,
+                                       int_mv seg_mvs[NB_PARTITIONINGS]
+                                                     [16 /* n_blocks */]
+                                                     [MAX_REF_FRAMES - 1],
+                                       int64_t txfm_cache[NB_TXFM_MODES]) {
   int i;
   BEST_SEG_INFO bsi;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
 
   vpx_memset(&bsi, 0, sizeof(bsi));
+  for (i = 0; i < NB_TXFM_MODES; i++)
+    txfm_cache[i] = INT64_MAX;
 
   bsi.segment_rd = best_rd;
   bsi.ref_mv = best_ref_mv;
@@ -2545,6 +2672,7 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
   bsi.mvp.as_int = best_ref_mv->as_int;
   bsi.mvthresh = mvthresh;
   bsi.mdcounts = mdcounts;
+  bsi.txfm_size = TX_4X4;
 
   for (i = 0; i < 16; i++)
     bsi.modes[i] = ZERO4X4;
@@ -2552,15 +2680,19 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
   if (cpi->compressor_speed == 0) {
     /* for now, we will keep the original segmentation order
        when in best quality mode */
-    rd_check_segment(cpi, x, &bsi, BLOCK_16X8, seg_mvs[BLOCK_16X8]);
-    rd_check_segment(cpi, x, &bsi, BLOCK_8X16, seg_mvs[BLOCK_8X16]);
-    rd_check_segment(cpi, x, &bsi, BLOCK_8X8,  seg_mvs[BLOCK_8X8]);
-    rd_check_segment(cpi, x, &bsi, BLOCK_4X4,  seg_mvs[BLOCK_4X4]);
+    rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
+                     seg_mvs[PARTITIONING_16X8], txfm_cache);
+    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
+                     seg_mvs[PARTITIONING_8X16], txfm_cache);
+    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
+                     seg_mvs[PARTITIONING_8X8], txfm_cache);
+    rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
+                     seg_mvs[PARTITIONING_4X4], txfm_cache);
   } else {
     int sr;
 
-    rd_check_segment(cpi, x, &bsi, BLOCK_8X8, seg_mvs[BLOCK_8X8]);
-
+    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
+                     seg_mvs[PARTITIONING_8X8], txfm_cache);
 
     if (bsi.segment_rd < best_rd) {
       int tmp_col_min = x->mv_col_min;
@@ -2576,34 +2708,40 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
       bsi.sv_mvp[2].as_int = bsi.mvs[8].as_int;
       bsi.sv_mvp[3].as_int = bsi.mvs[10].as_int;
 
-      /* Use 8x8 result as 16x8/8x16's predictor MV. Adjust search range according to the closeness of 2 MV. */
+      /* Use 8x8 result as 16x8/8x16's predictor MV. Adjust search range
+       * according to the closeness of 2 MV. */
       /* block 8X16 */
-      {
-        sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row)) >> 3, (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col)) >> 3);
-        vp8_cal_step_param(sr, &bsi.sv_istep[0]);
+      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row)) >> 3,
+                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col)) >> 3);
+      vp8_cal_step_param(sr, &bsi.sv_istep[0]);
 
-        sr = MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3, (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
-        vp8_cal_step_param(sr, &bsi.sv_istep[1]);
+      sr = MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
+                (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
+      vp8_cal_step_param(sr, &bsi.sv_istep[1]);
 
-        rd_check_segment(cpi, x, &bsi, BLOCK_8X16, seg_mvs[BLOCK_8X16]);
-      }
+      rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
+                       seg_mvs[PARTITIONING_8X16], txfm_cache);
 
       /* block 16X8 */
-      {
-        sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row)) >> 3, (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col)) >> 3);
-        vp8_cal_step_param(sr, &bsi.sv_istep[0]);
+      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row)) >> 3,
+                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col)) >> 3);
+      vp8_cal_step_param(sr, &bsi.sv_istep[0]);
 
-        sr = MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3, (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
-        vp8_cal_step_param(sr, &bsi.sv_istep[1]);
+      sr = MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
+                (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
+      vp8_cal_step_param(sr, &bsi.sv_istep[1]);
 
-        rd_check_segment(cpi, x, &bsi, BLOCK_16X8, seg_mvs[BLOCK_16X8]);
-      }
+      rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
+                       seg_mvs[PARTITIONING_16X8], txfm_cache);
 
       /* If 8x8 is better than 16x8/8x16, then do 4x4 search */
       /* Not skip 4x4 if speed=0 (good quality) */
-      if (cpi->sf.no_skip_block4x4_search || bsi.segment_num == BLOCK_8X8) { /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
+      if (cpi->sf.no_skip_block4x4_search ||
+          bsi.segment_num == PARTITIONING_8X8) {
+        /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
         bsi.mvp.as_int = bsi.sv_mvp[0].as_int;
-        rd_check_segment(cpi, x, &bsi, BLOCK_4X4, seg_mvs[BLOCK_4X4]);
+        rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
+                         seg_mvs[PARTITIONING_4X4], txfm_cache);
       }
 
       /* restore UMV window */
@@ -2627,8 +2765,12 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
   *returntotrate = bsi.r;
   *returndistortion = bsi.d;
   *returnyrate = bsi.segment_yrate;
+  *skippable = bsi.txfm_size == TX_4X4 ?
+                    mby_is_skippable_4x4(&x->e_mbd, 0) :
+                    mby_is_skippable_8x8(&x->e_mbd, 0);
 
   /* save partitions */
+  mbmi->txfm_size = bsi.txfm_size;
   mbmi->partitioning = bsi.segment_num;
   x->partition_info->count = vp8_mbsplit_count[bsi.segment_num];
 
@@ -2874,9 +3016,7 @@ void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffse
 void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x,
                        int_mv *best_ref_mv, int_mv *second_best_ref_mv) {
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-#if CONFIG_NEWMVENTROPY
   MV mv;
-#endif
 
   if (mbmi->mode == SPLITMV) {
     int i;
@@ -2884,7 +3024,6 @@ void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x,
     for (i = 0; i < x->partition_info->count; i++) {
       if (x->partition_info->bmi[i].mode == NEW4X4) {
         if (x->e_mbd.allow_high_precision_mv) {
-#if CONFIG_NEWMVENTROPY
           mv.row = (x->partition_info->bmi[i].mv.as_mv.row
                     - best_ref_mv->as_mv.row);
           mv.col = (x->partition_info->bmi[i].mv.as_mv.col
@@ -2898,20 +3037,7 @@ void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x,
             vp8_increment_nmv(&mv, &second_best_ref_mv->as_mv,
                               &cpi->NMVcount, 1);
           }
-#else
-          cpi->MVcount_hp[0][mv_max_hp + (x->partition_info->bmi[i].mv.as_mv.row
-                                          - best_ref_mv->as_mv.row)]++;
-          cpi->MVcount_hp[1][mv_max_hp + (x->partition_info->bmi[i].mv.as_mv.col
-                                          - best_ref_mv->as_mv.col)]++;
-          if (mbmi->second_ref_frame) {
-            cpi->MVcount_hp[0][mv_max_hp + (x->partition_info->bmi[i].second_mv.as_mv.row
-                                            - second_best_ref_mv->as_mv.row)]++;
-            cpi->MVcount_hp[1][mv_max_hp + (x->partition_info->bmi[i].second_mv.as_mv.col
-                                            - second_best_ref_mv->as_mv.col)]++;
-          }
-#endif
         } else {
-#if CONFIG_NEWMVENTROPY
           mv.row = (x->partition_info->bmi[i].mv.as_mv.row
                     - best_ref_mv->as_mv.row);
           mv.col = (x->partition_info->bmi[i].mv.as_mv.col
@@ -2925,24 +3051,11 @@ void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x,
             vp8_increment_nmv(&mv, &second_best_ref_mv->as_mv,
                               &cpi->NMVcount, 0);
           }
-#else
-          cpi->MVcount[0][mv_max + ((x->partition_info->bmi[i].mv.as_mv.row
-                                     - best_ref_mv->as_mv.row) >> 1)]++;
-          cpi->MVcount[1][mv_max + ((x->partition_info->bmi[i].mv.as_mv.col
-                                     - best_ref_mv->as_mv.col) >> 1)]++;
-          if (mbmi->second_ref_frame) {
-            cpi->MVcount[0][mv_max + ((x->partition_info->bmi[i].second_mv.as_mv.row
-                                       - second_best_ref_mv->as_mv.row) >> 1)]++;
-            cpi->MVcount[1][mv_max + ((x->partition_info->bmi[i].second_mv.as_mv.col
-                                       - second_best_ref_mv->as_mv.col) >> 1)]++;
-          }
-#endif
         }
       }
     }
   } else if (mbmi->mode == NEWMV) {
     if (x->e_mbd.allow_high_precision_mv) {
-#if CONFIG_NEWMVENTROPY
       mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
       mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
       vp8_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 1);
@@ -2951,20 +3064,7 @@ void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x,
         mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
         vp8_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 1);
       }
-#else
-      cpi->MVcount_hp[0][mv_max_hp + (mbmi->mv[0].as_mv.row
-                                      - best_ref_mv->as_mv.row)]++;
-      cpi->MVcount_hp[1][mv_max_hp + (mbmi->mv[0].as_mv.col
-                                      - best_ref_mv->as_mv.col)]++;
-      if (mbmi->second_ref_frame) {
-        cpi->MVcount_hp[0][mv_max_hp + (mbmi->mv[1].as_mv.row
-                                        - second_best_ref_mv->as_mv.row)]++;
-        cpi->MVcount_hp[1][mv_max_hp + (mbmi->mv[1].as_mv.col
-                                        - second_best_ref_mv->as_mv.col)]++;
-      }
-#endif
     } else {
-#if CONFIG_NEWMVENTROPY
       mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
       mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
       vp8_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 0);
@@ -2973,18 +3073,6 @@ void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x,
         mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
         vp8_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 0);
       }
-#else
-      cpi->MVcount[0][mv_max + ((mbmi->mv[0].as_mv.row
-                                 - best_ref_mv->as_mv.row) >> 1)]++;
-      cpi->MVcount[1][mv_max + ((mbmi->mv[0].as_mv.col
-                                 - best_ref_mv->as_mv.col) >> 1)]++;
-      if (mbmi->second_ref_frame) {
-        cpi->MVcount[0][mv_max + ((mbmi->mv[1].as_mv.row
-                                   - second_best_ref_mv->as_mv.row) >> 1)]++;
-        cpi->MVcount[1][mv_max + ((mbmi->mv[1].as_mv.col
-                                   - second_best_ref_mv->as_mv.col) >> 1)]++;
-      }
-#endif
     }
   }
 }
@@ -3158,9 +3246,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
                                  int hybrid_pred_diff,
                                  int64_t txfm_size_diff[NB_TXFM_MODES]) {
   MACROBLOCKD *xd = &x->e_mbd;
-#if CONFIG_TX_SELECT
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-#endif
 
   // Take a snapshot of the coding context so it can be
   // restored if we decide to encode this way
@@ -3180,9 +3266,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
   ctx->comp_pred_diff   = comp_pred_diff;
   ctx->hybrid_pred_diff = hybrid_pred_diff;
 
-#if CONFIG_TX_SELECT
   memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
-#endif
 }
 
 static void inter_mode_cost(VP8_COMP *cpi, MACROBLOCK *x, int this_mode,
@@ -3299,9 +3383,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
   int_mv ref_mv[MAX_REF_FRAMES] = {{0}};
 #endif
 
-#if CONFIG_SWITCHABLE_INTERP
   int switchable_filter_index = 0;
-#endif
 
   MB_PREDICTION_MODE uv_intra_mode;
   MB_PREDICTION_MODE uv_intra_mode_8x8 = 0;
@@ -3317,11 +3399,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
   unsigned char *y_buffer[4], *u_buffer[4], *v_buffer[4];
 
   unsigned int ref_costs[MAX_REF_FRAMES];
-  int_mv seg_mvs[BLOCK_MAX_SEGMENTS - 1][16 /* n_blocks */][MAX_REF_FRAMES - 1];
-
-#if CONFIG_HYBRIDTRANSFORM16X16
-  int best_txtype, rd_txtype;
-#endif
+  int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1];
 
   vpx_memset(mode8x8, 0, sizeof(mode8x8));
   vpx_memset(&frame_mv, 0, sizeof(frame_mv));
@@ -3336,7 +3414,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
   for (i = 0; i < NB_TXFM_MODES; i++)
     best_txfm_rd[i] = INT64_MAX;
 
-  for (i = 0; i < BLOCK_MAX_SEGMENTS - 1; i++) {
+  for (i = 0; i < NB_PARTITIONINGS; i++) {
     int j, k;
 
     for (j = 0; j < 16; j++)
@@ -3402,12 +3480,8 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
   // that depend on the current prediction etc.
   vp8_estimate_ref_frame_costs(cpi, segment_id, ref_costs);
 
-#if CONFIG_SWITCHABLE_INTERP
   for (mode_index = 0; mode_index < MAX_MODES;
        mode_index += (!switchable_filter_index)) {
-#else
-  for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
-#endif
     int64_t this_rd = INT64_MAX;
     int is_comp_pred;
     int disable_skip = 0, skippable = 0;
@@ -3435,19 +3509,16 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
 #if CONFIG_PRED_FILTER
     mbmi->pred_filter_enabled = 0;
 #endif
-#if CONFIG_SWITCHABLE_INTERP
     if (cpi->common.mcomp_filter_type == SWITCHABLE &&
         this_mode >= NEARESTMV && this_mode <= SPLITMV) {
       mbmi->interp_filter =
           vp8_switchable_interp[switchable_filter_index++];
       if (switchable_filter_index == VP8_SWITCHABLE_FILTERS)
         switchable_filter_index = 0;
-        //printf("Searching %d (%d)\n", this_mode, switchable_filter_index);
     } else {
       mbmi->interp_filter = cpi->common.mcomp_filter_type;
     }
     vp8_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-#endif
 
     // Test best rd so far against threshold for trying this mode.
     if (best_rd <= cpi->rd_threshes[mode_index])
@@ -3546,9 +3617,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
           // FIXME compound intra prediction
           vp8_build_intra_predictors_mby(&x->e_mbd);
           macro_block_yrd(cpi, x, &rate_y, &distortion, &skippable, txfm_cache);
-#if CONFIG_HYBRIDTRANSFORM16X16
-          rd_txtype = x->e_mbd.block[0].bmi.as_mode.tx_type;
-#endif
           rate2 += rate_y;
           distortion2 += distortion;
           rate2 += x->mbmode_cost[x->e_mbd.frame_type][mbmi->mode];
@@ -3592,11 +3660,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
         }
         break;
         case I8X8_PRED: {
-#if CONFIG_TX_SELECT
           int cost0 = vp8_cost_bit(cm->prob_tx[0], 0);
           int cost1 = vp8_cost_bit(cm->prob_tx[0], 1);
           int64_t tmp_rd_4x4s, tmp_rd_8x8s;
-#endif
           int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd;
           int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8;
           mbmi->txfm_size = TX_4X4;
@@ -3618,7 +3684,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
           txfm_cache[ONLY_4X4]  = tmp_rd_4x4;
           txfm_cache[ALLOW_8X8] = tmp_rd_8x8;
           txfm_cache[ALLOW_16X16] = tmp_rd_8x8;
-#if CONFIG_TX_SELECT
           tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0);
           tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0);
           txfm_cache[TX_MODE_SELECT] = tmp_rd_4x4s < tmp_rd_8x8s ? tmp_rd_4x4s : tmp_rd_8x8s;
@@ -3647,9 +3712,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
               mode8x8[1][3] = x->e_mbd.mode_info_context->bmi[10].as_mode.second;
 #endif
             }
-          } else
-#endif
-          if (cm->txfm_mode == ONLY_4X4) {
+          } else if (cm->txfm_mode == ONLY_4X4) {
             rate = r4x4;
             rate_y = tok4x4;
             distortion = d4x4;
@@ -3705,26 +3768,29 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
               (mbmi->ref_frame == GOLDEN_FRAME) ?
           cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
 
-      mbmi->txfm_size = TX_4X4; // FIXME use 8x8 in case of 8x8/8x16/16x8
-      tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
-                                               second_ref, best_yrd, mdcounts,
-                                               &rate, &rate_y, &distortion,
-                                               this_rd_thresh, seg_mvs);
+      tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
+                                           second_ref, best_yrd, mdcounts,
+                                           &rate, &rate_y, &distortion,
+                                           &skippable,
+                                           this_rd_thresh, seg_mvs,
+                                           txfm_cache);
       rate2 += rate;
       distortion2 += distortion;
 
-#if CONFIG_SWITCHABLE_INTERP
       if (cpi->common.mcomp_filter_type == SWITCHABLE)
         rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
             [get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
                 [vp8_switchable_interp_map[mbmi->interp_filter]];
-#endif
       // If even the 'Y' rd value of split is higher than best so far
       // then dont bother looking at UV
       if (tmp_rd < best_yrd) {
-        rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, cpi->common.full_pixel);
+        int uv_skippable;
+
+        rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
+                       cpi->common.full_pixel);
         rate2 += rate_uv;
         distortion2 += distortion_uv;
+        skippable = skippable && uv_skippable;
       } else {
         this_rd = INT64_MAX;
         disable_skip = 1;
@@ -3852,13 +3918,11 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
       rate2 += vp8_cost_bit(cpi->common.prob_pred_filter_off,
                             xd->mode_info_context->mbmi.pred_filter_enabled);
 #endif
-#if CONFIG_SWITCHABLE_INTERP
       if (cpi->common.mcomp_filter_type == SWITCHABLE)
         rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
             [get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
             [vp8_switchable_interp_map[
             x->e_mbd.mode_info_context->mbmi.interp_filter]];
-#endif
 
       /* We don't include the cost of the second reference here, because there are only
        * three options: Last/Golden, ARF/Last or Golden/ARF, or in other words if you
@@ -3883,8 +3947,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
         if (threshold < x->encode_breakout)
           threshold = x->encode_breakout;
 
-        var = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
-              (*(b->base_src), b->src_stride,
+        var = vp8_variance16x16(*(b->base_src), b->src_stride,
                x->e_mbd.predictor, 16, &sse);
 
         if (sse < threshold) {
@@ -3894,7 +3957,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
           if ((sse - var < q2dc *q2dc >> 4) ||
               (sse / 2 > var && sse - var < 64)) {
             // Check u and v to make sure skip is ok
-            int sse2 =  VP8_UVSSE(x, IF_RTCD(&cpi->rtcd.variance));
+            int sse2 =  vp8_uvsse(x);
             if (sse2 * 2 < threshold) {
               x->skip = 1;
               distortion2 = sse + sse2;
@@ -3906,22 +3969,22 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
 
               disable_skip = 1;
               this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-
-              break;
             }
           }
         }
       }
 
-      vp8_build_1st_inter16x16_predictors_mbuv(&x->e_mbd, &xd->predictor[256],
-                                               &xd->predictor[320], 8);
-      if (is_comp_pred)
-        vp8_build_2nd_inter16x16_predictors_mbuv(&x->e_mbd,
-                                                 &xd->predictor[256],
+      if (!x->skip) {
+        vp8_build_1st_inter16x16_predictors_mbuv(&x->e_mbd, &xd->predictor[256],
                                                  &xd->predictor[320], 8);
-      inter_mode_cost(cpi, x, this_mode, &rate2, &distortion2,
-                      &rate_y, &distortion, &rate_uv, &distortion_uv,
-                      &skippable, txfm_cache);
+        if (is_comp_pred)
+          vp8_build_2nd_inter16x16_predictors_mbuv(&x->e_mbd,
+                                                   &xd->predictor[256],
+                                                   &xd->predictor[320], 8);
+        inter_mode_cost(cpi, x, this_mode, &rate2, &distortion2,
+                        &rate_y, &distortion, &rate_uv, &distortion_uv,
+                        &skippable, txfm_cache);
+      }
       if (is_comp_pred)
         mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
       else
@@ -4019,10 +4082,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
           // Note index of best mode so far
           best_mode_index = mode_index;
 
-#if CONFIG_HYBRIDTRANSFORM16X16
-          best_txtype = rd_txtype;
-#endif
-
           if (this_mode <= B_PRED) {
             if (mbmi->txfm_size != TX_4X4
                 && this_mode != B_PRED
@@ -4106,7 +4165,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
       if (!mode_excluded && this_rd != INT64_MAX) {
         for (i = 0; i < NB_TXFM_MODES; i++) {
           int64_t adj_rd;
-          if (this_mode != B_PRED && this_mode != SPLITMV) {
+          if (this_mode != B_PRED) {
             adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->txfm_mode];
           } else {
             adj_rd = this_rd;
@@ -4130,7 +4189,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
   else
     ++cpi->pred_filter_off_count;
 #endif
-#if CONFIG_SWITCHABLE_INTERP
   if (cpi->common.mcomp_filter_type == SWITCHABLE &&
       best_mbmode.mode >= NEARESTMV &&
       best_mbmode.mode <= SPLITMV) {
@@ -4138,7 +4196,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
         [get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
         [vp8_switchable_interp_map[best_mbmode.interp_filter]];
   }
-#endif
 
   // Reduce the activation RD thresholds for the best choice mode
   if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
@@ -4164,11 +4221,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
       (cpi->oxcf.arnr_max_frames == 0) &&
       (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
     mbmi->mode = ZEROMV;
-#if CONFIG_TX_SELECT
     if (cm->txfm_mode != TX_MODE_SELECT)
       mbmi->txfm_size = cm->txfm_mode;
     else
-#endif
       mbmi->txfm_size = TX_16X16;
     mbmi->ref_frame = ALTREF_FRAME;
     mbmi->mv[0].as_int = 0;
@@ -4195,11 +4250,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
     }
   }
 
-#if CONFIG_HYBRIDTRANSFORM16X16
-  if (best_mbmode.mode < I8X8_PRED)
-    xd->mode_info_context->bmi[0].as_mode.tx_type = best_txtype;
-#endif
-
   if (best_mbmode.mode == I8X8_PRED)
     set_i8x8_block_modes(x, mode8x8);
 
@@ -4223,7 +4273,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
       best_pred_diff[i] = best_rd - best_pred_rd[i];
   }
 
-#if CONFIG_TX_SELECT
   if (!x->skip) {
     for (i = 0; i < NB_TXFM_MODES; i++) {
       if (best_txfm_rd[i] == INT64_MAX)
@@ -4234,7 +4283,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
   } else {
     vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
   }
-#endif
 
 end:
   store_coding_context(x, &x->mb_context[xd->mb_index], best_mode_index, &best_partition,
@@ -4304,10 +4352,6 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x,
   TX_SIZE txfm_size_16x16;
   int i;
 
-#if CONFIG_HYBRIDTRANSFORM16X16
-  int best_txtype;
-#endif
-
   mbmi->ref_frame = INTRA_FRAME;
   rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv,
                           &uv_intra_skippable);
@@ -4329,10 +4373,6 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x,
                                           &rate16x16_tokenonly, &dist16x16,
                                           &y_intra16x16_skippable, txfm_cache);
   mode16x16 = mbmi->mode;
-#if CONFIG_HYBRIDTRANSFORM16X16
-  best_txtype = xd->block[0].bmi.as_mode.tx_type;
-  xd->mode_info_context->bmi[0].as_mode.tx_type = best_txtype;
-#endif
   txfm_size_16x16 = mbmi->txfm_size;
 
   // FIXME(rbultje) support transform-size selection
@@ -4373,10 +4413,8 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x,
            vp8_cost_bit(get_pred_prob(cm, xd, PRED_MBSKIP), 1);
     dist = dist16x16 + (distuv8x8 >> 2);
     mbmi->txfm_size = txfm_size_16x16;
-#if CONFIG_TX_SELECT
     memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
            sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
-#endif
   } else if (error8x8 > error16x16) {
     if (error4x4 < error16x16) {
       rate = rateuv;
@@ -4393,24 +4431,16 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x,
       mbmi->mode = B_PRED;
       mbmi->txfm_size = TX_4X4;
       dist = dist4x4 + (distuv >> 2);
-#if CONFIG_TX_SELECT
       memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
              sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
-#endif
     } else {
       mbmi->txfm_size = txfm_size_16x16;
       mbmi->mode = mode16x16;
       rate = rate16x16 + rateuv8x8;
       dist = dist16x16 + (distuv8x8 >> 2);
-#if CONFIG_HYBRIDTRANSFORM16X16
-      // save this into supermacroblock coding decision buffer
-      xd->mode_info_context->bmi[0].as_mode.tx_type = best_txtype;
-#endif
-#if CONFIG_TX_SELECT
       for (i = 0; i < NB_TXFM_MODES; i++) {
         x->mb_context[xd->mb_index].txfm_rd_diff[i] = error16x16 - txfm_cache[i];
       }
-#endif
     }
     if (cpi->common.mb_no_coeff_skip)
       rate += vp8_cost_bit(get_pred_prob(cm, xd, PRED_MBSKIP), 0);
@@ -4430,10 +4460,8 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x,
       mbmi->mode = B_PRED;
       mbmi->txfm_size = TX_4X4;
       dist = dist4x4 + (distuv >> 2);
-#if CONFIG_TX_SELECT
       memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
              sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
-#endif
     } else {
       // FIXME(rbultje) support transform-size selection
       mbmi->mode = I8X8_PRED;
@@ -4441,10 +4469,8 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x,
       set_i8x8_block_modes(x, mode8x8);
       rate = rate8x8 + rateuv;
       dist = dist8x8 + (distuv >> 2);
-#if CONFIG_TX_SELECT
       memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
              sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
-#endif
     }
     if (cpi->common.mb_no_coeff_skip)
       rate += vp8_cost_bit(get_pred_prob(cm, xd, PRED_MBSKIP), 0);
@@ -4801,8 +4827,8 @@ int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x,
             if (threshold < x->encode_breakout)
               threshold = x->encode_breakout;
 
-            var = VARIANCE_INVOKE(&cpi->rtcd.variance, var32x32)(*(b->base_src),
-              b->src_stride, xd->dst.y_buffer, xd->dst.y_stride, &sse);
+            var = vp8_variance32x32(*(b->base_src), b->src_stride,
+                                    xd->dst.y_buffer, xd->dst.y_stride, &sse);
 
             if (sse < threshold) {
               unsigned int q2dc = xd->block[24].dequant[0];
@@ -4812,11 +4838,9 @@ int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x,
                   (sse / 2 > var && sse - var < 64)) {
                 // Check u and v to make sure skip is ok
                 unsigned int sse2, sse3;
-                var += VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
-                                  (x->src.u_buffer, x->src.uv_stride,
+                var += vp8_variance16x16(x->src.u_buffer, x->src.uv_stride,
                                    xd->dst.u_buffer, xd->dst.uv_stride, &sse2);
-                var += VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
-                                  (x->src.v_buffer, x->src.uv_stride,
+                var += vp8_variance16x16(x->src.v_buffer, x->src.uv_stride,
                                    xd->dst.v_buffer, xd->dst.uv_stride, &sse3);
                 sse2 += sse3;
                 if (sse2 * 2 < threshold) {