-rwxr-xr-x  configure                 |   1
-rw-r--r--  vp8/common/onyxc_int.h    |  15
-rw-r--r--  vp8/decoder/decodemv.c    |  47
-rw-r--r--  vp8/decoder/decodframe.c  |  61
-rw-r--r--  vp8/encoder/bitstream.c   | 135
-rw-r--r--  vp8/encoder/block.h       |   3
-rw-r--r--  vp8/encoder/encodeframe.c | 257
-rw-r--r--  vp8/encoder/onyx_if.c     |   4
-rw-r--r--  vp8/encoder/onyx_int.h    |  11
-rw-r--r--  vp8/encoder/picklpf.c     |   4
-rw-r--r--  vp8/encoder/ratectrl.c    |  14
-rw-r--r--  vp8/encoder/rdopt.c       | 448
12 files changed, 730 insertions, 270 deletions
diff --git a/configure b/configure
index 168da07f4..575e6d2e4 100755
--- a/configure
+++ b/configure
@@ -229,6 +229,7 @@ EXPERIMENT_LIST="
new_mvref
hybridtransform16x16
newmventropy
+ tx_select
"
CONFIG_LIST="
external_build
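
[Annotation, not part of the patch: entries in EXPERIMENT_LIST follow the usual libvpx convention, where each name becomes a --enable-<name> configure flag that is only honoured together with --enable-experimental and, when enabled, defines a CONFIG_<NAME> preprocessor symbol. Assuming that convention, a build exercising this patch would be configured roughly as

    ./configure --enable-experimental --enable-tx_select --enable-tx16x16

which defines CONFIG_TX_SELECT=1 (and CONFIG_TX16X16=1 for the 16x16 paths) for all the #if blocks below.]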
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index 7c6093b41..e3361556f 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -163,10 +163,16 @@ typedef enum {
NB_PREDICTION_TYPES = 3,
} COMPPREDMODE_TYPE;
-/* TODO: allows larger transform */
typedef enum {
ONLY_4X4 = 0,
- ALLOW_8X8 = 1
+ ALLOW_8X8 = 1,
+#if CONFIG_TX16X16
+ ALLOW_16X16 = 2,
+#endif
+#if CONFIG_TX_SELECT
+ TX_MODE_SELECT = 2 + CONFIG_TX16X16,
+#endif
+ NB_TXFM_MODES = 2 + CONFIG_TX16X16 + CONFIG_TX_SELECT,
} TXFM_MODE;
typedef struct VP8_COMMON_RTCD {
@@ -306,6 +312,11 @@ typedef struct VP8Common {
vp8_prob prob_comppred[COMP_PRED_CONTEXTS];
+#if CONFIG_TX_SELECT
+ // FIXME contextualize
+ vp8_prob prob_tx[TX_SIZE_MAX - 1];
+#endif
+
vp8_prob mbskip_pred_probs[MBSKIP_CONTEXTS];
FRAME_CONTEXT lfc_a; /* last alt ref entropy */
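
[Annotation: with both experiments enabled, the preprocessor arithmetic in the enum above resolves to a dense value set (an illustrative expansion, assuming CONFIG_TX16X16=1 and CONFIG_TX_SELECT=1):

    ONLY_4X4       = 0   /* every macroblock uses the 4x4 transform */
    ALLOW_8X8      = 1   /* transform sizes up to 8x8 */
    ALLOW_16X16    = 2   /* transform sizes up to 16x16 */
    TX_MODE_SELECT = 3   /* 2 + CONFIG_TX16X16: per-MB signalled choice */
    NB_TXFM_MODES  = 4   /* 2 + 1 + 1 */

With CONFIG_TX16X16 off, TX_MODE_SELECT collapses to 2 and NB_TXFM_MODES to 3, so the values stay contiguous either way.]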
diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c
index 7f7567e4e..ddf5301f9 100644
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -170,6 +170,27 @@ static void vp8_kfread_modes(VP8D_COMP *pbi,
m->mbmi.second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
#endif
+#if CONFIG_TX_SELECT
+ if (cm->txfm_mode == TX_MODE_SELECT && m->mbmi.mb_skip_coeff == 0 &&
+ m->mbmi.mode <= TM_PRED) {
+ // FIXME(rbultje) code ternary symbol once all experiments are merged
+ m->mbmi.txfm_size = vp8_read(bc, cm->prob_tx[0]);
+#if CONFIG_TX16X16
+ if (m->mbmi.txfm_size != TX_4X4)
+ m->mbmi.txfm_size += vp8_read(bc, cm->prob_tx[1]);
+#endif
+ } else
+#endif
+#if CONFIG_TX16X16
+ if (cm->txfm_mode >= ALLOW_16X16 && m->mbmi.mode <= TM_PRED) {
+ m->mbmi.txfm_size = TX_16X16;
+ } else
+#endif
+ if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != B_PRED) {
+ m->mbmi.txfm_size = TX_8X8;
+ } else {
+ m->mbmi.txfm_size = TX_4X4;
+ }
}
#if CONFIG_NEWMVENTROPY
@@ -1263,6 +1284,32 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
#endif
}
+#if CONFIG_TX_SELECT
+ if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
+ ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
+ (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
+ // FIXME(rbultje) code ternary symbol once all experiments are merged
+ mbmi->txfm_size = vp8_read(bc, cm->prob_tx[0]);
+#if CONFIG_TX16X16
+ if (mbmi->txfm_size != TX_4X4)
+ mbmi->txfm_size += vp8_read(bc, cm->prob_tx[1]);
+#endif
+ } else
+#endif
+#if CONFIG_TX16X16
+ if (cm->txfm_mode >= ALLOW_16X16 &&
+ ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
+ (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
+ mbmi->txfm_size = TX_16X16;
+ } else
+#endif
+ if (cm->txfm_mode >= ALLOW_8X8 &&
+ ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode != B_PRED) ||
+ (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
+ mbmi->txfm_size = TX_8X8;
+ } else {
+ mbmi->txfm_size = TX_4X4;
+ }
}
void vp8_decode_mode_mvs(VP8D_COMP *pbi) {
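
[Annotation: both read sites in this file share the same shape; a minimal standalone sketch of it (hypothetical helper, relying on vp8_read() returning the decoded bit for a given zero-branch probability):

    static TX_SIZE read_txfm_size(vp8_reader *bc, const vp8_prob prob_tx[2]) {
      /* first binary decision: 4x4 vs. anything larger */
      TX_SIZE sz = (TX_SIZE)vp8_read(bc, prob_tx[0]);
      /* second binary decision, only taken when larger: 8x8 vs. 16x16 */
      if (sz != TX_4X4)
        sz += vp8_read(bc, prob_tx[1]);
      return sz;  /* TX_4X4, TX_8X8 or TX_16X16 */
    }

This is the ternary symbol the FIXME(rbultje) comments refer to: one three-way choice currently emitted as up to two binary ones.]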
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 34b43ce3b..a186f6939 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -231,39 +231,6 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
if (xd->segmentation_enabled)
mb_init_dequantizer(pbi, xd);
- if (pbi->common.frame_type == KEY_FRAME) {
-#if CONFIG_TX16X16 || CONFIG_HYBRIDTRANSFORM16X16
- if (xd->mode_info_context->mbmi.mode <= TM_PRED ||
- xd->mode_info_context->mbmi.mode == NEWMV ||
- xd->mode_info_context->mbmi.mode == ZEROMV ||
- xd->mode_info_context->mbmi.mode == NEARMV ||
- xd->mode_info_context->mbmi.mode == NEARESTMV)
- xd->mode_info_context->mbmi.txfm_size = TX_16X16;
- else
-#endif
- if (pbi->common.txfm_mode == ALLOW_8X8 &&
- xd->mode_info_context->mbmi.mode != B_PRED)
- xd->mode_info_context->mbmi.txfm_size = TX_8X8;
- else
- xd->mode_info_context->mbmi.txfm_size = TX_4X4;
- } else {
-#if CONFIG_TX16X16 || CONFIG_HYBRIDTRANSFORM16X16
- if (xd->mode_info_context->mbmi.mode <= TM_PRED ||
- xd->mode_info_context->mbmi.mode == NEWMV ||
- xd->mode_info_context->mbmi.mode == ZEROMV ||
- xd->mode_info_context->mbmi.mode == NEARMV ||
- xd->mode_info_context->mbmi.mode == NEARESTMV)
- xd->mode_info_context->mbmi.txfm_size = TX_16X16;
- else
-#endif
- if (pbi->common.txfm_mode == ALLOW_8X8 &&
- xd->mode_info_context->mbmi.mode != B_PRED &&
- xd->mode_info_context->mbmi.mode != SPLITMV)
- xd->mode_info_context->mbmi.txfm_size = TX_8X8;
- else
- xd->mode_info_context->mbmi.txfm_size = TX_4X4;
- }
-
#if CONFIG_SUPERBLOCKS
if (xd->mode_info_context->mbmi.encoded_as_sb) {
xd->mode_info_context->mbmi.txfm_size = TX_8X8;
@@ -1006,7 +973,7 @@ static void read_coef_probs(VP8D_COMP *pbi) {
}
#endif
- if (pbi->common.txfm_mode == ALLOW_8X8 && vp8_read_bit(bc)) {
+ if (pbi->common.txfm_mode != ONLY_4X4 && vp8_read_bit(bc)) {
// read coef probability tree
for (i = 0; i < BLOCK_TYPES_8X8; i++)
for (j = !i; j < COEF_BANDS; j++)
@@ -1025,7 +992,7 @@ static void read_coef_probs(VP8D_COMP *pbi) {
}
}
#if CONFIG_HYBRIDTRANSFORM8X8
- if (pbi->common.txfm_mode == ALLOW_8X8 && vp8_read_bit(bc)) {
+ if (pbi->common.txfm_mode != ONLY_4X4 && vp8_read_bit(bc)) {
// read coef probability tree
for (i = 0; i < BLOCK_TYPES_8X8; i++)
for (j = !i; j < COEF_BANDS; j++)
@@ -1047,7 +1014,7 @@ static void read_coef_probs(VP8D_COMP *pbi) {
#if CONFIG_TX16X16
// 16x16
- if (vp8_read_bit(bc)) {
+ if (pbi->common.txfm_mode > ALLOW_8X8 && vp8_read_bit(bc)) {
// read coef probability tree
for (i = 0; i < BLOCK_TYPES_16X16; ++i)
for (j = !i; j < COEF_BANDS; ++j)
@@ -1066,7 +1033,7 @@ static void read_coef_probs(VP8D_COMP *pbi) {
}
}
#if CONFIG_HYBRIDTRANSFORM16X16
- if (vp8_read_bit(bc)) {
+ if (pbi->common.txfm_mode > ALLOW_8X8 && vp8_read_bit(bc)) {
// read coef probability tree
for (i = 0; i < BLOCK_TYPES_16X16; ++i)
for (j = !i; j < COEF_BANDS; ++j)
@@ -1314,7 +1281,27 @@ int vp8_decode_frame(VP8D_COMP *pbi) {
#endif
/* Read the loop filter level and type */
+#if CONFIG_TX_SELECT
+#if CONFIG_TX16X16
+ pc->txfm_mode = vp8_read_literal(bc, 2);
+#else
+ pc->txfm_mode = vp8_read_bit(bc);
+ if (pc->txfm_mode)
+ pc->txfm_mode += vp8_read_bit(bc);
+#endif
+ if (pc->txfm_mode == TX_MODE_SELECT) {
+ pc->prob_tx[0] = vp8_read_literal(bc, 8);
+#if CONFIG_TX16X16
+ pc->prob_tx[1] = vp8_read_literal(bc, 8);
+#endif
+ }
+#else
pc->txfm_mode = (TXFM_MODE) vp8_read_bit(bc);
+#if CONFIG_TX16X16
+ if (pc->txfm_mode == ALLOW_8X8)
+ pc->txfm_mode = ALLOW_16X16;
+#endif
+#endif
pc->filter_type = (LOOPFILTERTYPE) vp8_read_bit(bc);
pc->filter_level = vp8_read_literal(bc, 6);
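
[Annotation, summarising the header reads above (assuming CONFIG_TX_SELECT and CONFIG_TX16X16 are both enabled):

    /* 2 bits : txfm_mode  (0 = ONLY_4X4, 1 = ALLOW_8X8,
     *                      2 = ALLOW_16X16, 3 = TX_MODE_SELECT)
     * 8 bits : prob_tx[0], present only when txfm_mode == TX_MODE_SELECT
     * 8 bits : prob_tx[1], present only when txfm_mode == TX_MODE_SELECT */

i.e. at most 18 fixed header bits per frame; without CONFIG_TX16X16 the mode is coded as one or two flag bits and only prob_tx[0] follows.]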
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 2f748015f..cf3acef12 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -1290,6 +1290,23 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
}
}
+#if CONFIG_TX_SELECT
+ if (((rf == INTRA_FRAME && mode <= TM_PRED) ||
+ (rf != INTRA_FRAME && mode != SPLITMV)) &&
+ pc->txfm_mode == TX_MODE_SELECT &&
+ !((pc->mb_no_coeff_skip && mi->mb_skip_coeff) ||
+ (segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+ get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+ TX_SIZE sz = mi->txfm_size;
+ // FIXME(rbultje) code ternary symbol once all experiments are merged
+ vp8_write(w, sz != TX_4X4, pc->prob_tx[0]);
+#if CONFIG_TX16X16
+ if (sz != TX_4X4)
+ vp8_write(w, sz != TX_8X8, pc->prob_tx[1]);
+#endif
+ }
+#endif
+
#if CONFIG_SUPERBLOCKS
if (m->mbmi.encoded_as_sb) {
assert(!i);
@@ -1411,6 +1428,7 @@ static void write_kfmodes(VP8_COMP *cpi) {
vp8_encode_bool(bc, skip_coeff,
get_pred_prob(c, xd, PRED_MBSKIP));
}
+
#if CONFIG_SUPERBLOCKS
if (m->mbmi.encoded_as_sb) {
sb_kfwrite_ymode(bc, ym,
@@ -1468,6 +1486,21 @@ static void write_kfmodes(VP8_COMP *cpi) {
} else
write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
+#if CONFIG_TX_SELECT
+ if (ym <= TM_PRED && c->txfm_mode == TX_MODE_SELECT &&
+ !((c->mb_no_coeff_skip && m->mbmi.mb_skip_coeff) ||
+ (segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+ get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+ TX_SIZE sz = m->mbmi.txfm_size;
+ // FIXME(rbultje) code ternary symbol once all experiments are merged
+ vp8_write(bc, sz != TX_4X4, c->prob_tx[0]);
+#if CONFIG_TX16X16
+ if (sz != TX_4X4)
+ vp8_write(bc, sz != TX_8X8, c->prob_tx[1]);
+#endif
+ }
+#endif
+
#if CONFIG_SUPERBLOCKS
if (m->mbmi.encoded_as_sb) {
assert(!i);
@@ -1564,7 +1597,7 @@ void build_coeff_contexts(VP8_COMP *cpi) {
#endif
- if (cpi->common.txfm_mode == ALLOW_8X8) {
+ if (cpi->common.txfm_mode != ONLY_4X4) {
for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
for (j = 0; j < COEF_BANDS; ++j) {
for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
@@ -1618,22 +1651,23 @@ void build_coeff_contexts(VP8_COMP *cpi) {
}
#if CONFIG_TX16X16
- //16x16
- for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
- for (j = 0; j < COEF_BANDS; ++j) {
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
- continue;
- vp8_tree_probs_from_distribution(
- MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
- cpi->frame_coef_probs_16x16[i][j][k],
- cpi->frame_branch_ct_16x16[i][j][k],
- cpi->coef_counts_16x16[i][j][k], 256, 1);
+ if (cpi->common.txfm_mode > ALLOW_8X8) {
+ for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
+ for (j = 0; j < COEF_BANDS; ++j) {
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+ continue;
+ vp8_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
+ cpi->frame_coef_probs_16x16[i][j][k],
+ cpi->frame_branch_ct_16x16[i][j][k],
+ cpi->coef_counts_16x16[i][j][k], 256, 1);
#ifdef ENTROPY_STATS
- if (!cpi->dummy_packing)
- for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
- context_counters_16x16[i][j][k][t] += cpi->coef_counts_16x16[i][j][k][t];
+ if (!cpi->dummy_packing)
+ for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+ context_counters_16x16[i][j][k][t] += cpi->coef_counts_16x16[i][j][k][t];
#endif
+ }
}
}
}
@@ -1746,8 +1780,7 @@ static void update_coef_probs2(VP8_COMP *cpi) {
}
}
- if (cpi->common.txfm_mode != ALLOW_8X8) return;
-
+ if (cpi->common.txfm_mode != ONLY_4X4)
for (t = 0; t < ENTROPY_NODES; ++t) {
/* dry run to see if there is any update at all needed */
savings = 0;
@@ -2024,7 +2057,7 @@ static void update_coef_probs(VP8_COMP *cpi) {
#endif
/* do not do this if not even allowed */
- if (cpi->common.txfm_mode == ALLOW_8X8) {
+ if (cpi->common.txfm_mode != ONLY_4X4) {
/* dry run to see if update is necessary */
update[0] = update[1] = 0;
savings = 0;
@@ -2177,7 +2210,7 @@ static void update_coef_probs(VP8_COMP *cpi) {
}
#if CONFIG_TX16X16
- // 16x16
+ if (cpi->common.txfm_mode > ALLOW_8X8) {
/* dry run to see if update is necessary */
update[0] = update[1] = 0;
savings = 0;
@@ -2327,6 +2360,7 @@ static void update_coef_probs(VP8_COMP *cpi) {
}
}
#endif
+ }
#endif
}
@@ -2616,7 +2650,68 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
}
#endif
- vp8_write_bit(bc, pc->txfm_mode);
+#if CONFIG_TX_SELECT
+ {
+#if CONFIG_TX16X16
+ int cnt = cpi->txfm_count[0] + cpi->txfm_count[1] + cpi->txfm_count[2];
+ if (cnt && pc->txfm_mode == TX_MODE_SELECT) {
+ int prob = (255 * (cpi->txfm_count[1] + cpi->txfm_count[2]) + (cnt >> 1)) / cnt;
+ if (prob <= 1) {
+ pc->prob_tx[0] = 1;
+ } else if (prob >= 255) {
+ pc->prob_tx[0] = 255;
+ } else {
+ pc->prob_tx[0] = prob;
+ }
+ pc->prob_tx[0] = 256 - pc->prob_tx[0];
+ } else {
+ pc->prob_tx[0] = 128;
+ }
+ cnt -= cpi->txfm_count[0];
+ if (cnt && pc->txfm_mode == TX_MODE_SELECT) {
+ int prob = (255 * cpi->txfm_count[2] + (cnt >> 1)) / cnt;
+ if (prob <= 1) {
+ pc->prob_tx[1] = 1;
+ } else if (prob >= 255) {
+ pc->prob_tx[1] = 255;
+ } else {
+ pc->prob_tx[1] = prob;
+ }
+ pc->prob_tx[1] = 256 - pc->prob_tx[1];
+ } else {
+ pc->prob_tx[1] = 128;
+ }
+ vp8_write_literal(bc, pc->txfm_mode, 2);
+ if (pc->txfm_mode == TX_MODE_SELECT) {
+ vp8_write_literal(bc, pc->prob_tx[0], 8);
+ vp8_write_literal(bc, pc->prob_tx[1], 8);
+ }
+#else
+ int cnt = cpi->txfm_count[0] + cpi->txfm_count[1];
+ if (cnt && pc->txfm_mode == TX_MODE_SELECT) {
+ int prob = (255 * cpi->txfm_count[1] + (cnt >> 1)) / cnt;
+ if (prob <= 1) {
+ pc->prob_tx[0] = 1;
+ } else if (prob >= 255) {
+ pc->prob_tx[0] = 255;
+ } else {
+ pc->prob_tx[0] = prob;
+ }
+ pc->prob_tx[0] = 256 - pc->prob_tx[0];
+ } else {
+ pc->prob_tx[0] = 128;
+ }
+ vp8_write_bit(bc, pc->txfm_mode != 0);
+ if (pc->txfm_mode)
+ vp8_write_bit(bc, pc->txfm_mode - 1);
+ if (pc->txfm_mode == TX_MODE_SELECT) {
+ vp8_write_literal(bc, pc->prob_tx[0], 8);
+ }
+#endif
+ }
+#else
+ vp8_write_bit(bc, !!pc->txfm_mode);
+#endif
// Encode the loop filter level and type
vp8_write_bit(bc, pc->filter_type);
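
[Annotation: the probability derivation above is easiest to follow with numbers. With hypothetical per-frame counts txfm_count = {60, 30, 10} for 4x4/8x8/16x16:

    cnt  = 60 + 30 + 10                      /* = 100 coded decisions   */
    prob = (255 * (30 + 10) + 50) / 100      /* = 102 ~ P(size != 4x4)  */
    prob_tx[0] = 256 - 102                   /* = 154, zero-branch prob */
    cnt -= 60                                /* = 40 non-4x4 MBs        */
    prob = (255 * 10 + 20) / 40              /* = 64 ~ P(16x16 | !4x4)  */
    prob_tx[1] = 256 - 64                    /* = 192                   */

The 256 - prob flip is needed because vp8_write() takes the probability of the zero branch while the counts estimate the one branch (sz != TX_4X4, sz != TX_8X8); the clamp to [1, 255] keeps both branches codable.]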
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index a204c8b60..bd29eeee7 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -85,6 +85,9 @@ typedef struct {
int hybrid_pred_diff;
int comp_pred_diff;
int single_pred_diff;
+#if CONFIG_TX_SELECT
+ int64_t txfm_rd_diff[NB_TXFM_MODES];
+#endif
} PICK_MODE_CONTEXT;
typedef struct {
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 584570da9..fc6dc66d6 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -417,6 +417,18 @@ static void update_state(VP8_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
}
+#if CONFIG_TX_SELECT
+ {
+ int segment_id = mbmi->segment_id;
+ if (!segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+ get_segdata(xd, segment_id, SEG_LVL_EOB)) {
+ for (i = 0; i < NB_TXFM_MODES; i++) {
+ cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i];
+ }
+ }
+ }
+#endif
+
if (cpi->common.frame_type == KEY_FRAME) {
// Restore the coding modes to that held in the coding context
// if (mb_mode == B_PRED)
@@ -606,9 +618,6 @@ static void pick_mb_modes(VP8_COMP *cpi,
x->active_ptr = cpi->active_map + map_index;
- /* force 4x4 transform for mode selection */
- mbmi->txfm_size = TX_4X4; // TODO IS this right??
-
#if CONFIG_SUPERBLOCKS
xd->mode_info_context->mbmi.encoded_as_sb = 0;
#endif
@@ -1395,7 +1404,10 @@ static void encode_frame_internal(VP8_COMP *cpi) {
vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff));
vpx_memset(cpi->single_pred_count, 0, sizeof(cpi->single_pred_count));
vpx_memset(cpi->comp_pred_count, 0, sizeof(cpi->comp_pred_count));
-
+#if CONFIG_TX_SELECT
+ vpx_memset(cpi->txfm_count, 0, sizeof(cpi->txfm_count));
+ vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff));
+#endif
{
struct vpx_usec_timer emr_timer;
vpx_usec_timer_start(&emr_timer);
@@ -1458,6 +1470,7 @@ static int check_dual_ref_flags(VP8_COMP *cpi) {
void vp8_encode_frame(VP8_COMP *cpi) {
if (cpi->sf.RD) {
int i, frame_type, pred_type;
+ TXFM_MODE txfm_type;
/*
* This code does a single RD pass over the whole frame assuming
@@ -1465,9 +1478,8 @@ void vp8_encode_frame(VP8_COMP *cpi) {
* worked best for that type of frame in the past.
* It also predicts whether another coding mode would have worked
* better than this coding mode. If that is the case, it remembers
- * that for subsequent frames. If the difference is above a certain
- * threshold, it will actually re-encode the current frame using
- * that different coding mode.
+ * that for subsequent frames.
+ * It does the same analysis for transform size selection as well.
*/
if (cpi->common.frame_type == KEY_FRAME)
frame_type = 0;
@@ -1478,6 +1490,7 @@ void vp8_encode_frame(VP8_COMP *cpi) {
else
frame_type = 2;
+ /* prediction (compound, single or hybrid) mode selection */
if (frame_type == 3)
pred_type = SINGLE_PREDICTION_ONLY;
else if (cpi->rd_prediction_type_threshes[frame_type][1] >
@@ -1492,15 +1505,111 @@ void vp8_encode_frame(VP8_COMP *cpi) {
else
pred_type = HYBRID_PREDICTION;
+ /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */
+#if CONFIG_LOSSLESS
+ if (cpi->oxcf.lossless) {
+ txfm_type = ONLY_4X4;
+ } else
+#endif
+#if CONFIG_TX_SELECT
+ /* FIXME (rbultje)
+ * this is a hack (no really), basically to work around the complete
+ * nonsense coefficient cost prediction for keyframes. The probabilities
+ * are reset to defaults, and thus we basically have no idea how expensive
+ * a 4x4 vs. 8x8 will really be. The result is that any estimate of which
+ * of the two is better is utterly bogus.
+ * I'd like to eventually remove this hack, but in order to do that, we
+ * need to move the frame reset code from the frame encode init to the
+ * bitstream write code, or alternatively keep a backup of the previous
+ * keyframe's probabilities as an estimate of what the current keyframe's
+ * coefficient cost distributions may look like. */
+ if (frame_type == 0) {
+#if CONFIG_TX16X16
+ txfm_type = ALLOW_16X16;
+#else
+ txfm_type = ALLOW_8X8;
+#endif
+ } else
+#if 0
+ /* FIXME (rbultje)
+ * this code is disabled for a similar reason as the code above; the
+ * problem is that each time we "revert" to 4x4 only (or even 8x8 only),
+ * the coefficient probabilities for 16x16 (and 8x8) start lagging behind,
+ * thus leading to them lagging further behind and not being chosen for
+ * subsequent frames either. This is essentially a local minimum problem
+ * that we can probably fix by estimating real costs more closely within
+ * a frame, perhaps by re-calculating costs on-the-fly as frame encoding
+ * progresses. */
+ if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+ cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
+#if CONFIG_TX16X16
+ cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+ cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&
+#endif
+ cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+ cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
+ txfm_type = TX_MODE_SELECT;
+ } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
+ cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]
+#if CONFIG_TX16X16
+ && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
+ cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]
+#endif
+ ) {
+ txfm_type = ONLY_4X4;
+#if CONFIG_TX16X16
+ } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
+ cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
+ txfm_type = ALLOW_16X16;
+#endif
+ } else
+ txfm_type = ALLOW_8X8;
+#else
+#if CONFIG_TX16X16
+ txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
+ cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
+ ALLOW_16X16 : TX_MODE_SELECT;
+#else
+ txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8] >=
+ cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
+ ALLOW_8X8 : TX_MODE_SELECT;
+#endif
+#endif
+#elif CONFIG_TX16X16
+ txfm_type = ALLOW_16X16;
+#else
+ txfm_type = ALLOW_8X8;
+#endif // CONFIG_TX_SELECT
+ cpi->common.txfm_mode = txfm_type;
+#if CONFIG_TX_SELECT
+ if (txfm_type != TX_MODE_SELECT) {
+ cpi->common.prob_tx[0] = 128;
+#if CONFIG_TX16X16
+ cpi->common.prob_tx[1] = 128;
+#endif
+ }
+#endif
cpi->common.comp_pred_mode = pred_type;
encode_frame_internal(cpi);
for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
- int diff = cpi->rd_comp_pred_diff[i] / cpi->common.MBs;
+ const int diff = cpi->rd_comp_pred_diff[i] / cpi->common.MBs;
cpi->rd_prediction_type_threshes[frame_type][i] += diff;
cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
}
+#if CONFIG_TX_SELECT
+ for (i = 0; i < NB_TXFM_MODES; ++i) {
+ int64_t pd = cpi->rd_tx_select_diff[i];
+ int diff;
+ if (i == TX_MODE_SELECT)
+ pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZE_MAX - 1), 0);
+ diff = pd / cpi->common.MBs;
+ cpi->rd_tx_select_threshes[frame_type][i] += diff;
+ cpi->rd_tx_select_threshes[frame_type][i] /= 2;
+ }
+#endif
+
if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
int single_count_zero = 0;
int comp_count_zero = 0;
@@ -1516,6 +1625,28 @@ void vp8_encode_frame(VP8_COMP *cpi) {
cpi->common.comp_pred_mode = COMP_PREDICTION_ONLY;
}
}
+
+#if CONFIG_TX_SELECT
+ if (cpi->common.txfm_mode == TX_MODE_SELECT) {
+ const int count4x4 = cpi->txfm_count[TX_4X4];
+ const int count8x8 = cpi->txfm_count[TX_8X8];
+#if CONFIG_TX16X16
+ const int count16x16 = cpi->txfm_count[TX_16X16];
+#else
+ const int count16x16 = 0;
+#endif
+
+ if (count4x4 == 0 && count16x16 == 0) {
+ cpi->common.txfm_mode = ALLOW_8X8;
+ } else if (count8x8 == 0 && count16x16 == 0) {
+ cpi->common.txfm_mode = ONLY_4X4;
+#if CONFIG_TX16X16
+ } else if (count8x8 == 0 && count4x4 == 0) {
+ cpi->common.txfm_mode = ALLOW_16X16;
+#endif
+ }
+ }
+#endif
} else {
encode_frame_internal(cpi);
}
@@ -1755,18 +1886,6 @@ void vp8cx_encode_intra_super_block(VP8_COMP *cpi,
vp8_update_zbin_extra(cpi, x);
}
- /* test code: set transform size based on mode selection */
- if (cpi->common.txfm_mode == ALLOW_8X8) {
- x->e_mbd.mode_info_context->mbmi.txfm_size = TX_8X8;
- x->e_mbd.mode_info_context[1].mbmi.txfm_size = TX_8X8;
- x->e_mbd.mode_info_context[cm->mode_info_stride].mbmi.txfm_size = TX_8X8;
- x->e_mbd.mode_info_context[cm->mode_info_stride+1].mbmi.txfm_size = TX_8X8;
- cpi->t8x8_count++;
- } else {
- x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4;
- cpi->t4x4_count++;
- }
-
RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sby_s)(&x->e_mbd);
RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sbuv_s)(&x->e_mbd);
@@ -1835,23 +1954,6 @@ void vp8cx_encode_intra_macro_block(VP8_COMP *cpi,
vp8_update_zbin_extra(cpi, x);
}
- /* test code: set transform size based on mode selection */
-#if CONFIG_TX16X16 || CONFIG_HYBRIDTRANSFORM16X16
- if (mbmi->mode <= TM_PRED) {
- mbmi->txfm_size = TX_16X16;
- cpi->t16x16_count++;
- }
- else
-#endif
- if (cpi->common.txfm_mode == ALLOW_8X8
- && mbmi->mode != B_PRED) {
- mbmi->txfm_size = TX_8X8;
- cpi->t8x8_count++;
- } else {
- mbmi->txfm_size = TX_4X4;
- cpi->t4x4_count++;
- }
-
if (mbmi->mode == I8X8_PRED) {
vp8_encode_intra8x8mby(IF_RTCD(&cpi->rtcd), x);
vp8_encode_intra8x8mbuv(IF_RTCD(&cpi->rtcd), x);
@@ -1865,9 +1967,34 @@ void vp8cx_encode_intra_macro_block(VP8_COMP *cpi,
}
if (output_enabled) {
+#if CONFIG_TX_SELECT
+ int segment_id = mbmi->segment_id;
+#endif
+
// Tokenize
sum_intra_stats(cpi, x);
vp8_tokenize_mb(cpi, &x->e_mbd, t, 0);
+
+#if CONFIG_TX_SELECT
+ if (cpi->common.txfm_mode == TX_MODE_SELECT &&
+ !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
+ (segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) &&
+ get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
+ if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED) {
+ cpi->txfm_count[mbmi->txfm_size]++;
+ }
+ } else
+#endif
+#if CONFIG_TX16X16
+ if (cpi->common.txfm_mode >= ALLOW_16X16 && mbmi->mode <= TM_PRED) {
+ mbmi->txfm_size = TX_16X16;
+ } else
+#endif
+ if (cpi->common.txfm_mode >= ALLOW_8X8 && mbmi->mode != B_PRED) {
+ mbmi->txfm_size = TX_8X8;
+ } else {
+ mbmi->txfm_size = TX_4X4;
+ }
}
#if CONFIG_NEWBESTREFMV
else
@@ -1932,24 +2059,6 @@ void vp8cx_encode_inter_macroblock (VP8_COMP *cpi, MACROBLOCK *x,
ref_pred_flag = ((mbmi->ref_frame == get_pred_ref(cm, xd)));
set_pred_flag(xd, PRED_REF, ref_pred_flag);
- /* test code: set transform size based on mode selection */
-#if CONFIG_TX16X16 || CONFIG_HYBRIDTRANSFORM16X16
- if (mbmi->mode <= TM_PRED || mbmi->mode == NEWMV || mbmi->mode == ZEROMV ||
- mbmi->mode == NEARMV || mbmi->mode == NEARESTMV) {
- mbmi->txfm_size = TX_16X16;
- cpi->t16x16_count++;
- } else
-#endif
- if (cpi->common.txfm_mode == ALLOW_8X8
- && mbmi->mode != B_PRED
- && mbmi->mode != SPLITMV) {
- mbmi->txfm_size = TX_8X8;
- cpi->t8x8_count++;
- } else {
- mbmi->txfm_size = TX_4X4;
- cpi->t4x4_count++;
- }
-
if (mbmi->ref_frame == INTRA_FRAME) {
if (mbmi->mode == B_PRED) {
vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
@@ -2056,6 +2165,33 @@ void vp8cx_encode_inter_macroblock (VP8_COMP *cpi, MACROBLOCK *x,
cpi->skip_false_count[mb_skip_context]++;
}
}
+
+ if (output_enabled) {
+#if CONFIG_TX_SELECT
+ int segment_id = mbmi->segment_id;
+ if (cpi->common.txfm_mode == TX_MODE_SELECT &&
+ !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
+ (segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) &&
+ get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
+ if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
+ mbmi->mode != SPLITMV) {
+ cpi->txfm_count[mbmi->txfm_size]++;
+ }
+ } else
+#endif
+#if CONFIG_TX16X16
+ if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
+ mbmi->mode != SPLITMV && cpi->common.txfm_mode >= ALLOW_16X16) {
+ mbmi->txfm_size = TX_16X16;
+ } else
+#endif
+ if (mbmi->mode != B_PRED && mbmi->mode != SPLITMV &&
+ cpi->common.txfm_mode >= ALLOW_8X8) {
+ mbmi->txfm_size = TX_8X8;
+ } else {
+ mbmi->txfm_size = TX_4X4;
+ }
+ }
}
#if CONFIG_SUPERBLOCKS
@@ -2119,17 +2255,6 @@ void vp8cx_encode_inter_superblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
get_pred_ref(cm, xd)));
set_pred_flag(xd, PRED_REF, ref_pred_flag);
- /* test code: set transform size based on mode selection */
- if (cpi->common.txfm_mode == ALLOW_8X8
- && x->e_mbd.mode_info_context->mbmi.mode != B_PRED
- && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
- x->e_mbd.mode_info_context->mbmi.txfm_size = TX_8X8;
- cpi->t8x8_count++;
- } else {
- x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4;
- cpi->t4x4_count++;
- }
-
if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sby_s)(&x->e_mbd);
RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sbuv_s)(&x->e_mbd);
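
[Annotation: the rd_tx_select_threshes update in vp8_encode_frame above is a per-frame-type moving average, mirroring the existing prediction-type thresholds. Sketched (hypothetical pseudo-C of the loop at the end of vp8_encode_frame):

    /* average per-MB RD gain of policy i on this frame, then fold it in */
    diff = rd_tx_select_diff[i] / cm->MBs;
    rd_tx_select_threshes[frame_type][i] =
        (rd_tx_select_threshes[frame_type][i] + diff) / 2;

TX_MODE_SELECT is additionally charged RDCOST(rdmult, rddiv, 2048 * (TX_SIZE_MAX - 1), 0) before averaging, a rough proxy for its per-MB signalling bits, so per-MB selection only wins the next frame's policy vote when it saves more than its own side information costs.]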
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 9a88eddb9..cfd70c407 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1829,6 +1829,10 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) {
#endif
for (i = 0; i < COMP_PRED_CONTEXTS; i++)
cm->prob_comppred[i] = 128;
+#if CONFIG_TX_SELECT
+ for (i = 0; i < TX_SIZE_MAX - 1; i++)
+ cm->prob_tx[i] = 128;
+#endif
// Prime the recent reference frame usage counters.
// Hereafter they will be maintained as a sort of moving average
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 5cc87d7a9..34a4b37de 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -472,6 +472,12 @@ typedef struct VP8_COMP {
int rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
int comp_pred_count[COMP_PRED_CONTEXTS];
int single_pred_count[COMP_PRED_CONTEXTS];
+#if CONFIG_TX_SELECT
+ // FIXME contextualize
+ int txfm_count[TX_SIZE_MAX + 1];
+ int64_t rd_tx_select_diff[NB_TXFM_MODES];
+ int rd_tx_select_threshes[4][NB_TXFM_MODES];
+#endif
int RDMULT;
int RDDIV;
@@ -654,11 +660,6 @@ typedef struct VP8_COMP {
int gf_update_recommended;
int skip_true_count[3];
int skip_false_count[3];
- int t4x4_count;
- int t8x8_count;
-#if CONFIG_TX16X16 || CONFIG_HYBRIDTRANSFORM16X16
- int t16x16_count;
-#endif
unsigned char *segmentation_map;
diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c
index d17dd9219..954997889 100644
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -337,7 +337,7 @@ void vp8cx_pick_filter_level_sg(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi, int segme
Bias = Bias * cpi->twopass.section_intra_rating / 20;
// yx, bias less for large block size
- if (cpi->common.txfm_mode == ALLOW_8X8)
+ if (cpi->common.txfm_mode != ONLY_4X4)
Bias >>= 1;
filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
@@ -546,7 +546,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) {
Bias = Bias * cpi->twopass.section_intra_rating / 20;
// yx, bias less for large block size
- if (cpi->common.txfm_mode == ALLOW_8X8)
+ if (cpi->common.txfm_mode != ONLY_4X4)
Bias >>= 1;
filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index e059a10e2..47d578e2b 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -288,12 +288,6 @@ void vp8_setup_key_frame(VP8_COMP *cpi) {
}
#endif
- cpi->common.txfm_mode = ALLOW_8X8;
-
-#if CONFIG_LOSSLESS
- if (cpi->oxcf.lossless)
- cpi->common.txfm_mode = ONLY_4X4;
-#endif
// cpi->common.filter_level = 0; // Reset every key frame.
cpi->common.filter_level = cpi->common.base_qindex * 3 / 8;
@@ -310,14 +304,6 @@ void vp8_setup_key_frame(VP8_COMP *cpi) {
}
void vp8_setup_inter_frame(VP8_COMP *cpi) {
-
- cpi->common.txfm_mode = ALLOW_8X8;
-
-#if CONFIG_LOSSLESS
- if (cpi->oxcf.lossless)
- cpi->common.txfm_mode = ONLY_4X4;
-#endif
-
if (cpi->common.refresh_alt_ref_frame) {
vpx_memcpy(&cpi->common.fc,
&cpi->common.lfc_a,
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 0613355fc..20376e7e7 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -744,10 +744,11 @@ static int vp8_rdcost_mby(MACROBLOCK *mb) {
return cost;
}
-static void macro_block_yrd(MACROBLOCK *mb,
- int *Rate,
- int *Distortion,
- const VP8_ENCODER_RTCD *rtcd) {
+static void macro_block_yrd_4x4(MACROBLOCK *mb,
+ int *Rate,
+ int *Distortion,
+ const VP8_ENCODER_RTCD *rtcd,
+ int *skippable) {
int b;
MACROBLOCKD *const xd = &mb->e_mbd;
BLOCK *const mb_y2 = mb->block + 24;
@@ -788,6 +789,7 @@ static void macro_block_yrd(MACROBLOCK *mb,
*Distortion = (d >> 2);
// rate
*Rate = vp8_rdcost_mby(mb);
+ *skippable = mby_is_skippable(&mb->e_mbd, 1);
}
static int vp8_rdcost_mby_8x8(MACROBLOCK *mb, int backup) {
@@ -822,7 +824,8 @@ static int vp8_rdcost_mby_8x8(MACROBLOCK *mb, int backup) {
static void macro_block_yrd_8x8(MACROBLOCK *mb,
int *Rate,
int *Distortion,
- const VP8_ENCODER_RTCD *rtcd) {
+ const VP8_ENCODER_RTCD *rtcd,
+ int *skippable) {
MACROBLOCKD *const xd = &mb->e_mbd;
BLOCK *const mb_y2 = mb->block + 24;
BLOCKD *const x_y2 = xd->block + 24;
@@ -853,6 +856,7 @@ static void macro_block_yrd_8x8(MACROBLOCK *mb,
*Distortion = (d >> 2);
// rate
*Rate = vp8_rdcost_mby_8x8(mb, 1);
+ *skippable = mby_is_skippable_8x8(&mb->e_mbd, 1);
}
#if CONFIG_TX16X16 || CONFIG_HYBRIDTRANSFORM16X16
@@ -873,7 +877,7 @@ static int vp8_rdcost_mby_16x16(MACROBLOCK *mb) {
}
static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
- const VP8_ENCODER_RTCD *rtcd) {
+ const VP8_ENCODER_RTCD *rtcd, int *skippable) {
int d;
ENCODEMB_INVOKE(&rtcd->encodemb, submby)(
@@ -909,9 +913,158 @@ static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
*Distortion = (d >> 2);
// rate
*Rate = vp8_rdcost_mby_16x16(mb);
+ *skippable = mby_is_skippable_16x16(&mb->e_mbd);
}
#endif
+static void macro_block_yrd(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
+ int *distortion, int *skippable,
+ int64_t txfm_cache[NB_TXFM_MODES]) {
+ VP8_COMMON *cm = &cpi->common;
+ MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+
+#if CONFIG_TX_SELECT
+
+ MACROBLOCKD *xd = &x->e_mbd;
+ int can_skip = cm->mb_no_coeff_skip;
+ vp8_prob skip_prob = can_skip ? get_pred_prob(cm, xd, PRED_MBSKIP) : 128;
+ int s0, s1;
+ int r4x4, r4x4s, r8x8, r8x8s, d4x4, d8x8, s4x4, s8x8;
+ int64_t rd4x4, rd8x8, rd4x4s, rd8x8s;
+#if CONFIG_TX16X16
+ int d16x16, r16x16, r16x16s, s16x16;
+ int64_t rd16x16, rd16x16s;
+#endif
+
+ // FIXME don't do sub x3
+ if (skip_prob == 0)
+ skip_prob = 1;
+ s0 = vp8_cost_bit(skip_prob, 0);
+ s1 = vp8_cost_bit(skip_prob, 1);
+#if CONFIG_TX16X16
+ macro_block_yrd_16x16(x, &r16x16, &d16x16, IF_RTCD(&cpi->rtcd), &s16x16);
+ if (can_skip) {
+ if (s16x16) {
+ rd16x16 = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
+ } else {
+ rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16 + s0, d16x16);
+ }
+ } else {
+ rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16, d16x16);
+ }
+ r16x16s = r16x16 + vp8_cost_one(cm->prob_tx[0]) + vp8_cost_one(cm->prob_tx[1]);
+ if (can_skip) {
+ if (s16x16) {
+ rd16x16s = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
+ } else {
+ rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s + s0, d16x16);
+ }
+ } else {
+ rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s, d16x16);
+ }
+#endif
+ macro_block_yrd_8x8(x, &r8x8, &d8x8, IF_RTCD(&cpi->rtcd), &s8x8);
+ if (can_skip) {
+ if (s8x8) {
+ rd8x8 = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
+ } else {
+ rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8 + s0, d8x8);
+ }
+ } else {
+ rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8, d8x8);
+ }
+ r8x8s = r8x8 + vp8_cost_one(cm->prob_tx[0]);
+#if CONFIG_TX16X16
+ r8x8s += vp8_cost_zero(cm->prob_tx[1]);
+#endif
+ if (can_skip) {
+ if (s8x8) {
+ rd8x8s = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
+ } else {
+ rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s + s0, d8x8);
+ }
+ } else {
+ rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s, d8x8);
+ }
+ macro_block_yrd_4x4(x, &r4x4, &d4x4, IF_RTCD(&cpi->rtcd), &s4x4);
+ if (can_skip) {
+ if (s4x4) {
+ rd4x4 = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
+ } else {
+ rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4 + s0, d4x4);
+ }
+ } else {
+ rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4, d4x4);
+ }
+ r4x4s = r4x4 + vp8_cost_zero(cm->prob_tx[0]);
+ if (can_skip) {
+ if (s4x4) {
+ rd4x4s = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
+ } else {
+ rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s + s0, d4x4);
+ }
+ } else {
+ rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s, d4x4);
+ }
+
+#if CONFIG_TX16X16
+ if ( cpi->common.txfm_mode == ALLOW_16X16 ||
+ (cpi->common.txfm_mode == TX_MODE_SELECT &&
+ rd16x16s < rd8x8s && rd16x16s < rd4x4s)) {
+ mbmi->txfm_size = TX_16X16;
+ *skippable = s16x16;
+ *distortion = d16x16;
+ *rate = (cpi->common.txfm_mode == ALLOW_16X16) ? r16x16 : r16x16s;
+ } else
+#endif
+ if ( cpi->common.txfm_mode == ALLOW_8X8 ||
+ (cpi->common.txfm_mode == TX_MODE_SELECT && rd8x8s < rd4x4s)) {
+ mbmi->txfm_size = TX_8X8;
+ *skippable = s8x8;
+ *distortion = d8x8;
+ *rate = (cpi->common.txfm_mode == ALLOW_8X8) ? r8x8 : r8x8s;
+ } else {
+ assert(cpi->common.txfm_mode == ONLY_4X4 ||
+ (cpi->common.txfm_mode == TX_MODE_SELECT && rd4x4s <= rd8x8s));
+ mbmi->txfm_size = TX_4X4;
+ *skippable = s4x4;
+ *distortion = d4x4;
+ *rate = (cpi->common.txfm_mode == ONLY_4X4) ? r4x4 : r4x4s;
+ }
+
+ txfm_cache[ONLY_4X4] = rd4x4;
+ txfm_cache[ALLOW_8X8] = rd8x8;
+#if CONFIG_TX16X16
+ txfm_cache[ALLOW_16X16] = rd16x16;
+ if (rd16x16s < rd8x8s && rd16x16s < rd4x4s)
+ txfm_cache[TX_MODE_SELECT] = rd16x16s;
+ else
+#endif
+ txfm_cache[TX_MODE_SELECT] = rd4x4s < rd8x8s ? rd4x4s : rd8x8s;
+
+#else /* CONFIG_TX_SELECT */
+
+ switch (cpi->common.txfm_mode) {
+#if CONFIG_TX16X16
+ case ALLOW_16X16:
+ macro_block_yrd_16x16(x, rate, distortion, IF_RTCD(&cpi->rtcd), skippable);
+ mbmi->txfm_size = TX_16X16;
+ break;
+#endif
+ case ALLOW_8X8:
+ macro_block_yrd_8x8(x, rate, distortion, IF_RTCD(&cpi->rtcd), skippable);
+ mbmi->txfm_size = TX_8X8;
+ break;
+ default:
+ case ONLY_4X4:
+ macro_block_yrd_4x4(x, rate, distortion, IF_RTCD(&cpi->rtcd), skippable);
+ mbmi->txfm_size = TX_4X4;
+ break;
+ }
+
+#endif /* CONFIG_TX_SELECT */
+}
+
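
[Annotation: condensed to its core, the CONFIG_TX_SELECT branch of macro_block_yrd() above evaluates every transform size twice, with and without the signalling bits, and keeps the cheapest. A minimal self-contained sketch of that decision (rdcost() is a stand-in for the encoder's RDCOST macro; every name here is hypothetical):

    #include <stdint.h>

    /* rate-distortion cost: lambda * rate + distortion, fixed point */
    static int64_t rdcost(int rdmult, int rddiv, int rate, int64_t dist) {
      return ((128 + (int64_t)rate * rdmult) >> 8) + (dist << rddiv);
    }

    /* pick among 0:4x4, 1:8x8, 2:16x16 given per-size rate/distortion and
     * the extra bits txbits[] needed to signal each size under
     * TX_MODE_SELECT */
    static int pick_tx_size(const int rate[3], const int64_t dist[3],
                            const int txbits[3], int rdmult, int rddiv) {
      int s, best = 0;
      int64_t best_rd = INT64_MAX;
      for (s = 0; s < 3; s++) {
        int64_t rd = rdcost(rdmult, rddiv, rate[s] + txbits[s], dist[s]);
        if (rd < best_rd) { best_rd = rd; best = s; }
      }
      return best;
    }

The real function additionally folds in the skip-flag cost (s0/s1) whenever mb_no_coeff_skip allows an all-zero macroblock, and records every candidate in txfm_cache[] so vp8_encode_frame can compare whole-frame policies after the fact.]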
static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
const unsigned int *p = (const unsigned int *)predictor;
unsigned int *d = (unsigned int *)dst;
@@ -1267,8 +1420,10 @@ static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
int *Rate,
int *rate_y,
int *Distortion,
- int *skippable) {
+ int *skippable,
+ int64_t txfm_cache[NB_TXFM_MODES]) {
MB_PREDICTION_MODE mode;
+ TX_SIZE txfm_size;
MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
#if CONFIG_COMP_INTRA_PRED
MB_PREDICTION_MODE mode2;
@@ -1276,18 +1431,24 @@ static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
#endif
MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
int rate, ratey;
- int distortion;
+ int distortion, skip;
int64_t best_rd = INT64_MAX;
int64_t this_rd;
- int UNINITIALIZED_IS_SAFE(skip);
MACROBLOCKD *xd = &x->e_mbd;
#if CONFIG_HYBRIDTRANSFORM16X16
int best_txtype, rd_txtype;
#endif
+#if CONFIG_TX_SELECT
+ int i;
+ for (i = 0; i < NB_TXFM_MODES; i++)
+ txfm_cache[i] = INT64_MAX;
+#endif
// Y Search for 16x16 intra prediction mode
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+ int64_t local_txfm_cache[NB_TXFM_MODES];
+
mbmi->mode = mode;
#if CONFIG_HYBRIDTRANSFORM16X16
mbmi->mode_rdopt = mode;
@@ -1308,11 +1469,8 @@ static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
}
#endif
-#if CONFIG_TX16X16 || CONFIG_HYBRIDTRANSFORM16X16
- macro_block_yrd_16x16(x, &ratey, &distortion, IF_RTCD(&cpi->rtcd));
-#else
- macro_block_yrd_8x8(x, &ratey, &distortion, IF_RTCD(&cpi->rtcd));
-#endif
+ macro_block_yrd(cpi, x, &ratey, &distortion, &skip, local_txfm_cache);
+
// FIXME add compoundmode cost
// FIXME add rate for mode2
rate = ratey + x->mbmode_cost[x->e_mbd.frame_type][mbmi->mode];
@@ -1324,12 +1482,8 @@ static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
#endif
if (this_rd < best_rd) {
-#if CONFIG_TX16X16
- skip = mby_is_skippable_16x16(xd);
-#else
- skip = mby_is_skippable_8x8(xd, 1);
-#endif
mode_selected = mode;
+ txfm_size = mbmi->txfm_size;
#if CONFIG_COMP_INTRA_PRED
mode2_selected = mode2;
#endif
@@ -1340,13 +1494,25 @@ static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
#if CONFIG_HYBRIDTRANSFORM16X16
best_txtype = rd_txtype;
#endif
+ *skippable = skip;
+ }
+
+#if CONFIG_TX_SELECT
+ for (i = 0; i < NB_TXFM_MODES; i++) {
+ int64_t adj_rd = this_rd + local_txfm_cache[i] -
+ local_txfm_cache[cpi->common.txfm_mode];
+ if (adj_rd < txfm_cache[i]) {
+ txfm_cache[i] = adj_rd;
+ }
}
+#endif
+
#if CONFIG_COMP_INTRA_PRED
}
#endif
}
- *skippable = skip;
+ mbmi->txfm_size = txfm_size;
mbmi->mode = mode_selected;
#if CONFIG_HYBRIDTRANSFORM16X16
x->e_mbd.block[0].bmi.as_mode.tx_type = best_txtype;
@@ -1556,15 +1722,19 @@ static int rd_cost_mbuv(MACROBLOCK *mb) {
static int64_t rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
- int *distortion, int fullpixel) {
+ int *distortion, int fullpixel, int *skip) {
ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
- x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+ x->src.u_buffer,
+ x->src.v_buffer,
+ x->e_mbd.predictor,
+ x->src.uv_stride);
vp8_transform_mbuv(x);
vp8_quantize_mbuv(x);
*rate = rd_cost_mbuv(x);
*distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
+ *skip = mbuv_is_skippable(&x->e_mbd);
return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
}
@@ -1645,16 +1815,19 @@ static int64_t rd_inter32x32_uv_8x8(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
#endif
static int64_t rd_inter16x16_uv_8x8(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
- int *distortion, int fullpixel) {
+ int *distortion, int fullpixel, int *skip) {
ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
- x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+ x->src.u_buffer,
+ x->src.v_buffer,
+ x->e_mbd.predictor,
+ x->src.uv_stride);
vp8_transform_mbuv_8x8(x);
-
vp8_quantize_mbuv_8x8(x);
*rate = rd_cost_mbuv_8x8(x, 1);
*distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
+ *skip = mbuv_is_skippable_8x8(&x->e_mbd);
return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
}
@@ -2981,8 +3154,12 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int_mv *second_ref_mv,
int single_pred_diff,
int comp_pred_diff,
- int hybrid_pred_diff) {
+ int hybrid_pred_diff,
+ int64_t txfm_size_diff[NB_TXFM_MODES]) {
MACROBLOCKD *xd = &x->e_mbd;
+#if CONFIG_TX_SELECT
+ MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+#endif
// Take a snapshot of the coding context so it can be
// restored if we decide to encode this way
@@ -3001,46 +3178,34 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
ctx->single_pred_diff = single_pred_diff;
ctx->comp_pred_diff = comp_pred_diff;
ctx->hybrid_pred_diff = hybrid_pred_diff;
+
+#if CONFIG_TX_SELECT
+ memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
+#endif
}
static void inter_mode_cost(VP8_COMP *cpi, MACROBLOCK *x, int this_mode,
int *rate2, int *distortion2, int *rate_y,
- int *distortion, int* rate_uv, int *distortion_uv) {
+ int *distortion, int* rate_uv, int *distortion_uv,
+ int *skippable, int64_t txfm_cache[NB_TXFM_MODES]) {
+ int y_skippable, uv_skippable;
+
// Y cost and distortion
-#if CONFIG_TX16X16 || CONFIG_HYBRIDTRANSFORM16X16
- if (this_mode == ZEROMV ||
- this_mode == NEARESTMV ||
- this_mode == NEARMV ||
- this_mode == NEWMV)
- macro_block_yrd_16x16(x, rate_y, distortion, IF_RTCD(&cpi->rtcd));
- else {
-#endif
- if (cpi->common.txfm_mode == ALLOW_8X8)
- macro_block_yrd_8x8(x, rate_y, distortion, IF_RTCD(&cpi->rtcd));
- else
- macro_block_yrd(x, rate_y, distortion, IF_RTCD(&cpi->rtcd));
-#if CONFIG_TX16X16 || CONFIG_HYBRIDTRANSFORM16X16
- }
-#endif
+ macro_block_yrd(cpi, x, rate_y, distortion, &y_skippable, txfm_cache);
*rate2 += *rate_y;
*distortion2 += *distortion;
// UV cost and distortion
- if (cpi->common.txfm_mode == ALLOW_8X8
-#if CONFIG_TX16X16 || CONFIG_HYBRIDTRANSFORM16X16
- || this_mode == ZEROMV ||
- this_mode == NEARESTMV ||
- this_mode == NEARMV ||
- this_mode == NEWMV
-#endif
- )
+ if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4)
rd_inter16x16_uv_8x8(cpi, x, rate_uv, distortion_uv,
- cpi->common.full_pixel);
+ cpi->common.full_pixel, &uv_skippable);
else
- rd_inter16x16_uv(cpi, x, rate_uv, distortion_uv, cpi->common.full_pixel);
+ rd_inter16x16_uv(cpi, x, rate_uv, distortion_uv, cpi->common.full_pixel,
+ &uv_skippable);
*rate2 += *rate_uv;
*distortion2 += *distortion_uv;
+ *skippable = y_skippable && uv_skippable;
}
#define MIN(x,y) (((x)<(y))?(x):(y))
@@ -3111,6 +3276,8 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
int mdcounts[4];
int rate, distortion;
int rate2, distortion2;
+ int64_t best_txfm_rd[NB_TXFM_MODES];
+ int64_t best_txfm_diff[NB_TXFM_MODES];
int64_t best_pred_diff[NB_PREDICTION_TYPES];
int64_t best_pred_rd[NB_PREDICTION_TYPES];
int64_t best_rd = INT64_MAX, best_intra_rd = INT64_MAX;
@@ -3165,6 +3332,8 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
frame_mv[NEWMV][i].as_int = INVALID_MV;
for (i = 0; i < NB_PREDICTION_TYPES; ++i)
best_pred_rd[i] = INT64_MAX;
+ for (i = 0; i < NB_TXFM_MODES; i++)
+ best_txfm_rd[i] = INT64_MAX;
for (i = 0; i < BLOCK_MAX_SEGMENTS - 1; i++) {
int j, k;
@@ -3220,7 +3389,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
uv_intra_mode = mbmi->uv_mode;
/* rough estimate for now */
- if (cpi->common.txfm_mode == ALLOW_8X8) {
+ if (cpi->common.txfm_mode != ONLY_4X4) {
rd_pick_intra_mbuv_mode_8x8(cpi, x, &uv_intra_rate_8x8,
&uv_intra_rate_tokenonly_8x8,
&uv_intra_distortion_8x8,
@@ -3240,10 +3409,11 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
#endif
int64_t this_rd = INT64_MAX;
int is_comp_pred;
- int disable_skip = 0;
+ int disable_skip = 0, skippable = 0;
int other_cost = 0;
int compmode_cost = 0;
int mode_excluded = 0;
+ int64_t txfm_cache[NB_TXFM_MODES];
// These variables hold the rolling total cost and distortion for this mode
rate2 = 0;
@@ -3380,50 +3550,33 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
// FIXME compound intra prediction
RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
(&x->e_mbd);
-
-#if CONFIG_TX16X16 || CONFIG_HYBRIDTRANSFORM16X16
- // FIXME: breaks lossless since 4x4 isn't allowed
- macro_block_yrd_16x16(x, &rate_y, &distortion,
- IF_RTCD(&cpi->rtcd));
+ macro_block_yrd(cpi, x, &rate_y, &distortion, &skippable, txfm_cache);
#if CONFIG_HYBRIDTRANSFORM16X16
rd_txtype = x->e_mbd.block[0].bmi.as_mode.tx_type;
#endif
rate2 += rate_y;
distortion2 += distortion;
rate2 += x->mbmode_cost[x->e_mbd.frame_type][mbmi->mode];
- rate2 += uv_intra_rate_8x8;
- rate_uv = uv_intra_rate_tokenonly_8x8;
- distortion2 += uv_intra_distortion_8x8;
- distortion_uv = uv_intra_distortion_8x8;
- break;
-#else
- if (cpi->common.txfm_mode == ALLOW_8X8)
- macro_block_yrd_8x8(x, &rate_y, &distortion,
- IF_RTCD(&cpi->rtcd));
- else
- macro_block_yrd(x, &rate_y, &distortion,
- IF_RTCD(&cpi->rtcd));
- rate2 += rate_y;
- distortion2 += distortion;
- rate2 += x->mbmode_cost[x->e_mbd.frame_type][mbmi->mode];
- if (cpi->common.txfm_mode == ALLOW_8X8) {
+ if (mbmi->txfm_size != TX_4X4) {
rate2 += uv_intra_rate_8x8;
rate_uv = uv_intra_rate_tokenonly_8x8;
distortion2 += uv_intra_distortion_8x8;
distortion_uv = uv_intra_distortion_8x8;
+ skippable = skippable && uv_intra_skippable_8x8;
} else {
rate2 += uv_intra_rate;
rate_uv = uv_intra_rate_tokenonly;
distortion2 += uv_intra_distortion;
distortion_uv = uv_intra_distortion;
+ skippable = skippable && uv_intra_skippable;
}
break;
-#endif
case B_PRED: {
int64_t tmp_rd;
// Note the rate value returned here includes the cost of coding
// the BPRED mode : x->mbmode_cost[x->e_mbd.frame_type][BPRED];
+ mbmi->txfm_size = TX_4X4;
tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion, best_yrd,
#if CONFIG_COMP_INTRA_PRED
0,
@@ -3445,6 +3598,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
break;
case I8X8_PRED: {
int64_t tmp_rd;
+ mbmi->txfm_size = TX_8X8; // FIXME wrong in case of hybridtransform8x8
tmp_rd = rd_pick_intra8x8mby_modes(cpi, x, &rate, &rate_y,
&distortion, best_yrd);
rate2 += rate;
@@ -3489,6 +3643,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
(mbmi->ref_frame == GOLDEN_FRAME) ?
cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
+ mbmi->txfm_size = TX_4X4; // FIXME use 8x8 in case of 8x8/8x16/16x8
tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
second_ref, best_yrd, mdcounts,
&rate, &rate_y, &distortion,
@@ -3524,7 +3679,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
}
else {
const int num_refs = is_comp_pred ? 2 : 1;
- int flag;
+ int flag, skip;
int refs[2] = {x->e_mbd.mode_info_context->mbmi.ref_frame,
x->e_mbd.mode_info_context->mbmi.second_ref_frame};
int_mv cur_mv[2];
@@ -3703,7 +3858,8 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
&xd->predictor[256],
&xd->predictor[320], 8);
inter_mode_cost(cpi, x, this_mode, &rate2, &distortion2,
- &rate_y, &distortion, &rate_uv, &distortion_uv);
+ &rate_y, &distortion, &rate_uv, &distortion_uv,
+ &skippable, txfm_cache);
if (is_comp_pred)
mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
else
@@ -3723,51 +3879,19 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
// necessary adjustment for rate. Ignore if skip is coded at
// segment level as the cost won't have been added in.
if (cpi->common.mb_no_coeff_skip) {
- int mb_skippable;
int mb_skip_allowed;
int has_y2 = (this_mode != SPLITMV
&& this_mode != B_PRED
&& this_mode != I8X8_PRED);
-#if CONFIG_TX16X16 || CONFIG_HYBRIDTRANSFORM16X16
- if (this_mode <= TM_PRED ||
- this_mode == NEWMV ||
- this_mode == ZEROMV ||
- this_mode == NEARESTMV ||
- this_mode == NEARMV)
- mb_skippable = mb_is_skippable_16x16(&x->e_mbd);
- else
-#endif
- if ((cpi->common.txfm_mode == ALLOW_8X8) && has_y2) {
- if (mbmi->ref_frame != INTRA_FRAME) {
-#if CONFIG_TX16X16
- mb_skippable = mb_is_skippable_16x16(&x->e_mbd);
-#else
- mb_skippable = mb_is_skippable_8x8(&x->e_mbd, has_y2);
-#endif
- } else {
-#if CONFIG_TX16X16
- mb_skippable = uv_intra_skippable_8x8
- & mby_is_skippable_16x16(&x->e_mbd);
-#else
- mb_skippable = uv_intra_skippable_8x8
- & mby_is_skippable_8x8(&x->e_mbd, has_y2);
-#endif
- }
- } else {
- if (mbmi->ref_frame != INTRA_FRAME)
- mb_skippable = mb_is_skippable(&x->e_mbd, has_y2);
- else
- mb_skippable = uv_intra_skippable
- & mby_is_skippable(&x->e_mbd, has_y2);
- }
-
// Is Mb level skip allowed for this mb.
mb_skip_allowed =
!segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
get_segdata(xd, segment_id, SEG_LVL_EOB);
- if (mb_skippable) {
+ if (skippable) {
+ mbmi->mb_skip_coeff = 1;
+
// Back out the coefficient coding costs
rate2 -= (rate_y + rate_uv);
// for best_yrd calculation
@@ -3788,11 +3912,14 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
}
}
// Add in the cost of the no skip flag.
- else if (mb_skip_allowed) {
- int prob_skip_cost = vp8_cost_bit(
- get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP), 0);
- rate2 += prob_skip_cost;
- other_cost += prob_skip_cost;
+ else {
+ mbmi->mb_skip_coeff = 0;
+ if (mb_skip_allowed) {
+ int prob_skip_cost = vp8_cost_bit(
+ get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP), 0);
+ rate2 += prob_skip_cost;
+ other_cost += prob_skip_cost;
+ }
}
}
@@ -3835,7 +3962,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
#endif
if (this_mode <= B_PRED) {
- if (cpi->common.txfm_mode == ALLOW_8X8
+ if (mbmi->txfm_size != TX_4X4
&& this_mode != B_PRED
&& this_mode != I8X8_PRED)
mbmi->uv_mode = uv_intra_mode_8x8;
@@ -3912,6 +4039,21 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
}
+
+ /* keep record of best txfm size */
+ if (!mode_excluded && this_rd != INT64_MAX) {
+ for (i = 0; i < NB_TXFM_MODES; i++) {
+ int64_t adj_rd;
+ if (this_mode != B_PRED && this_mode != I8X8_PRED &&
+ this_mode != SPLITMV) {
+ adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->txfm_mode];
+ } else {
+ adj_rd = this_rd;
+ }
+ if (adj_rd < best_txfm_rd[i])
+ best_txfm_rd[i] = adj_rd;
+ }
+ }
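
[Annotation: the adjustment above re-bases each candidate mode's cost from the transform policy actually in effect to each hypothetical one: adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->txfm_mode] estimates what this mode would have cost had the frame been coded under policy i, which is what makes the rd_tx_select_threshes comparison in vp8_encode_frame meaningful. B_PRED, I8X8_PRED and SPLITMV pass through unadjusted since their transform size is dictated by the mode itself.]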
#if CONFIG_PRED_FILTER
}
#endif
@@ -3961,6 +4103,16 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
(cpi->oxcf.arnr_max_frames == 0) &&
(best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
mbmi->mode = ZEROMV;
+#if CONFIG_TX_SELECT
+ if (cm->txfm_mode != TX_MODE_SELECT)
+ mbmi->txfm_size = cm->txfm_mode;
+ else
+#endif
+#if CONFIG_TX16X16
+ mbmi->txfm_size = TX_16X16;
+#else
+ mbmi->txfm_size = TX_8X8;
+#endif
mbmi->ref_frame = ALTREF_FRAME;
mbmi->mv[0].as_int = 0;
mbmi->uv_mode = DC_PRED;
@@ -3969,6 +4121,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
mbmi->partitioning = 0;
vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
+ vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
goto end;
}
@@ -4013,11 +4166,25 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
best_pred_diff[i] = best_rd - best_pred_rd[i];
}
+#if CONFIG_TX_SELECT
+ if (!x->skip) {
+ for (i = 0; i < NB_TXFM_MODES; i++) {
+ if (best_txfm_rd[i] == INT64_MAX)
+ best_txfm_diff[i] = INT_MIN;
+ else
+ best_txfm_diff[i] = best_rd - best_txfm_rd[i];
+ }
+ } else {
+ vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
+ }
+#endif
+
end:
store_coding_context(x, &x->mb_context[xd->mb_index], best_mode_index, &best_partition,
&frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
&frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
- best_pred_diff[0], best_pred_diff[1], best_pred_diff[2]);
+ best_pred_diff[0], best_pred_diff[1], best_pred_diff[2],
+ best_txfm_diff);
}
#if CONFIG_SUPERBLOCKS
@@ -4076,6 +4243,9 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x,
int dist;
int modeuv, modeuv8x8, uv_intra_skippable, uv_intra_skippable_8x8;
int y_intra16x16_skippable;
+ int64_t txfm_cache[NB_TXFM_MODES];
+ TX_SIZE txfm_size_16x16;
+ int i;
#if CONFIG_HYBRIDTRANSFORM16X16
int best_txtype;
@@ -4085,7 +4255,7 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x,
rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv,
&uv_intra_skippable);
modeuv = mbmi->uv_mode;
- if (cpi->common.txfm_mode == ALLOW_8X8) {
+ if (cpi->common.txfm_mode != ONLY_4X4) {
rd_pick_intra_mbuv_mode_8x8(cpi, x, &rateuv8x8, &rateuv8x8_tokenonly,
&distuv8x8, &uv_intra_skippable_8x8);
modeuv8x8 = mbmi->uv_mode;
@@ -4104,12 +4274,13 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x,
error16x16 = rd_pick_intra16x16mby_mode(cpi, x, &rate16x16,
&rate16x16_tokenonly, &dist16x16,
- &y_intra16x16_skippable);
+ &y_intra16x16_skippable, txfm_cache);
mode16x16 = mbmi->mode;
#if CONFIG_HYBRIDTRANSFORM16X16
best_txtype = xd->block[0].bmi.as_mode.tx_type;
xd->mode_info_context->bmi[0].as_mode.tx_type = best_txtype;
#endif
+ txfm_size_16x16 = mbmi->txfm_size;
#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8
mbmi->mode_rdopt = I8X8_PRED;
@@ -4145,12 +4316,19 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x,
&dist4x4d, error16x16, 1, 0);
#endif
+ mbmi->mb_skip_coeff = 0;
if (cpi->common.mb_no_coeff_skip &&
y_intra16x16_skippable && uv_intra_skippable_8x8) {
+ mbmi->mb_skip_coeff = 1;
mbmi->uv_mode = modeuv;
rate = rateuv8x8 + rate16x16 - rateuv8x8_tokenonly - rate16x16_tokenonly +
vp8_cost_bit(get_pred_prob(cm, xd, PRED_MBSKIP), 1);
dist = dist16x16 + (distuv8x8 >> 2);
+ mbmi->txfm_size = txfm_size_16x16;
+#if CONFIG_TX_SELECT
+ memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
+ sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+#endif
} else if (error8x8 > error16x16) {
if (error4x4 < error16x16) {
rate = rateuv;
@@ -4165,8 +4343,14 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x,
rate += rate4x4;
#endif
mbmi->mode = B_PRED;
+ mbmi->txfm_size = TX_4X4;
dist = dist4x4 + (distuv >> 2);
+#if CONFIG_TX_SELECT
+ memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
+ sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+#endif
} else {
+ mbmi->txfm_size = txfm_size_16x16;
mbmi->mode = mode16x16;
rate = rate16x16 + rateuv8x8;
dist = dist16x16 + (distuv8x8 >> 2);
@@ -4174,6 +4358,11 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x,
// save this into supermacroblock coding decision buffer
xd->mode_info_context->bmi[0].as_mode.tx_type = best_txtype;
#endif
+#if CONFIG_TX_SELECT
+ for (i = 0; i < NB_TXFM_MODES; i++) {
+ x->mb_context[xd->mb_index].txfm_rd_diff[i] = error16x16 - txfm_cache[i];
+ }
+#endif
}
if (cpi->common.mb_no_coeff_skip)
rate += vp8_cost_bit(get_pred_prob(cm, xd, PRED_MBSKIP), 0);
@@ -4191,12 +4380,22 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x,
rate += rate4x4;
#endif
mbmi->mode = B_PRED;
+ mbmi->txfm_size = TX_4X4;
dist = dist4x4 + (distuv >> 2);
+#if CONFIG_TX_SELECT
+ memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
+ sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+#endif
} else {
mbmi->mode = I8X8_PRED;
+ mbmi->txfm_size = TX_8X8;
set_i8x8_block_modes(x, mode8x8);
rate = rate8x8 + rateuv;
dist = dist8x8 + (distuv >> 2);
+#if CONFIG_TX_SELECT
+ memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
+ sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+#endif
}
if (cpi->common.mb_no_coeff_skip)
rate += vp8_cost_bit(get_pred_prob(cm, xd, PRED_MBSKIP), 0);
@@ -4883,7 +5082,7 @@ int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x,
store_coding_context(x, &x->sb_context[0], mode_index, NULL,
&frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
&frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
- 0, 0, 0);
+ 0, 0, 0, NULL);
return best_rd;
}
@@ -4898,7 +5097,8 @@ int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x,
&frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
(best_single_rd == INT64_MAX) ? INT_MIN : (best_rd - best_single_rd),
(best_comp_rd == INT64_MAX) ? INT_MIN : (best_rd - best_comp_rd),
- (best_hybrid_rd == INT64_MAX) ? INT_MIN : (best_rd - best_hybrid_rd));
+ (best_hybrid_rd == INT64_MAX) ? INT_MIN : (best_rd - best_hybrid_rd),
+ NULL);
return best_rd;
}