17 files changed, 861 insertions, 569 deletions
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 0f4a6bb63..a840b480a 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -997,7 +997,7 @@ static INLINE int half_round_shift(int input) {
   return rv;
 }
 
-static void dct32_1d(const int *input, int *output, int round) {
+static void fdct32(const int *input, int *output, int round) {
   int step[32];
   // Stage 1
   step[0] = input[0] + input[(32 - 1)];
@@ -1329,7 +1329,7 @@ void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = input[j * stride + i] * 4;
-    dct32_1d(temp_in, temp_out, 0);
+    fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   }
@@ -1339,13 +1339,13 @@ void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = output[j + i * 32];
-    dct32_1d(temp_in, temp_out, 0);
+    fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   }
 }
 
-// Note that although we use dct_32_round in dct32_1d computation flow,
+// Note that although we use dct_32_round in dct32 computation flow,
 // this 2d fdct32x32 for rate-distortion optimization loop is operating
 // within 16 bits precision.
 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
@@ -1357,7 +1357,7 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = input[j * stride + i] * 4;
-    dct32_1d(temp_in, temp_out, 0);
+    fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       // TODO(cd): see quality impact of only doing
       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
@@ -1370,7 +1370,7 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = output[j + i * 32];
-    dct32_1d(temp_in, temp_out, 1);
+    fdct32(temp_in, temp_out, 1);
     for (j = 0; j < 32; ++j)
       out[j + i * 32] = temp_out[j];
   }
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 317ac9815..b97fd0293 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -380,8 +380,10 @@ static void select_in_frame_q_segment(VP9_COMP *cpi,
       segment = 0;
     }
 
-    complexity_metric =
-      clamp((int)((projected_rate * 64) / target_rate), 16, 255);
+    if (target_rate > 0) {
+      complexity_metric =
+        clamp((int)((projected_rate * 64) / target_rate), 16, 255);
+    }
   }
 
   // Fill in the entires in the segment map corresponding to this SB64
@@ -1029,131 +1031,171 @@ static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8) {
   }
   return 0;
 }
-
-// TODO(jingning) This currently serves as a test framework for non-RD mode
-// decision. To be continued on optimizing the partition type decisions.
-static void pick_partition_type(VP9_COMP *cpi,
-                                const TileInfo *const tile,
-                                MODE_INFO **mi_8x8, TOKENEXTRA **tp,
-                                int mi_row, int mi_col,
-                                BLOCK_SIZE bsize, int *rate, int64_t *dist,
-                                int do_recon) {
+static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+                         BLOCK_SIZE bsize, int output_enabled) {
+  int i;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
-  const int mi_stride = cm->mode_info_stride;
-  const int num_8x8_subsize = (num_8x8_blocks_wide_lookup[bsize] >> 1);
-  int i;
-  PARTITION_TYPE partition = PARTITION_NONE;
-  BLOCK_SIZE subsize;
-  BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
-  int sub_rate[4] = {0};
-  int64_t sub_dist[4] = {0};
-  int mi_offset;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
 
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
-    return;
+  const int mb_mode_index = ctx->best_mode_index;
+  int max_plane;
 
-  partition = partition_lookup[b_width_log2(bsize)][bs_type];
-  subsize = get_subsize(bsize, partition);
+  max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1;
+  for (i = 0; i < max_plane; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][1];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+    p[i].eobs = ctx->eobs_pbuf[i][1];
+  }
+
+  for (i = max_plane; i < MAX_MB_PLANE; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][2];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][2];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
+    p[i].eobs = ctx->eobs_pbuf[i][2];
+  }
+
+  x->skip = ctx->skip;
+
+  if (frame_is_intra_only(cm)) {
+#if CONFIG_INTERNAL_STATS
+    static const int kf_mode_index[] = {
+      THR_DC /*DC_PRED*/,
+      THR_V_PRED /*V_PRED*/,
+      THR_H_PRED /*H_PRED*/,
+      THR_D45_PRED /*D45_PRED*/,
+      THR_D135_PRED /*D135_PRED*/,
+      THR_D117_PRED /*D117_PRED*/,
+      THR_D153_PRED /*D153_PRED*/,
+      THR_D207_PRED /*D207_PRED*/,
+      THR_D63_PRED /*D63_PRED*/,
+      THR_TM /*TM_PRED*/,
+    };
+    ++cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]];
+#endif
+  } else {
+    // Note how often each mode chosen as best
+    cpi->mode_chosen_counts[mb_mode_index]++;
+    if (is_inter_block(mbmi) &&
+        (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) {
+      int_mv best_mv[2];
+      for (i = 0; i < 1 + has_second_ref(mbmi); ++i)
+        best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int;
+      vp9_update_mv_count(cpi, x, best_mv);
+    }
+
+    if (cm->interp_filter == SWITCHABLE && is_inter_mode(mbmi->mode)) {
+      const int ctx = vp9_get_pred_context_switchable_interp(xd);
+      ++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
+    }
+  }
+}
+
+static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile,
+                     TOKENEXTRA **tp, int mi_row, int mi_col,
+                     int output_enabled, BLOCK_SIZE bsize) {
+  MACROBLOCK *const x = &cpi->mb;
 
   if (bsize < BLOCK_8X8) {
     // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
     // there is nothing to be done.
-    if (x->ab_index != 0) {
-      *rate = 0;
-      *dist = 0;
+    if (x->ab_index > 0)
       return;
-    }
+  }
+  set_offsets(cpi, tile, mi_row, mi_col, bsize);
+  update_state_rt(cpi, get_block_context(x, bsize), bsize, output_enabled);
+
+  encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
+  update_stats(cpi);
+
+  (*tp)->token = EOSB_TOKEN;
+  (*tp)++;
+}
+
+static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
+                      TOKENEXTRA **tp, int mi_row, int mi_col,
+                      int output_enabled, BLOCK_SIZE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
+  int ctx;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  if (bsize >= BLOCK_8X8) {
+    MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+    const int idx_str = xd->mode_info_stride * mi_row + mi_col;
+    MODE_INFO ** mi_8x8 = cm->mi_grid_visible + idx_str;
+    ctx = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
+                                 mi_row, mi_col, bsize);
+    subsize = mi_8x8[0]->mbmi.sb_type;
+
   } else {
-    *(get_sb_partitioning(x, bsize)) = subsize;
+    ctx = 0;
+    subsize = BLOCK_4X4;
   }
 
+  partition = partition_lookup[bsl][subsize];
+
   switch (partition) {
     case PARTITION_NONE:
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, rate, dist,
-                       bsize, get_block_context(x, bsize), INT64_MAX);
+      if (output_enabled && bsize >= BLOCK_8X8)
+        cm->counts.partition[ctx][PARTITION_NONE]++;
+      encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize);
       break;
-    case PARTITION_HORZ:
+    case PARTITION_VERT:
+      if (output_enabled)
+        cm->counts.partition[ctx][PARTITION_VERT]++;
       *get_sb_index(x, subsize) = 0;
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], &sub_dist[0],
-                       subsize, get_block_context(x, subsize), INT64_MAX);
-      if (bsize >= BLOCK_8X8 && mi_row + num_8x8_subsize < cm->mi_rows) {
-        update_state(cpi, get_block_context(x, subsize), subsize, 0);
-        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+      encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize);
+      if (mi_col + hbs < cm->mi_cols) {
         *get_sb_index(x, subsize) = 1;
-        rd_pick_sb_modes(cpi, tile, mi_row + num_8x8_subsize, mi_col,
-                         &sub_rate[1], &sub_dist[1], subsize,
-                         get_block_context(x, subsize), INT64_MAX);
+        encode_b_rt(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled,
+                    subsize);
       }
-      *rate = sub_rate[0] + sub_rate[1];
-      *dist = sub_dist[0] + sub_dist[1];
       break;
-    case PARTITION_VERT:
+    case PARTITION_HORZ:
+      if (output_enabled)
+        cm->counts.partition[ctx][PARTITION_HORZ]++;
       *get_sb_index(x, subsize) = 0;
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], &sub_dist[0],
-                       subsize, get_block_context(x, subsize), INT64_MAX);
-      if (bsize >= BLOCK_8X8 && mi_col + num_8x8_subsize < cm->mi_cols) {
-        update_state(cpi, get_block_context(x, subsize), subsize, 0);
-        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+      encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize);
+      if (mi_row + hbs < cm->mi_rows) {
         *get_sb_index(x, subsize) = 1;
-        rd_pick_sb_modes(cpi, tile, mi_row, mi_col + num_8x8_subsize,
-                         &sub_rate[1], &sub_dist[1], subsize,
-                         get_block_context(x, subsize), INT64_MAX);
+        encode_b_rt(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled,
+                    subsize);
       }
-      *rate = sub_rate[0] + sub_rate[1];
-      *dist = sub_dist[1] + sub_dist[1];
       break;
     case PARTITION_SPLIT:
-      *get_sb_index(x, subsize) = 0;
-      pick_partition_type(cpi, tile, mi_8x8, tp, mi_row, mi_col, subsize,
-                          &sub_rate[0], &sub_dist[0], 0);
-
-      if ((mi_col + num_8x8_subsize) < cm->mi_cols) {
-        *get_sb_index(x, subsize) = 1;
-        pick_partition_type(cpi, tile, mi_8x8 + num_8x8_subsize, tp,
-                            mi_row, mi_col + num_8x8_subsize, subsize,
-                            &sub_rate[1], &sub_dist[1], 0);
-      }
-
-      if ((mi_row + num_8x8_subsize) < cm->mi_rows) {
-        *get_sb_index(x, subsize) = 2;
-        pick_partition_type(cpi, tile, mi_8x8 + num_8x8_subsize * mi_stride, tp,
-                            mi_row + num_8x8_subsize, mi_col, subsize,
-                            &sub_rate[2], &sub_dist[2], 0);
-      }
-
-      if ((mi_col + num_8x8_subsize) < cm->mi_cols &&
-          (mi_row + num_8x8_subsize) < cm->mi_rows) {
-        *get_sb_index(x, subsize) = 3;
-        mi_offset = num_8x8_subsize * mi_stride + num_8x8_subsize;
-        pick_partition_type(cpi, tile, mi_8x8 + mi_offset, tp,
-                            mi_row + num_8x8_subsize, mi_col + num_8x8_subsize,
-                            subsize, &sub_rate[3], &sub_dist[3], 0);
-      }
-
-      for (i = 0; i < 4; ++i) {
-        *rate += sub_rate[i];
-        *dist += sub_dist[i];
-      }
+      subsize = get_subsize(bsize, PARTITION_SPLIT);
+      if (output_enabled)
+        cm->counts.partition[ctx][PARTITION_SPLIT]++;
 
+      *get_sb_index(x, subsize) = 0;
+      encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize);
+      *get_sb_index(x, subsize) = 1;
+      encode_sb_rt(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled,
+                   subsize);
+      *get_sb_index(x, subsize) = 2;
+      encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled,
+                   subsize);
+      *get_sb_index(x, subsize) = 3;
+      encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
+                subsize);
       break;
     default:
-      assert(0);
+      assert("Invalid partition type.");
   }
 
-  if (do_recon) {
-    int output_enabled = (bsize == BLOCK_64X64);
-
-    // Check the projected output rate for this SB against it's target
-    // and and if necessary apply a Q delta using segmentation to get
-    // closer to the target.
-    if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
-      select_in_frame_q_segment(cpi, mi_row, mi_col,
-                                output_enabled, *rate);
-    }
-
-    encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize);
-  }
+  if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+    update_partition_context(cpi->above_seg_context, cpi->left_seg_context,
+                             mi_row, mi_col, subsize, bsize);
 }
 
 static void rd_use_partition(VP9_COMP *cpi,
@@ -1444,15 +1486,19 @@ static void rd_use_partition(VP9_COMP *cpi,
 }
 
 static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
-  BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4,
-  BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8,
-  BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16
+  BLOCK_4X4,   BLOCK_4X4,   BLOCK_4X4,
+  BLOCK_4X4,   BLOCK_4X4,   BLOCK_4X4,
+  BLOCK_8X8,   BLOCK_8X8,   BLOCK_8X8,
+  BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
+  BLOCK_16X16
 };
 
 static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
-  BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
-  BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, BLOCK_64X64,
-  BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64
+  BLOCK_8X8,   BLOCK_16X16, BLOCK_16X16,
+  BLOCK_16X16, BLOCK_32X32, BLOCK_32X32,
+  BLOCK_32X32, BLOCK_64X64, BLOCK_64X64,
+  BLOCK_64X64, BLOCK_64X64, BLOCK_64X64,
+  BLOCK_64X64
 };
 
 // Look at all the mode_info entries for blocks that are part of this
@@ -1538,9 +1584,11 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
     }
   }
 
-  // Give a bit of leaway either side of the observed min and max
-  *min_block_size = min_partition_size[*min_block_size];
-  *max_block_size = max_partition_size[*max_block_size];
+  // adjust observed min and max
+  if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
+    *min_block_size = min_partition_size[*min_block_size];
+    *max_block_size = max_partition_size[*max_block_size];
+  }
 
   // Check border cases where max and min from neighbours may not be legal.
   *max_block_size = find_partition_size(*max_block_size,
@@ -1996,34 +2044,6 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile,
   restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64);
 }
 
-static void encode_sb_row_rt(VP9_COMP *cpi, const TileInfo *const tile,
-                             int mi_row, TOKENEXTRA **tp) {
-  VP9_COMMON *const cm = &cpi->common;
-  int mi_col;
-
-  cpi->sf.always_this_block_size = BLOCK_8X8;
-
-  // Initialize the left context for the new SB row
-  vpx_memset(&cpi->left_context, 0, sizeof(cpi->left_context));
-  vpx_memset(cpi->left_seg_context, 0, sizeof(cpi->left_seg_context));
-
-  // Code each SB in the row
-  for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
-       mi_col += MI_BLOCK_SIZE) {
-    int dummy_rate;
-    int64_t dummy_dist;
-    const int idx_str = cm->mode_info_stride * mi_row + mi_col;
-    MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
-
-    vp9_zero(cpi->mb.pred_mv);
-
-    set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
-    set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col);
-    pick_partition_type(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
-                        &dummy_rate, &dummy_dist, 1);
-  }
-}
-
 static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
                           int mi_row, TOKENEXTRA **tp) {
   VP9_COMMON *const cm = &cpi->common;
@@ -2250,11 +2270,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
           vp9_tile_init(&tile, cm, tile_row, tile_col);
           for (mi_row = tile.mi_row_start;
                mi_row < tile.mi_row_end; mi_row += 8)
-#if 1
             encode_sb_row(cpi, &tile, mi_row, &tp);
-#else
-            encode_sb_row_rt(cpi, &tile, mi_row, &tp);
-#endif
 
           cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
           assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
@@ -2433,6 +2449,264 @@ static void select_tx_mode(VP9_COMP *cpi) {
     }
   }
 }
+// Start RTC Exploration
+typedef enum {
+  BOTH_ZERO = 0,
+  ZERO_PLUS_PREDICTED = 1,
+  BOTH_PREDICTED = 2,
+  NEW_PLUS_NON_INTRA = 3,
+  BOTH_NEW = 4,
+  INTRA_PLUS_NON_INTRA = 5,
+  BOTH_INTRA = 6,
+  INVALID_CASE = 9
+} motion_vector_context;
+
+static void set_mode_info(MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
+                          MB_PREDICTION_MODE mode, int mi_row, int mi_col) {
+  mbmi->interp_filter = EIGHTTAP;
+  mbmi->mode = mode;
+  mbmi->mv[0].as_int = 0;
+  mbmi->mv[1].as_int = 0;
+  if (mode < NEARESTMV) {
+    mbmi->ref_frame[0] = INTRA_FRAME;
+  } else {
+    mbmi->ref_frame[0] = LAST_FRAME;
+  }
+
+  mbmi->ref_frame[1] = INTRA_FRAME;
+  mbmi->tx_size = max_txsize_lookup[bsize];
+  mbmi->uv_mode = mode;
+  mbmi->skip_coeff = 0;
+  mbmi->sb_type = bsize;
+  mbmi->segment_id = 0;
+}
+static inline int get_block_row(int b32i, int b16i, int b8i) {
+  return ((b32i >> 1) << 2) + ((b16i >> 1) << 1) + (b8i >> 1);
+}
+static inline int get_block_col(int b32i, int b16i, int b8i) {
+  return ((b32i & 1) << 2) + ((b16i & 1) << 1) + (b8i & 1);
+}
+static void rtc_use_partition(VP9_COMP *cpi,
+                             const TileInfo *const tile,
+                             MODE_INFO **mi_8x8,
+                             TOKENEXTRA **tp, int mi_row, int mi_col,
+                             BLOCK_SIZE bsize, int *rate, int64_t *dist,
+                             int do_recon) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  const int mis = cm->mode_info_stride;
+  int mi_width = num_8x8_blocks_wide_lookup[cpi->sf.always_this_block_size];
+  int mi_height = num_8x8_blocks_high_lookup[cpi->sf.always_this_block_size];
+  int i, j;
+  int chosen_rate = INT_MAX;
+  int64_t chosen_dist = INT_MAX;
+  MB_PREDICTION_MODE mode = DC_PRED;
+  int row8x8_remaining = tile->mi_row_end - mi_row;
+  int col8x8_remaining = tile->mi_col_end - mi_col;
+  int b32i;
+  x->fast_ms = 0;
+  x->subblock_ref = 0;
+  for (b32i = 0; b32i < 4; b32i++) {
+    int b16i;
+    for (b16i = 0; b16i < 4; b16i++) {
+      int b8i;
+      int block_row = get_block_row(b32i, b16i, 0);
+      int block_col = get_block_col(b32i, b16i, 0);
+      int index = block_row * mis + block_col;
+      int rate;
+      int64_t dist;
+
+      int_mv frame_nearest_mv[MAX_REF_FRAMES];
+      int_mv frame_near_mv[MAX_REF_FRAMES];
+      struct buf_2d yv12_mb[MAX_REF_FRAMES][MAX_MB_PLANE];
+
+      // Find a partition size that fits
+      bsize = find_partition_size(cpi->sf.always_this_block_size,
+                                  (row8x8_remaining - block_row),
+                                  (col8x8_remaining - block_col),
+                                  &mi_height, &mi_width);
+      mi_8x8[index] = mi_8x8[0] + index;
+
+      set_mi_row_col(xd, tile, mi_row + block_row, mi_height,
+                     mi_col + block_col, mi_width, cm->mi_rows, cm->mi_cols);
+
+      xd->mi_8x8 = mi_8x8 + index;
+
+      if (cm->frame_type != KEY_FRAME) {
+        set_offsets(cpi, tile, mi_row + block_row, mi_col + block_col, bsize);
+
+        vp9_pick_inter_mode(cpi, x, tile,
+                            mi_row + block_row, mi_col + block_col,
+                            &rate, &dist, cpi->sf.always_this_block_size);
+      } else {
+        set_mode_info(&mi_8x8[index]->mbmi, bsize, mode,
+                      mi_row + block_row, mi_col + block_col);
+        vp9_setup_buffer_inter(cpi, x, tile,
+                               LAST_FRAME, cpi->sf.always_this_block_size,
+                               mi_row + block_row, mi_col + block_col,
+                               frame_nearest_mv, frame_near_mv, yv12_mb);
+      }
+
+      for (j = 0; j < mi_height; j++)
+        for (i = 0; i < mi_width; i++)
+          if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > i
+            && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > j) {
+            mi_8x8[index+ i + j * mis] = mi_8x8[index];
+          }
+
+      for (b8i = 0; b8i < 4; b8i++) {
+      }
+    }
+  }
+  encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, BLOCK_64X64);
+
+  *rate = chosen_rate;
+  *dist = chosen_dist;
+}
+
+static void encode_rtc_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
+                              int mi_row, TOKENEXTRA **tp) {
+  VP9_COMMON * const cm = &cpi->common;
+  int mi_col;
+
+  // Initialize the left context for the new SB row
+  vpx_memset(&cpi->left_context, 0, sizeof(cpi->left_context));
+  vpx_memset(cpi->left_seg_context, 0, sizeof(cpi->left_seg_context));
+
+  // Code each SB in the row
+  for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+       mi_col += MI_BLOCK_SIZE) {
+    int dummy_rate;
+    int64_t dummy_dist;
+
+    const int idx_str = cm->mode_info_stride * mi_row + mi_col;
+    MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
+
+    cpi->mb.source_variance = UINT_MAX;
+    set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+    set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col);
+    rtc_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
+                     &dummy_rate, &dummy_dist, 1);
+  }
+}
+
+
+static void encode_rtc_frame_internal(VP9_COMP *cpi) {
+  int mi_row;
+  MACROBLOCK * const x = &cpi->mb;
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCKD * const xd = &x->e_mbd;
+
+//  fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
+//           cpi->common.current_video_frame, cpi->common.show_frame,
+//           cm->frame_type);
+
+// debug output
+#if DBG_PRNT_SEGMAP
+  {
+    FILE *statsfile;
+    statsfile = fopen("segmap2.stt", "a");
+    fprintf(statsfile, "\n");
+    fclose(statsfile);
+  }
+#endif
+
+  vp9_zero(cm->counts.switchable_interp);
+  vp9_zero(cpi->tx_stepdown_count);
+
+  xd->mi_8x8 = cm->mi_grid_visible;
+  // required for vp9_frame_init_quantizer
+  xd->mi_8x8[0] = cm->mi;
+
+  xd->last_mi = cm->prev_mi;
+
+  vp9_zero(cpi->common.counts.mv);
+  vp9_zero(cpi->coef_counts);
+  vp9_zero(cm->counts.eob_branch);
+
+  cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && cm->y_dc_delta_q == 0
+      && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
+  switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);
+
+  vp9_frame_init_quantizer(cpi);
+
+  vp9_initialize_rd_consts(cpi);
+  vp9_initialize_me_consts(cpi, cm->base_qindex);
+  switch_tx_mode(cpi);
+  cpi->sf.always_this_block_size = BLOCK_16X16;
+
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+    // Initialize encode frame context.
+    init_encode_frame_mb_context(cpi);
+
+    // Build a frame level activity map
+    build_activity_map(cpi);
+  }
+
+  // Re-initialize encode frame context.
+  init_encode_frame_mb_context(cpi);
+
+  vp9_zero(cpi->rd_comp_pred_diff);
+  vp9_zero(cpi->rd_filter_diff);
+  vp9_zero(cpi->rd_tx_select_diff);
+  vp9_zero(cpi->rd_tx_select_threshes);
+
+  set_prev_mi(cm);
+
+  {
+    struct vpx_usec_timer emr_timer;
+    vpx_usec_timer_start(&emr_timer);
+
+    {
+      // Take tiles into account and give start/end MB
+      int tile_col, tile_row;
+      TOKENEXTRA *tp = cpi->tok;
+      const int tile_cols = 1 << cm->log2_tile_cols;
+      const int tile_rows = 1 << cm->log2_tile_rows;
+
+      for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+        for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+          TileInfo tile;
+          TOKENEXTRA *tp_old = tp;
+
+          // For each row of SBs in the frame
+          vp9_tile_init(&tile, cm, tile_row, tile_col);
+          for (mi_row = tile.mi_row_start;
+               mi_row < tile.mi_row_end; mi_row += 8)
+            encode_rtc_sb_row(cpi, &tile, mi_row, &tp);
+
+          cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
+          assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
+        }
+      }
+    }
+
+    vpx_usec_timer_mark(&emr_timer);
+    cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer);
+  }
+
+  if (cpi->sf.skip_encode_sb) {
+    int j;
+    unsigned int intra_count = 0, inter_count = 0;
+    for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) {
+      intra_count += cm->counts.intra_inter[j][0];
+      inter_count += cm->counts.intra_inter[j][1];
+    }
+    cpi->sf.skip_encode_frame = ((intra_count << 2) < inter_count);
+    cpi->sf.skip_encode_frame &= (cm->frame_type != KEY_FRAME);
+    cpi->sf.skip_encode_frame &= cm->show_frame;
+  } else {
+    cpi->sf.skip_encode_frame = 0;
+  }
+
+#if 0
+  // Keep record of the total distortion this time around for future use
+  cpi->last_frame_distortion = cpi->frame_distortion;
+#endif
+}
+// end RTC play code
+
 
 void vp9_encode_frame(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
@@ -2512,7 +2786,11 @@ void vp9_encode_frame(VP9_COMP *cpi) {
     select_tx_mode(cpi);
     cm->reference_mode = reference_mode;
     cm->interp_filter = interp_filter;
-    encode_frame_internal(cpi);
+
+    if (cpi->compressor_speed == 3)
+      encode_rtc_frame_internal(cpi);
+    else
+      encode_frame_internal(cpi);
 
     for (i = 0; i < REFERENCE_MODES; ++i) {
       const int diff = (int) (cpi->rd_comp_pred_diff[i] / cm->MBs);
@@ -2590,7 +2868,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
       }
     }
   } else {
-    encode_frame_internal(cpi);
+    encode_rtc_frame_internal(cpi);
   }
 }
 
@@ -2666,7 +2944,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
   x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 &&
-                   (cpi->oxcf.aq_mode != COMPLEXITY_AQ);
+                   (cpi->oxcf.aq_mode != COMPLEXITY_AQ) &&
+                   cpi->compressor_speed != 3;
   x->skip_optimize = ctx->is_coded;
   ctx->is_coded = 1;
   x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
@@ -2681,7 +2960,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
       vp9_update_zbin_extra(cpi, x);
     }
   } else {
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
 
     if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
       // Adjust the zbin based on this MB rate.
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index d1485307d..8ff23c79a 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -25,24 +25,6 @@
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_tokenize.h"
 
-void vp9_setup_interp_filters(MACROBLOCKD *xd, INTERP_FILTER filter,
-                              VP9_COMMON *cm) {
-  if (xd->mi_8x8 && xd->mi_8x8[0]) {
-    MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
-
-    set_ref_ptrs(cm, xd, mbmi->ref_frame[0] - LAST_FRAME,
-                 mbmi->ref_frame[1] - LAST_FRAME);
-
-  } else {
-    set_ref_ptrs(cm, xd, -1, -1);
-  }
-
-  xd->subpix.filter_x = xd->subpix.filter_y =
-      vp9_get_interp_kernel(filter == SWITCHABLE ? EIGHTTAP : filter);
-
-  assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
-}
-
 void vp9_subtract_block_c(int rows, int cols,
                           int16_t *diff_ptr, ptrdiff_t diff_stride,
                           const uint8_t *src_ptr, ptrdiff_t src_stride,
@@ -356,7 +338,6 @@ static void optimize_init_b(int plane, BLOCK_SIZE bsize,
                            pd->above_context, pd->left_context,
                            num_4x4_w, num_4x4_h);
 }
-
 void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
                      TX_SIZE tx_size, void *arg) {
   struct encode_b_args* const args = arg;
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index c728efd49..9f6c9f069 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -48,8 +48,7 @@ void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize);
 void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize);
 
 int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred);
-void vp9_setup_interp_filters(MACROBLOCKD *xd, INTERP_FILTER filter,
-                              VP9_COMMON *cm);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 395ce2008..af9fa1ba7 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -447,6 +447,16 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   }
 }
 
+static BLOCK_SIZE get_bsize(const VP9_COMMON *cm, int mb_row, int mb_col) {
+  if (2 * mb_col + 1 < cm->mi_cols) {
+    return 2 * mb_row + 1 < cm->mi_rows ? BLOCK_16X16
+                                        : BLOCK_16X8;
+  } else {
+    return 2 * mb_row + 1 < cm->mi_rows ? BLOCK_8X16
+                                        : BLOCK_8X8;
+  }
+}
+
 void vp9_first_pass(VP9_COMP *cpi) {
   int mb_row, mb_col;
   MACROBLOCK *const x = &cpi->mb;
@@ -481,10 +491,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
   int sum_in_vectors = 0;
   uint32_t lastmv_as_int = 0;
   struct twopass_rc *const twopass = &cpi->twopass;
-
-  int_mv zero_ref_mv;
-
-  zero_ref_mv.as_int = 0;
+  const MV zero_mv = {0, 0};
 
   vp9_clear_system_state();  // __asm emms;
 
@@ -493,8 +500,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
   setup_dst_planes(xd, new_yv12, 0, 0);
 
   xd->mi_8x8 = cm->mi_grid_visible;
-  // required for vp9_frame_init_quantizer
-  xd->mi_8x8[0] = cm->mi;
+  xd->mi_8x8[0] = cm->mi;  // required for vp9_frame_init_quantizer
 
   setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
@@ -508,14 +514,8 @@ void vp9_first_pass(VP9_COMP *cpi) {
   }
   x->skip_recode = 0;
 
-
-  // Initialise the MV cost table to the defaults
-  // if( cm->current_video_frame == 0)
-  // if ( 0 )
-  {
-    vp9_init_mv_probs(cm);
-    vp9_initialize_rd_consts(cpi);
-  }
+  vp9_init_mv_probs(cm);
+  vp9_initialize_rd_consts(cpi);
 
   // tiling is ignored in the first pass
   vp9_tile_init(&tile, cm, 0, 0);
@@ -540,8 +540,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
     // for each macroblock col in image
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
       int this_error;
-      int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+      const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
       double error_weight = 1.0;
+      const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
 
       vp9_clear_system_state();  // __asm emms;
 
@@ -549,30 +550,15 @@ void vp9_first_pass(VP9_COMP *cpi) {
       xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
       xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
       xd->left_available = (mb_col != 0);
-
-      if (mb_col * 2 + 1 < cm->mi_cols) {
-        if (mb_row * 2 + 1 < cm->mi_rows) {
-          xd->mi_8x8[0]->mbmi.sb_type = BLOCK_16X16;
-        } else {
-          xd->mi_8x8[0]->mbmi.sb_type = BLOCK_16X8;
-        }
-      } else {
-        if (mb_row * 2 + 1 < cm->mi_rows) {
-          xd->mi_8x8[0]->mbmi.sb_type = BLOCK_8X16;
-        } else {
-          xd->mi_8x8[0]->mbmi.sb_type = BLOCK_8X8;
-        }
-      }
+      xd->mi_8x8[0]->mbmi.sb_type = bsize;
       xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME;
       set_mi_row_col(xd, &tile,
-                     mb_row << 1,
-                     num_8x8_blocks_high_lookup[xd->mi_8x8[0]->mbmi.sb_type],
-                     mb_col << 1,
-                     num_8x8_blocks_wide_lookup[xd->mi_8x8[0]->mbmi.sb_type],
+                     mb_row << 1, num_8x8_blocks_high_lookup[bsize],
+                     mb_col << 1, num_8x8_blocks_wide_lookup[bsize],
                      cm->mi_rows, cm->mi_cols);
 
       if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-        int energy = vp9_block_energy(cpi, x, xd->mi_8x8[0]->mbmi.sb_type);
+        const int energy = vp9_block_energy(cpi, x, bsize);
         error_weight = vp9_vaq_inv_q_ratio(energy);
       }
 
@@ -598,8 +584,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
       // Set up limit values for motion vectors to prevent them extending
       // outside the UMV borders.
       x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
-      x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
-                      + BORDER_MV_PIXELS_B16;
+      x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
 
       // Other than for the first frame do a motion search
       if (cm->current_video_frame > 0) {
@@ -624,7 +609,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
         // based search as well.
         if (best_ref_mv.as_int) {
           tmp_err = INT_MAX;
-          first_pass_motion_search(cpi, x, &zero_ref_mv.as_mv, &tmp_mv.as_mv,
+          first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
                                    &tmp_err);
           if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
             vp9_clear_system_state();  // __asm emms;
@@ -645,17 +630,15 @@ void vp9_first_pass(VP9_COMP *cpi) {
           xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
           gf_motion_error = zz_motion_search(cpi, x);
 
-          first_pass_motion_search(cpi, x, &zero_ref_mv.as_mv, &tmp_mv.as_mv,
+          first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
                                    &gf_motion_error);
           if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
             vp9_clear_system_state();  // __asm emms;
             gf_motion_error *= error_weight;
           }
 
-          if ((gf_motion_error < motion_error) &&
-              (gf_motion_error < this_error)) {
+          if (gf_motion_error < motion_error && gf_motion_error < this_error)
             second_ref_count++;
-          }
 
           // Reset to last frame as reference buffer
           xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset;
@@ -692,9 +675,8 @@ void vp9_first_pass(VP9_COMP *cpi) {
           xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
           xd->mi_8x8[0]->mbmi.ref_frame[0] = LAST_FRAME;
           xd->mi_8x8[0]->mbmi.ref_frame[1] = NONE;
-          vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1,
-                                         xd->mi_8x8[0]->mbmi.sb_type);
-          vp9_encode_sby(x, xd->mi_8x8[0]->mbmi.sb_type);
+          vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
+          vp9_encode_sby(x, bsize);
           sum_mvr += mv.as_mv.row;
           sum_mvr_abs += abs(mv.as_mv.row);
           sum_mvc += mv.as_mv.col;
@@ -784,13 +766,11 @@ void vp9_first_pass(VP9_COMP *cpi) {
       fps.mvr_abs = (double)sum_mvr_abs / mvcount;
       fps.MVc = (double)sum_mvc / mvcount;
       fps.mvc_abs = (double)sum_mvc_abs / mvcount;
-      fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / mvcount)) /
-                     mvcount;
-      fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / mvcount)) /
-                     mvcount;
+      fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / mvcount)) / mvcount;
+      fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / mvcount)) / mvcount;
       fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
       fps.new_mv_count = new_mv_count;
-      fps.pcnt_motion = (double)mvcount / cpi->common.MBs;
+      fps.pcnt_motion = (double)mvcount / cm->MBs;
     } else {
       fps.MVr = 0.0;
       fps.mvr_abs = 0.0;
@@ -918,8 +898,7 @@ static double calc_correction_factor(double err_per_mb,
   return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
 
-static int estimate_max_q(VP9_COMP *cpi,
-                          FIRSTPASS_STATS *fpstats,
+static int estimate_max_q(VP9_COMP *cpi, FIRSTPASS_STATS *fpstats,
                           int section_target_bandwitdh) {
   int q;
   const int num_mbs = cpi->common.MBs;
@@ -1093,12 +1072,12 @@ void vp9_end_second_pass(VP9_COMP *cpi) {
 
 // This function gives and estimate of how badly we believe
 // the prediction quality is decaying from frame to frame.
-static double get_prediction_decay_rate(VP9_COMP *cpi,
-                                        FIRSTPASS_STATS *next_frame) {
+static double get_prediction_decay_rate(const VP9_COMMON *cm,
+                                        const FIRSTPASS_STATS *next_frame) {
   // Look at the observed drop in prediction quality between the last frame
   // and the GF buffer (which contains an older frame).
   const double mb_sr_err_diff = (next_frame->sr_coded_error -
-                                     next_frame->coded_error) / cpi->common.MBs;
+                                     next_frame->coded_error) / cm->MBs;
   const double second_ref_decay = mb_sr_err_diff <= 512.0
       ? fclamp(pow(1.0 - (mb_sr_err_diff / 512.0), 0.5), 0.85, 1.0)
       : 0.85;
@@ -1126,7 +1105,6 @@ static int detect_transition_to_still(
     int j;
     FIRSTPASS_STATS *position = cpi->twopass.stats_in;
     FIRSTPASS_STATS tmp_next_frame;
-    double zz_inter;
 
     // Look ahead a few frames to see if static condition
     // persists...
@@ -1134,11 +1112,10 @@ static int detect_transition_to_still(
       if (EOF == input_stats(&cpi->twopass, &tmp_next_frame))
         break;
 
-      zz_inter = (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion);
-      if (zz_inter < 0.999)
+      if (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion < 0.999)
         break;
     }
-    // Reset file position
+
     reset_fpf_position(&cpi->twopass, position);
 
     // Only if it does do we signal a transition to still
@@ -1152,14 +1129,14 @@ static int detect_transition_to_still(
 // This function detects a flash through the high relative pcnt_second_ref
 // score in the frame following a flash frame. The offset passed in should
 // reflect this
-static int detect_flash(VP9_COMP *cpi, int offset) {
+static int detect_flash(const struct twopass_rc *twopass, int offset) {
   FIRSTPASS_STATS next_frame;
 
   int flash_detected = 0;
 
   // Read the frame data.
   // The return is FALSE (no flash detected) if not a valid frame
-  if (read_frame_stats(&cpi->twopass, &next_frame, offset) != EOF) {
+  if (read_frame_stats(twopass, &next_frame, offset) != EOF) {
     // What we are looking for here is a situation where there is a
     // brief break in prediction (such as a flash) but subsequent frames
     // are reasonably well predicted by an earlier (pre flash) frame.
@@ -1188,16 +1165,15 @@ static void accumulate_frame_motion_stats(
   // Accumulate Motion In/Out of frame stats
   *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct;
   *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct;
-  *abs_mv_in_out_accumulator +=
-    fabs(this_frame->mv_in_out_count * motion_pct);
+  *abs_mv_in_out_accumulator += fabs(this_frame->mv_in_out_count * motion_pct);
 
   // Accumulate a measure of how uniform (or conversely how random)
   // the motion field is. (A ratio of absmv / mv)
   if (motion_pct > 0.05) {
-    double this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
+    const double this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
                            DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr));
 
-    double this_frame_mvc_ratio = fabs(this_frame->mvc_abs) /
+    const double this_frame_mvc_ratio = fabs(this_frame->mvc_abs) /
                            DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVc));
 
     *mv_ratio_accumulator += (this_frame_mvr_ratio < this_frame->mvr_abs)
@@ -1240,7 +1216,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
                           int f_frames, int b_frames,
                           int *f_boost, int *b_boost) {
   FIRSTPASS_STATS this_frame;
-
+  struct twopass_rc *const twopass = &cpi->twopass;
   int i;
   double boost_score = 0.0;
   double mv_ratio_accumulator = 0.0;
@@ -1253,7 +1229,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
 
   // Search forward from the proposed arf/next gf position
   for (i = 0; i < f_frames; i++) {
-    if (read_frame_stats(&cpi->twopass, &this_frame, (i + offset)) == EOF)
+    if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF)
       break;
 
     // Update the motion related elements to the boost calculation
@@ -1264,12 +1240,12 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
 
     // We want to discount the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(cpi, (i + offset)) ||
-                     detect_flash(cpi, (i + offset + 1));
+    flash_detected = detect_flash(twopass, i + offset) ||
+                     detect_flash(twopass, i + offset + 1);
 
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                           ? MIN_DECAY_FACTOR : decay_accumulator;
     }
@@ -1290,7 +1266,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
 
   // Search backward towards last gf position
   for (i = -1; i >= -b_frames; i--) {
-    if (read_frame_stats(&cpi->twopass, &this_frame, (i + offset)) == EOF)
+    if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF)
       break;
 
     // Update the motion related elements to the boost calculation
@@ -1301,12 +1277,12 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
 
     // We want to discount the the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(cpi, (i + offset)) ||
-                     detect_flash(cpi, (i + offset + 1));
+    flash_detected = detect_flash(twopass, i + offset) ||
+                     detect_flash(twopass, i + offset + 1);
 
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                               ? MIN_DECAY_FACTOR : decay_accumulator;
     }
@@ -1466,6 +1442,7 @@ void define_fixed_arf_period(VP9_COMP *cpi) {
 static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   FIRSTPASS_STATS next_frame = { 0 };
   FIRSTPASS_STATS *start_pos;
+  struct twopass_rc *const twopass = &cpi->twopass;
   int i;
   double boost_score = 0.0;
   double old_boost_score = 0.0;
@@ -1486,8 +1463,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   double mv_ratio_accumulator_thresh;
   int max_bits = frame_max_bits(cpi);     // Max for a single frame
 
-  unsigned int allow_alt_ref =
-    cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
+  unsigned int allow_alt_ref = cpi->oxcf.play_alternate &&
+                               cpi->oxcf.lag_in_frames;
 
   int f_boost = 0;
   int b_boost = 0;
@@ -1495,11 +1472,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   int active_max_gf_interval;
   RATE_CONTROL *const rc = &cpi->rc;
 
-  cpi->twopass.gf_group_bits = 0;
+  twopass->gf_group_bits = 0;
 
   vp9_clear_system_state();  // __asm emms;
 
-  start_pos = cpi->twopass.stats_in;
+  start_pos = twopass->stats_in;
 
   // Load stats for the current frame.
   mod_frame_err = calculate_modified_err(cpi, this_frame);
@@ -1530,20 +1507,19 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     active_max_gf_interval = rc->max_gf_interval;
 
   i = 0;
-  while ((i < cpi->twopass.static_scene_max_gf_interval) &&
-         (i < rc->frames_to_key)) {
+  while (i < twopass->static_scene_max_gf_interval && i < rc->frames_to_key) {
     i++;    // Increment the loop counter
 
     // Accumulate error score of frames in this gf group
     mod_frame_err = calculate_modified_err(cpi, this_frame);
     gf_group_err += mod_frame_err;
 
-    if (EOF == input_stats(&cpi->twopass, &next_frame))
+    if (EOF == input_stats(twopass, &next_frame))
       break;
 
     // Test for the case where there is a brief flash but the prediction
     // quality back to an earlier frame is then restored.
-    flash_detected = detect_flash(cpi, 0);
+    flash_detected = detect_flash(twopass, 0);
 
     // Update the motion related elements to the boost calculation
     accumulate_frame_motion_stats(&next_frame,
@@ -1554,14 +1530,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
       last_loop_decay_rate = loop_decay_rate;
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+      loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
       decay_accumulator = decay_accumulator * loop_decay_rate;
 
       // Monitor for static sections.
       if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
           zero_motion_accumulator) {
-        zero_motion_accumulator =
-          (next_frame.pcnt_inter - next_frame.pcnt_motion);
+        zero_motion_accumulator = next_frame.pcnt_inter -
+                                      next_frame.pcnt_motion;
       }
 
       // Break clause to detect very still sections after motion
@@ -1599,14 +1575,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     old_boost_score = boost_score;
   }
 
-  cpi->twopass.gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
+  twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
 
   // Don't allow a gf too near the next kf
   if ((rc->frames_to_key - i) < MIN_GF_INTERVAL) {
     while (i < (rc->frames_to_key + !rc->next_key_frame_forced)) {
       i++;
 
-      if (EOF == input_stats(&cpi->twopass, this_frame))
+      if (EOF == input_stats(twopass, this_frame))
         break;
 
       if (i < rc->frames_to_key) {
@@ -2069,7 +2045,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
 
       // How fast is prediction quality decaying
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+      loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
 
       // We want to know something about the recent past... rather than
       // as used elsewhere where we are concened with decay in prediction
@@ -2203,8 +2179,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
         r = RMAX;
 
       // How fast is prediction quality decaying
-      if (!detect_flash(cpi, 0)) {
-        loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+      if (!detect_flash(twopass, 0)) {
+        loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
         decay_accumulator *= loop_decay_rate;
         decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                               ? MIN_DECAY_FACTOR : decay_accumulator;
@@ -2333,6 +2309,7 @@ void vp9_get_svc_params(VP9_COMP *cpi) {
       (cpi->oxcf.auto_key && (cpi->rc.frames_since_key %
                               cpi->key_frame_frequency == 0))) {
     cm->frame_type = KEY_FRAME;
+    cpi->rc.source_alt_ref_active = 0;
   } else {
     cm->frame_type = INTER_FRAME;
   }
@@ -2340,6 +2317,9 @@ void vp9_get_svc_params(VP9_COMP *cpi) {
   cpi->rc.baseline_gf_interval = INT_MAX;
 }
 
+// Use this macro to turn on/off use of alt-refs in one-pass mode.
+#define USE_ALTREF_FOR_ONE_PASS   1
+
 void vp9_get_one_pass_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   if (!cpi->refresh_alt_ref_frame &&
@@ -2351,13 +2331,20 @@ void vp9_get_one_pass_params(VP9_COMP *cpi) {
     cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 &&
                                     cpi->rc.frames_to_key == 0;
     cpi->rc.frames_to_key = cpi->key_frame_frequency;
-    cpi->rc.kf_boost = 300;
+    cpi->rc.kf_boost = 2000;
+    cpi->rc.source_alt_ref_active = 0;
   } else {
     cm->frame_type = INTER_FRAME;
   }
   if (cpi->rc.frames_till_gf_update_due == 0) {
+    cpi->rc.baseline_gf_interval = DEFAULT_GF_INTERVAL;
     cpi->rc.frames_till_gf_update_due = cpi->rc.baseline_gf_interval;
+    // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+    if (cpi->rc.frames_till_gf_update_due > cpi->rc.frames_to_key)
+      cpi->rc.frames_till_gf_update_due = cpi->rc.frames_to_key;
     cpi->refresh_golden_frame = 1;
+    cpi->rc.source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
+    cpi->rc.gfu_boost = 1000;
   }
 }
 
@@ -2371,7 +2358,8 @@ void vp9_get_one_pass_cbr_params(VP9_COMP *cpi) {
     cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 &&
                                     cpi->rc.frames_to_key == 0;
     cpi->rc.frames_to_key = cpi->key_frame_frequency;
-    cpi->rc.kf_boost = 300;
+    cpi->rc.kf_boost = 2000;
+    cpi->rc.source_alt_ref_active = 0;
   } else {
     cm->frame_type = INTER_FRAME;
   }
@@ -2405,12 +2393,13 @@ void vp9_get_second_pass_params(VP9_COMP *cpi) {
   double this_frame_intra_error;
   double this_frame_coded_error;
 
+  if (!cpi->twopass.stats_in)
+    return;
   if (cpi->refresh_alt_ref_frame) {
     cpi->common.frame_type = INTER_FRAME;
+    rc->per_frame_bandwidth = cpi->twopass.gf_bits;
     return;
   }
-  if (!cpi->twopass.stats_in)
-    return;
 
   vp9_clear_system_state();
 
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index ee73ff15a..e6e59c05a 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -173,7 +173,6 @@ struct lookahead_entry * vp9_lookahead_peek(struct lookahead_ctx *ctx,
                                             int index) {
   struct lookahead_entry *buf = NULL;
 
-  assert(index < (int)ctx->max_sz);
   if (index < (int)ctx->sz) {
     index += ctx->read_idx;
     if (index >= (int)ctx->max_sz)
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 88023513a..6bc88ec44 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -626,7 +626,7 @@ static void set_good_speed_feature(VP9_COMMON *cm,
     sf->disable_filter_search_var_thresh = 50;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
 
-    sf->auto_min_max_partition_size = 1;
+    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
     sf->adjust_partitioning_from_last_frame = 1;
     sf->last_partitioning_redo_frequency = 3;
@@ -663,7 +663,7 @@ static void set_good_speed_feature(VP9_COMMON *cm,
     sf->disable_filter_search_var_thresh = 100;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
 
-    sf->auto_min_max_partition_size = 1;
+    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
     sf->adjust_partitioning_from_last_frame = 1;
     sf->last_partitioning_redo_frequency = 3;
@@ -698,7 +698,7 @@ static void set_good_speed_feature(VP9_COMMON *cm,
     sf->disable_filter_search_var_thresh = 200;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
 
-    sf->auto_min_max_partition_size = 1;
+    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
     sf->adjust_partitioning_from_last_frame = 1;
     sf->last_partitioning_redo_frequency = 3;
@@ -797,7 +797,7 @@ static void set_rt_speed_feature(VP9_COMMON *cm,
     sf->disable_filter_search_var_thresh = 50;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
 
-    sf->auto_min_max_partition_size = 1;
+    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
     sf->adjust_partitioning_from_last_frame = 1;
     sf->last_partitioning_redo_frequency = 3;
@@ -839,6 +839,9 @@ static void set_rt_speed_feature(VP9_COMMON *cm,
   if (speed >= 5) {
     int i;
     sf->disable_split_mask = DISABLE_ALL_SPLIT;
+    sf->auto_min_max_partition_size = frame_is_intra_only(cm) ?
+        RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
+    sf->subpel_force_stop = 1;
     for (i = 0; i < TX_SIZES; i++) {
       sf->intra_y_mode_mask[i] = INTRA_DC_H_V;
       sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
@@ -867,6 +870,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->recode_loop = 1;
   sf->subpel_search_method = SUBPEL_TREE;
   sf->subpel_iters_per_step = 2;
+  sf->subpel_force_stop = 0;
   sf->optimize_coefficients = !cpi->oxcf.lossless;
   sf->reduce_first_step_size = 0;
   sf->auto_mv_step_size = 0;
@@ -882,7 +886,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->use_one_partition_size_always = 0;
   sf->less_rectangular_check = 0;
   sf->use_square_partition_only = 0;
-  sf->auto_min_max_partition_size = 0;
+  sf->auto_min_max_partition_size = NOT_IN_USE;
   sf->max_partition_size = BLOCK_64X64;
   sf->min_partition_size = BLOCK_4X4;
   sf->adjust_partitioning_from_last_frame = 0;
@@ -1258,6 +1262,11 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
       cpi->pass = 2;
       cpi->compressor_speed = 0;
       break;
+
+    case MODE_REALTIME:
+      cpi->pass = 0;
+      cpi->compressor_speed = 3;
+      break;
   }
 
   cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
@@ -2541,7 +2550,10 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
 
     vpx_usec_timer_start(&timer);
 
-    vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.use_fast_lpf_pick);
+    if (cpi->compressor_speed == 3)
+      lf->filter_level = 4;
+    else
+      vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.use_fast_lpf_pick);
 
     vpx_usec_timer_mark(&timer);
     cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
@@ -2730,7 +2742,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
     if (cpi->sf.recode_loop != 0) {
       vp9_save_coding_context(cpi);
       cpi->dummy_packing = 1;
-      vp9_pack_bitstream(cpi, dest, size);
+      if (cpi->compressor_speed != 3)
+        vp9_pack_bitstream(cpi, dest, size);
+
       cpi->rc.projected_frame_size = (*size) << 3;
       vp9_restore_coding_context(cpi);
 
@@ -2953,15 +2967,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   // Clear down mmx registers to allow floating point in what follows.
   vp9_clear_system_state();
 
-  // For an alt ref frame in 2 pass we skip the call to the second
-  // pass function that sets the target bandwidth so we must set it here.
-  if (cpi->refresh_alt_ref_frame) {
-    // Set a per frame bit target for the alt ref frame.
-    cpi->rc.per_frame_bandwidth = cpi->twopass.gf_bits;
-    // Set a per second target bitrate.
-    cpi->target_bandwidth = (int)(cpi->twopass.gf_bits * cpi->output_framerate);
-  }
-
   // Clear zbin over-quant value and mode boost values.
   cpi->zbin_mode_boost = 0;
 
@@ -3088,11 +3093,22 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
                                    &frame_under_shoot_limit,
                                    &frame_over_shoot_limit);
 
-  // Decide q and q bounds
+  // Decide q and q bounds.
   q = vp9_rc_pick_q_and_adjust_q_bounds(cpi,
                                         &bottom_index,
                                         &top_index);
 
+  // JBB : This is realtime mode.  In real time mode the first frame
+  // should be larger. Q of 0 is disabled because we force tx size to be
+  // 16x16...
+  if (cpi->compressor_speed == 3) {
+    if (cpi->common.current_video_frame == 0)
+      q /= 3;
+
+    if (q == 0)
+      q++;
+  }
+
   if (!frame_is_intra_only(cm)) {
     cm->interp_filter = DEFAULT_INTERP_FILTER;
     /* TODO: Decide this more intelligently */
@@ -3293,7 +3309,6 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size,
 
   vp9_get_second_pass_params(cpi);
   encode_frame_to_data_rate(cpi, size, dest, frame_flags);
-  // vp9_print_modes_and_motion_vectors(&cpi->common, "encode.stt");
 
   vp9_twopass_postencode_update(cpi, *size);
 }
@@ -3402,6 +3417,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
                             int64_t *time_stamp, int64_t *time_end, int flush) {
   VP9_COMP *cpi = (VP9_COMP *) ptr;
   VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
   struct vpx_usec_timer  cmptimer;
   YV12_BUFFER_CONFIG *force_src_buffer = NULL;
   MV_REFERENCE_FRAME ref_frame;
@@ -3449,8 +3465,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
       if (cpi->oxcf.arnr_max_frames > 0) {
         // Produce the filtered ARF frame.
         // TODO(agrange) merge these two functions.
-        configure_arnr_filter(cpi, cm->current_video_frame + frames_to_arf,
-                              cpi->rc.gfu_boost);
+        vp9_configure_arnr_filter(cpi, frames_to_arf, cpi->rc.gfu_boost);
         vp9_temporal_filter_prepare(cpi, frames_to_arf);
         vp9_extend_frame_borders(&cpi->alt_ref_buffer,
                                  cm->subsampling_x, cm->subsampling_y);
@@ -3466,7 +3481,9 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
 #if CONFIG_MULTIPLE_ARF
       if (!cpi->multi_arf_enabled)
 #endif
-        cpi->rc.source_alt_ref_pending = 0;   // Clear Pending altf Ref flag.
+        cpi->rc.source_alt_ref_pending = 0;
+    } else {
+      cpi->rc.source_alt_ref_pending = 0;
     }
   }
 
@@ -3585,11 +3602,12 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
       vp9_extend_frame_borders(buf, cm->subsampling_x, cm->subsampling_y);
   }
 
-  vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
+  set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+  xd->interp_kernel = vp9_get_interp_kernel(
+      DEFAULT_INTERP_FILTER == SWITCHABLE ? EIGHTTAP : DEFAULT_INTERP_FILTER);
 
-  if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-      vp9_vaq_init();
-  }
+  if (cpi->oxcf.aq_mode == VARIANCE_AQ)
+    vp9_vaq_init();
 
   if (cpi->use_svc) {
     SvcEncode(cpi, size, dest, frame_flags);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 9cf3f62d6..d2f42dd3e 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -8,25 +8,28 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_ENCODER_VP9_ONYX_INT_H_
 #define VP9_ENCODER_VP9_ONYX_INT_H_
 
 #include <stdio.h>
+
 #include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+#include "vpx/internal/vpx_codec_internal.h"
+
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_onyx.h"
-#include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/encoder/vp9_variance.h"
+
 #include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/common/vp9_entropy.h"
-#include "vp9/common/vp9_entropymode.h"
-#include "vpx_ports/mem.h"
-#include "vpx/internal/vpx_codec_internal.h"
-#include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_lookahead.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_tokenize.h"
+#include "vp9/encoder/vp9_treewriter.h"
+#include "vp9/encoder/vp9_variance.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -98,18 +101,6 @@ typedef struct {
 } FIRSTPASS_STATS;
 
 typedef struct {
-  int frames_so_far;
-  double frame_intra_error;
-  double frame_coded_error;
-  double frame_pcnt_inter;
-  double frame_pcnt_motion;
-  double frame_mvr;
-  double frame_mvr_abs;
-  double frame_mvc;
-  double frame_mvc_abs;
-} ONEPASS_FRAMESTATS;
-
-typedef struct {
   struct {
     int err;
     union {
@@ -190,6 +181,12 @@ typedef enum {
 } TX_SIZE_SEARCH_METHOD;
 
 typedef enum {
+  NOT_IN_USE = 0,
+  RELAXED_NEIGHBORING_MIN_MAX = 1,
+  STRICT_NEIGHBORING_MIN_MAX = 2
+} AUTO_MIN_MAX_MODE;
+
+typedef enum {
   // Values should be powers of 2 so that they can be selected as bits of
   // an integer flags field
 
@@ -256,6 +253,9 @@ typedef struct {
   // Maximum number of steps in logarithmic subpel search before giving up.
   int subpel_iters_per_step;
 
+  // Control when to stop subpel search
+  int subpel_force_stop;
+
   // Thresh_mult is used to set a threshold for the rd score. A higher value
   // means that we will accept the best mode so far more often. This number
   // is used in combination with the current block size, and thresh_freq_fact
@@ -343,9 +343,8 @@ typedef struct {
   BLOCK_SIZE always_this_block_size;
 
   // Sets min and max partition sizes for this 64x64 region based on the
-  // same superblock in last encoded frame, and the left and above neighbor
-  // in this block.
-  int auto_min_max_partition_size;
+  // same 64x64 in last encoded frame, and the left and above neighbor.
+  AUTO_MIN_MAX_MODE auto_min_max_partition_size;
 
   // Min and max partition size we enable (block_size) as per auto
   // min max, but also used by adjust partitioning, and pick_partitioning.
@@ -419,67 +418,6 @@ typedef struct {
   int use_fast_coef_updates;  // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced
 } SPEED_FEATURES;
 
-typedef struct {
-  // Rate targetting variables
-  int this_frame_target;
-  int projected_frame_size;
-  int sb64_target_rate;
-  int last_q[3];                   // Separate values for Intra/Inter/ARF-GF
-  int last_boosted_qindex;         // Last boosted GF/KF/ARF q
-
-  int gfu_boost;
-  int last_boost;
-  int kf_boost;
-
-  double rate_correction_factor;
-  double key_frame_rate_correction_factor;
-  double gf_rate_correction_factor;
-
-  unsigned int frames_since_golden;
-  unsigned int frames_till_gf_update_due;  // Count down till next GF
-  unsigned int max_gf_interval;
-  unsigned int baseline_gf_interval;
-  unsigned int frames_to_key;
-  unsigned int frames_since_key;
-  unsigned int this_key_frame_forced;
-  unsigned int next_key_frame_forced;
-  unsigned int source_alt_ref_pending;
-  unsigned int source_alt_ref_active;
-  unsigned int is_src_frame_alt_ref;
-
-  int per_frame_bandwidth;        // Current section per frame bandwidth target
-  int av_per_frame_bandwidth;     // Average frame size target for clip
-  int min_frame_bandwidth;        // Minimum allocation used for any frame
-  int max_frame_bandwidth;        // Maximum burst rate allowed for a frame.
-
-  int ni_av_qi;
-  int ni_tot_qi;
-  int ni_frames;
-  int avg_frame_qindex[3];  // 0 - KEY, 1 - INTER, 2 - ARF/GF
-  double tot_q;
-  double avg_q;
-
-  int buffer_level;
-  int bits_off_target;
-
-  int decimation_factor;
-  int decimation_count;
-
-  int rolling_target_bits;
-  int rolling_actual_bits;
-
-  int long_rolling_target_bits;
-  int long_rolling_actual_bits;
-
-  int64_t total_actual_bits;
-  int total_target_vs_actual;        // debug stats
-
-  int worst_quality;
-  int active_worst_quality;
-  int best_quality;
-  // int active_best_quality;
-} RATE_CONTROL;
-
 typedef struct VP9_COMP {
   DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
   DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
@@ -606,11 +544,6 @@ typedef struct VP9_COMP {
   int64_t target_bandwidth;
   struct vpx_codec_pkt_list  *output_pkt_list;
 
-#if 0
-  // Experimental code for lagged and one pass
-  ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
-  int one_pass_frame_index;
-#endif
   MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
   int mbgraph_n_frames;             // number of frames filled in the above
   int static_mb_pct;                // % forced skip mbs by segmentation
@@ -821,6 +754,14 @@ static int get_token_alloc(int mb_rows, int mb_cols) {
   return mb_rows * mb_cols * (48 * 16 + 4);
 }
 
+static void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
+                         MV_REFERENCE_FRAME ref0, MV_REFERENCE_FRAME ref1) {
+  xd->block_refs[0] = &cm->frame_refs[ref0 >= LAST_FRAME ? ref0 - LAST_FRAME
+                                                         : 0];
+  xd->block_refs[1] = &cm->frame_refs[ref1 >= LAST_FRAME ? ref1 - LAST_FRAME
+                                                         : 0];
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 210d15f0d..2b9e31f08 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -80,7 +80,7 @@ static int full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   step_param = 6;
   further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
 
-  for (i = LAST_FRAME; i <= ALTREF_FRAME && cpi->common.show_frame; ++i) {
+  for (i = LAST_FRAME; i <= LAST_FRAME && cpi->common.show_frame; ++i) {
     if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
       tmp_mv->as_int = INVALID_MV;
 
@@ -142,8 +142,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                             int mi_row, int mi_col,
                             int *returnrate,
                             int64_t *returndistortion,
-                            BLOCK_SIZE bsize,
-                            PICK_MODE_CONTEXT *ctx) {
+                            BLOCK_SIZE bsize) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
   const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
@@ -155,6 +154,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                     VP9_ALT_FLAG };
   int64_t best_rd = INT64_MAX;
   int64_t this_rd;
+  int64_t cost[4]= { 0, 100, 150,  205 };
 
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
 
@@ -171,7 +171,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   mbmi->tx_size = MIN(max_txsize_lookup[bsize],
                       tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
 
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+  for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       vp9_setup_buffer_inter(cpi, x, tile,
@@ -182,7 +182,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+  for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) {
     int rate_mv = 0;
 
     if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
@@ -191,29 +191,42 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     // Select prediction reference frames.
     xd->plane[0].pre[0] = yv12_mb[ref_frame][0];
 
-
-    x->mode_sad[ref_frame][INTER_OFFSET(NEWMV)] =
-        full_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
-                                 &frame_mv[NEWMV][ref_frame], &rate_mv);
-
-    if (frame_mv[NEWMV][ref_frame].as_int == INVALID_MV)
-      continue;
-
     clamp_mv2(&frame_mv[NEARESTMV][ref_frame].as_mv, xd);
     clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd);
 
     for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
-      int rate = x->inter_mode_cost[mbmi->mode_context[ref_frame]]
-                                   [INTER_OFFSET(this_mode)];
-      int64_t dist = x->mode_sad[ref_frame][INTER_OFFSET(this_mode)] *
-                      x->mode_sad[ref_frame][INTER_OFFSET(this_mode)];
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+      int rate = cost[this_mode - NEARESTMV];
+      int64_t dist;
+
+      if (this_mode == NEWMV) {
+        if (this_rd < 300)
+          continue;
+
+        x->mode_sad[ref_frame][INTER_OFFSET(NEWMV)] =
+            full_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
+                                     &frame_mv[NEWMV][ref_frame], &rate_mv);
+
+        if (frame_mv[NEWMV][ref_frame].as_int == INVALID_MV)
+          continue;
+      }
+
+      dist = x->mode_sad[ref_frame][INTER_OFFSET(this_mode)];
+      this_rd = rate + dist;
 
       if (this_rd < best_rd) {
         best_rd = this_rd;
         mbmi->mode = this_mode;
         mbmi->ref_frame[0] = ref_frame;
         mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+        xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+        mbmi->interp_filter = EIGHTTAP;
+
+        mbmi->ref_frame[1] = INTRA_FRAME;
+        mbmi->tx_size = max_txsize_lookup[bsize];
+        mbmi->uv_mode = this_mode;
+        mbmi->skip_coeff = 0;
+        mbmi->sb_type = bsize;
+        mbmi->segment_id = 0;
       }
     }
   }
@@ -223,8 +236,5 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   // TODO(jingning) intra prediction search, if the best SAD is above a certain
   // threshold.
 
-  // store mode decisions
-  ctx->mic = *xd->mi_8x8[0];
-
   return INT64_MAX;
 }
diff --git a/vp9/encoder/vp9_pickmode.h b/vp9/encoder/vp9_pickmode.h
index 82904ae8a..05ff18762 100644
--- a/vp9/encoder/vp9_pickmode.h
+++ b/vp9/encoder/vp9_pickmode.h
@@ -22,8 +22,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                             int mi_row, int mi_col,
                             int *returnrate,
                             int64_t *returndistortion,
-                            BLOCK_SIZE bsize,
-                            PICK_MODE_CONTEXT *ctx);
+                            BLOCK_SIZE bsize);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 3c816a3d0..74eb98fb0 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -218,7 +218,7 @@ static void calc_iframe_target_size(VP9_COMP *cpi) {
   vp9_clear_system_state();  // __asm emms;
 
   // For 1-pass.
-  if (cpi->pass == 0) {
+  if (cpi->pass == 0 && oxcf->end_usage == USAGE_STREAM_FROM_SERVER) {
     if (cpi->common.current_video_frame == 0) {
       target = oxcf->starting_buffer_level / 2;
     } else {
@@ -246,7 +246,7 @@ static void calc_iframe_target_size(VP9_COMP *cpi) {
 
   if (oxcf->rc_max_intra_bitrate_pct) {
     const int max_rate = rc->per_frame_bandwidth *
-                             oxcf->rc_max_intra_bitrate_pct / 100;
+        oxcf->rc_max_intra_bitrate_pct / 100;
     target = MIN(target, max_rate);
   }
   rc->this_frame_target = target;
@@ -375,27 +375,22 @@ static int target_size_from_buffer_level(const VP9_CONFIG *oxcf,
 static void calc_pframe_target_size(VP9_COMP *const cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
   const VP9_CONFIG *const oxcf = &cpi->oxcf;
-  int min_frame_target = MAX(rc->min_frame_bandwidth,
-                             rc->av_per_frame_bandwidth >> 5);
-  if (cpi->refresh_alt_ref_frame) {
-    // Special alt reference frame case
-    // Per frame bit target for the alt ref frame
-    rc->per_frame_bandwidth = cpi->twopass.gf_bits;
-    rc->this_frame_target = rc->per_frame_bandwidth;
-  } else {
-    // Normal frames (gf and inter).
-    rc->this_frame_target = rc->per_frame_bandwidth;
-    // Set target frame size based on buffer level, for 1 pass CBR.
-    if (cpi->pass == 0 && oxcf->end_usage == USAGE_STREAM_FROM_SERVER) {
-      // Need to decide how low min_frame_target should be for 1-pass CBR.
-      // For now, use: cpi->rc.av_per_frame_bandwidth / 16:
-      min_frame_target = MAX(rc->av_per_frame_bandwidth >> 4,
-                             FRAME_OVERHEAD_BITS);
-      rc->this_frame_target = target_size_from_buffer_level(oxcf, rc);
-      // Adjust qp-max based on buffer level.
-      rc->active_worst_quality =
-          adjust_active_worst_quality_from_buffer_level(oxcf, rc);
-    }
+  int min_frame_target;
+  rc->this_frame_target = rc->per_frame_bandwidth;
+
+  if (cpi->pass == 0 && oxcf->end_usage == USAGE_STREAM_FROM_SERVER) {
+    // Need to decide how low min_frame_target should be for 1-pass CBR.
+    // For now, use: cpi->rc.av_per_frame_bandwidth / 16:
+    min_frame_target = MAX(rc->av_per_frame_bandwidth >> 4,
+                           FRAME_OVERHEAD_BITS);
+    rc->this_frame_target = target_size_from_buffer_level(oxcf, rc);
+    // Adjust qp-max based on buffer level.
+    rc->active_worst_quality =
+        adjust_active_worst_quality_from_buffer_level(oxcf, rc);
+
+    if (rc->this_frame_target < min_frame_target)
+      rc->this_frame_target = min_frame_target;
+    return;
   }
 
   // Check that the total sum of adjustments is not above the maximum allowed.
@@ -404,6 +399,9 @@ static void calc_pframe_target_size(VP9_COMP *const cpi) {
   // not capable of recovering all the extra bits we have spent in the KF or GF,
   // then the remainder will have to be recovered over a longer time span via
   // other buffer / rate control mechanisms.
+  min_frame_target = MAX(rc->min_frame_bandwidth,
+                         rc->av_per_frame_bandwidth >> 5);
+
   if (rc->this_frame_target < min_frame_target)
     rc->this_frame_target = min_frame_target;
 
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 65ddead11..eba4b7a92 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -12,66 +12,127 @@
 #ifndef VP9_ENCODER_VP9_RATECTRL_H_
 #define VP9_ENCODER_VP9_RATECTRL_H_
 
-#include "vp9/encoder/vp9_onyx_int.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #define FRAME_OVERHEAD_BITS 200
 
-void vp9_save_coding_context(VP9_COMP *cpi);
-void vp9_restore_coding_context(VP9_COMP *cpi);
-
-void vp9_setup_key_frame(VP9_COMP *cpi);
-void vp9_setup_inter_frame(VP9_COMP *cpi);
+typedef struct {
+  // Rate targetting variables
+  int this_frame_target;
+  int projected_frame_size;
+  int sb64_target_rate;
+  int last_q[3];                   // Separate values for Intra/Inter/ARF-GF
+  int last_boosted_qindex;         // Last boosted GF/KF/ARF q
+
+  int gfu_boost;
+  int last_boost;
+  int kf_boost;
+
+  double rate_correction_factor;
+  double key_frame_rate_correction_factor;
+  double gf_rate_correction_factor;
+
+  unsigned int frames_since_golden;
+  unsigned int frames_till_gf_update_due;  // Count down till next GF
+  unsigned int max_gf_interval;
+  unsigned int baseline_gf_interval;
+  unsigned int frames_to_key;
+  unsigned int frames_since_key;
+  unsigned int this_key_frame_forced;
+  unsigned int next_key_frame_forced;
+  unsigned int source_alt_ref_pending;
+  unsigned int source_alt_ref_active;
+  unsigned int is_src_frame_alt_ref;
+
+  int per_frame_bandwidth;        // Current section per frame bandwidth target
+  int av_per_frame_bandwidth;     // Average frame size target for clip
+  int min_frame_bandwidth;        // Minimum allocation used for any frame
+  int max_frame_bandwidth;        // Maximum burst rate allowed for a frame.
+
+  int ni_av_qi;
+  int ni_tot_qi;
+  int ni_frames;
+  int avg_frame_qindex[3];  // 0 - KEY, 1 - INTER, 2 - ARF/GF
+  double tot_q;
+  double avg_q;
+
+  int buffer_level;
+  int bits_off_target;
+
+  int decimation_factor;
+  int decimation_count;
+
+  int rolling_target_bits;
+  int rolling_actual_bits;
+
+  int long_rolling_target_bits;
+  int long_rolling_actual_bits;
+
+  int64_t total_actual_bits;
+  int total_target_vs_actual;        // debug stats
+
+  int worst_quality;
+  int active_worst_quality;
+  int best_quality;
+  // int active_best_quality;
+} RATE_CONTROL;
+
+struct VP9_COMP;
+
+void vp9_save_coding_context(struct VP9_COMP *cpi);
+void vp9_restore_coding_context(struct VP9_COMP *cpi);
+
+void vp9_setup_key_frame(struct VP9_COMP *cpi);
+void vp9_setup_inter_frame(struct VP9_COMP *cpi);
 
 double vp9_convert_qindex_to_q(int qindex);
 
 // Updates rate correction factors
-void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
+void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi, int damp_var);
 
 // initialize luts for minq
 void vp9_rc_init_minq_luts(void);
 
 // return of 0 means drop frame
 // Changes only rc.this_frame_target and rc.sb64_rate_target
-int vp9_rc_pick_frame_size_target(VP9_COMP *cpi);
+int vp9_rc_pick_frame_size_target(struct VP9_COMP *cpi);
 
-void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi,
+void vp9_rc_compute_frame_size_bounds(const struct VP9_COMP *cpi,
                                       int this_frame_target,
                                       int *frame_under_shoot_limit,
                                       int *frame_over_shoot_limit);
 
 // Picks q and q bounds given the target for bits
-int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi,
+int vp9_rc_pick_q_and_adjust_q_bounds(const struct VP9_COMP *cpi,
                                       int *bottom_index,
                                       int *top_index);
 
 // Estimates q to achieve a target bits per frame
-int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
+int vp9_rc_regulate_q(const struct VP9_COMP *cpi, int target_bits_per_frame,
                       int active_best_quality, int active_worst_quality);
 
 // Post encode update of the rate control parameters based
 // on bytes used
-void vp9_rc_postencode_update(VP9_COMP *cpi,
+void vp9_rc_postencode_update(struct VP9_COMP *cpi,
                               uint64_t bytes_used);
 // for dropped frames
-void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi);
+void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi);
 
 // estimates bits per mb for a given qindex and correction factor
 int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
                        double correction_factor);
 
 // Post encode update of the rate control parameters for 2-pass
-void vp9_twopass_postencode_update(VP9_COMP *cpi,
+void vp9_twopass_postencode_update(struct VP9_COMP *cpi,
                                    uint64_t bytes_used);
 
 // Decide if we should drop this frame: For 1-pass CBR.
-int vp9_drop_frame(VP9_COMP *cpi);
+int vp9_drop_frame(struct VP9_COMP *cpi);
 
 // Update the buffer level.
-void vp9_update_buffer_level(VP9_COMP *cpi, int encoded_frame_size);
+void vp9_update_buffer_level(struct VP9_COMP *cpi, int encoded_frame_size);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 3be79f46b..f375a88ff 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -280,22 +280,24 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
 
   fill_token_costs(x->token_costs, cm->fc.coef_probs);
 
-  for (i = 0; i < PARTITION_CONTEXTS; i++)
-    vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i),
-                    vp9_partition_tree);
-
-  fill_mode_costs(cpi);
-
-  if (!frame_is_intra_only(cm)) {
-    vp9_build_nmv_cost_table(x->nmvjointcost,
-                             cm->allow_high_precision_mv ? x->nmvcost_hp
-                                                         : x->nmvcost,
-                             &cm->fc.nmvc,
-                             cm->allow_high_precision_mv, 1, 1);
-
-    for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
-      vp9_cost_tokens((int *)x->inter_mode_cost[i],
-                      cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
+  if (cpi->compressor_speed != 3) {
+    for (i = 0; i < PARTITION_CONTEXTS; i++)
+      vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i),
+                      vp9_partition_tree);
+
+    fill_mode_costs(cpi);
+
+    if (!frame_is_intra_only(cm)) {
+      vp9_build_nmv_cost_table(x->nmvjointcost,
+                               cm->allow_high_precision_mv ? x->nmvcost_hp
+                                                           : x->nmvcost,
+                               &cm->fc.nmvc,
+                               cm->allow_high_precision_mv, 1, 1);
+
+      for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+        vp9_cost_tokens((int *)x->inter_mode_cost[i],
+                        cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
+    }
   }
 }
 
@@ -419,12 +421,22 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
     struct macroblock_plane *const p = &x->plane[i];
     struct macroblockd_plane *const pd = &xd->plane[i];
     const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+
     (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
                               pd->dst.buf, pd->dst.stride, &sse);
+
     if (i == 0)
       x->pred_sse[ref] = sse;
-
-    dist_sum += (int)sse;
+    if (cpi->compressor_speed > 2) {
+      dist_sum += (int)sse;
+    } else {
+      int rate;
+      int64_t dist;
+      model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
+                               pd->dequant[1] >> 3, &rate, &dist);
+      rate_sum += rate;
+      dist_sum += (int)dist;
+    }
   }
 
   *out_rate_sum = rate_sum;
@@ -1517,8 +1529,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
     vp9_build_inter_predictor(pre, pd->pre[ref].stride,
                               dst, pd->dst.stride,
                               &mi->bmi[i].as_mv[ref].as_mv,
-                              &xd->block_refs[ref]->sf,
-                              width, height, ref, &xd->subpix, MV_PRECISION_Q3,
+                              &xd->block_refs[ref]->sf, width, height, ref,
+                              xd->interp_kernel, MV_PRECISION_Q3,
                               mi_col * MI_SIZE + 4 * (i % 2),
                               mi_row * MI_SIZE + 4 * (i / 2));
   }
@@ -1840,7 +1852,8 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                          &bsi->ref_mv->as_mv,
                                          cm->allow_high_precision_mv,
                                          x->errorperbit, v_fn_ptr,
-                                         0, cpi->sf.subpel_iters_per_step,
+                                         cpi->sf.subpel_force_stop,
+                                         cpi->sf.subpel_iters_per_step,
                                          x->nmvjointcost, x->mvcost,
                                          &distortion,
                                          &x->pred_sse[mbmi->ref_frame[0]]);
@@ -2451,7 +2464,8 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                  cm->allow_high_precision_mv,
                                  x->errorperbit,
                                  &cpi->fn_ptr[bsize],
-                                 0, cpi->sf.subpel_iters_per_step,
+                                 cpi->sf.subpel_force_stop,
+                                 cpi->sf.subpel_iters_per_step,
                                  x->nmvjointcost, x->mvcost,
                                  &dis, &x->pred_sse[ref]);
   }
@@ -2466,6 +2480,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
     for (i = 0; i < MAX_MB_PLANE; i++)
       xd->plane[i].pre[0] = backup_yv12[i];
   }
+  return;
 }
 
 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
@@ -2536,7 +2551,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                               &frame_mv[refs[!id]].as_mv,
                               &xd->block_refs[!id]->sf,
                               pw, ph, 0,
-                              &xd->subpix, MV_PRECISION_Q3,
+                              xd->interp_kernel, MV_PRECISION_Q3,
                               mi_col * MI_SIZE, mi_row * MI_SIZE);
 
     // Compound motion search on first ref frame.
@@ -2783,7 +2798,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         int j;
         int64_t rs_rd;
         mbmi->interp_filter = i;
-        vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+        xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
         rs = get_switchable_rate(x);
         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
 
@@ -2854,7 +2869,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   // Set the appropriate filter
   mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
       cm->interp_filter : *best_filter;
-  vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+  xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
   rs = cm->interp_filter == SWITCHABLE ? get_switchable_rate(x) : 0;
 
   if (pred_exists) {
@@ -3277,13 +3292,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
           continue;
     }
 
-    set_ref_ptrs(cm, xd, ref_frame - 1, second_ref_frame - 1);
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
     mbmi->uv_mode = DC_PRED;
 
     // Evaluate all sub-pel filters irrespective of whether we can use
     // them for this frame.
-    mbmi->interp_filter = cm->interp_filter;
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+                                                          : cm->interp_filter;
+    xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
 
     if (comp_pred) {
       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
@@ -3709,7 +3725,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     vp9_zero(best_tx_diff);
   }
 
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1);
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
   store_coding_context(x, ctx, best_mode_index,
                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
@@ -3902,13 +3918,14 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf))
       continue;
 
-    set_ref_ptrs(cm, xd, ref_frame - 1, second_ref_frame - 1);
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
     mbmi->uv_mode = DC_PRED;
 
     // Evaluate all sub-pel filters irrespective of whether we can use
     // them for this frame.
-    mbmi->interp_filter = cm->interp_filter;
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+                                                          : cm->interp_filter;
+    xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
 
     if (comp_pred) {
       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
@@ -4031,8 +4048,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
             int newbest, rs;
             int64_t rs_rd;
             mbmi->interp_filter = switchable_filter_index;
-            vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
-
+            xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
             tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
                                                  &mbmi->ref_mvs[ref_frame][0],
                                                  second_ref,
@@ -4097,7 +4113,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 
       mbmi->interp_filter = (cm->interp_filter == SWITCHABLE ?
                              tmp_best_filter : cm->interp_filter);
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+      xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
       if (!pred_exists) {
         // Handles the special case when a filter that is not in the
         // switchable list (bilinear, 6-tap) is indicated at the frame level
@@ -4442,7 +4458,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     vp9_zero(best_tx_diff);
   }
 
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1);
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
   store_coding_context(x, ctx, best_mode_index,
                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index c2eea0aaa..e822e4c64 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -60,7 +60,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                             scale,
                             16, 16,
                             which_mv,
-                            &xd->subpix, MV_PRECISION_Q3, x, y);
+                            xd->interp_kernel, MV_PRECISION_Q3, x, y);
 
   vp9_build_inter_predictor(u_mb_ptr, uv_stride,
                             &pred[256], uv_block_size,
@@ -68,7 +68,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                             scale,
                             uv_block_size, uv_block_size,
                             which_mv,
-                            &xd->subpix, mv_precision_uv, x, y);
+                            xd->interp_kernel, mv_precision_uv, x, y);
 
   vp9_build_inter_predictor(v_mb_ptr, uv_stride,
                             &pred[512], uv_block_size,
@@ -76,7 +76,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                             scale,
                             uv_block_size, uv_block_size,
                             which_mv,
-                            &xd->subpix, mv_precision_uv, x, y);
+                            xd->interp_kernel, mv_precision_uv, x, y);
 }
 
 void vp9_temporal_filter_apply_c(uint8_t *frame1,
@@ -392,7 +392,6 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
   const int num_frames_backward = distance;
   const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
                                - (num_frames_backward + 1);
-
   struct scale_factors sf;
 
   switch (blur_type) {
@@ -408,7 +407,6 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
 
     case 2:
       // Forward Blur
-
       frames_to_blur_forward = num_frames_forward;
 
       if (frames_to_blur_forward >= max_frames)
@@ -471,22 +469,24 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
                             strength, &sf);
 }
 
-void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
-                           const int group_boost) {
+void vp9_configure_arnr_filter(VP9_COMP *cpi,
+                               const unsigned int frames_to_arnr,
+                               const int group_boost) {
   int half_gf_int;
   int frames_after_arf;
   int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
   int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
   int q;
 
-  // Define the arnr filter width for this group of frames:
-  // We only filter frames that lie within a distance of half
-  // the GF interval from the ARF frame. We also have to trap
-  // cases where the filter extends beyond the end of clip.
-  // Note: this_frame->frame has been updated in the loop
-  // so it now points at the ARF frame.
+  // Define the arnr filter width for this group of frames. We only
+  // filter frames that lie within a distance of half the GF interval
+  // from the ARF frame. We also have to trap cases where the filter
+  // extends beyond the end of the lookahead buffer.
+  // Note: frames_to_arnr parameter is the offset of the arnr
+  // frame from the current frame.
   half_gf_int = cpi->rc.baseline_gf_interval >> 1;
-  frames_after_arf = (int)(cpi->twopass.total_stats.count - this_frame - 1);
+  frames_after_arf = vp9_lookahead_depth(cpi->lookahead)
+      - frames_to_arnr - 1;
 
   switch (cpi->oxcf.arnr_type) {
     case 1:  // Backward filter
diff --git a/vp9/encoder/vp9_temporal_filter.h b/vp9/encoder/vp9_temporal_filter.h
index eea2f1018..3028d7884 100644
--- a/vp9/encoder/vp9_temporal_filter.h
+++ b/vp9/encoder/vp9_temporal_filter.h
@@ -16,8 +16,9 @@ extern "C" {
 #endif
 
 void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance);
-void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
-                           const int group_boost);
+void vp9_configure_arnr_filter(VP9_COMP *cpi,
+                               const unsigned int frames_to_arnr,
+                               const int group_boost);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vp9/encoder/x86/vp9_dct_avx2.c
index d81b72bba..ea031fb07 100644
--- a/vp9/encoder/x86/vp9_dct_avx2.c
+++ b/vp9/encoder/x86/vp9_dct_avx2.c
@@ -163,7 +163,7 @@ static INLINE void transpose_4x4_avx2(__m128i *res) {
   res[3] = _mm_unpackhi_epi64(res[2], res[2]);
 }
 
-void fdct4_1d_avx2(__m128i *in) {
+void fdct4_avx2(__m128i *in) {
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
@@ -196,7 +196,7 @@ void fdct4_1d_avx2(__m128i *in) {
   transpose_4x4_avx2(in);
 }
 
-void fadst4_1d_avx2(__m128i *in) {
+void fadst4_avx2(__m128i *in) {
   const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
   const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
   const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
@@ -250,20 +250,20 @@ void vp9_short_fht4x4_avx2(const int16_t *input, int16_t *output,
   load_buffer_4x4_avx2(input, in, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct4_1d_avx2(in);
-      fdct4_1d_avx2(in);
+      fdct4_avx2(in);
+      fdct4_avx2(in);
       break;
     case 1:  // ADST_DCT
-      fadst4_1d_avx2(in);
-      fdct4_1d_avx2(in);
+      fadst4_avx2(in);
+      fdct4_avx2(in);
       break;
     case 2:  // DCT_ADST
-      fdct4_1d_avx2(in);
-      fadst4_1d_avx2(in);
+      fdct4_avx2(in);
+      fadst4_avx2(in);
       break;
     case 3:  // ADST_ADST
-      fadst4_1d_avx2(in);
-      fadst4_1d_avx2(in);
+      fadst4_avx2(in);
+      fadst4_avx2(in);
       break;
     default:
       assert(0);
@@ -658,7 +658,7 @@ static INLINE void array_transpose_8x8_avx2(__m128i *in, __m128i *res) {
   // 07 17 27 37 47 57 67 77
 }
 
-void fdct8_1d_avx2(__m128i *in) {
+void fdct8_avx2(__m128i *in) {
   // constants
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
@@ -798,7 +798,7 @@ void fdct8_1d_avx2(__m128i *in) {
   array_transpose_8x8_avx2(in, in);
 }
 
-void fadst8_1d_avx2(__m128i *in) {
+void fadst8_avx2(__m128i *in) {
   // Constants
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
@@ -1034,20 +1034,20 @@ void vp9_short_fht8x8_avx2(const int16_t *input, int16_t *output,
   load_buffer_8x8_avx2(input, in, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct8_1d_avx2(in);
-      fdct8_1d_avx2(in);
+      fdct8_avx2(in);
+      fdct8_avx2(in);
       break;
     case 1:  // ADST_DCT
-      fadst8_1d_avx2(in);
-      fdct8_1d_avx2(in);
+      fadst8_avx2(in);
+      fdct8_avx2(in);
       break;
     case 2:  // DCT_ADST
-      fdct8_1d_avx2(in);
-      fadst8_1d_avx2(in);
+      fdct8_avx2(in);
+      fadst8_avx2(in);
       break;
     case 3:  // ADST_ADST
-      fadst8_1d_avx2(in);
-      fadst8_1d_avx2(in);
+      fadst8_avx2(in);
+      fadst8_avx2(in);
       break;
     default:
       assert(0);
@@ -1216,7 +1216,7 @@ void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
         step1_6 = _mm_sub_epi16(in01, in14);
         step1_7 = _mm_sub_epi16(in00, in15);
       }
-      // Work on the first eight values; fdct8_1d(input, even_results);
+      // Work on the first eight values; fdct8(input, even_results);
       {
         // Add/substract
         const __m128i q0 = _mm_add_epi16(input0, input7);
@@ -1730,7 +1730,7 @@ static INLINE void right_shift_16x16_avx2(__m128i *res0, __m128i *res1) {
   right_shift_8x8_avx2(res1 + 8, 2);
 }
 
-void fdct16_1d_8col_avx2(__m128i *in) {
+void fdct16_8col_avx2(__m128i *in) {
   // perform 16x16 1-D DCT for 8 columns
   __m128i i[8], s[8], p[8], t[8], u[16], v[16];
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
@@ -2052,7 +2052,7 @@ void fdct16_1d_8col_avx2(__m128i *in) {
   in[15] = _mm_packs_epi32(v[14], v[15]);
 }
 
-void fadst16_1d_8col_avx2(__m128i *in) {
+void fadst16_8col_avx2(__m128i *in) {
   // perform 16x16 1-D ADST for 8 columns
   __m128i s[16], x[16], u[32], v[32];
   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -2522,15 +2522,15 @@ void fadst16_1d_8col_avx2(__m128i *in) {
   in[15] = _mm_sub_epi16(kZero, s[1]);
 }
 
-void fdct16_1d_avx2(__m128i *in0, __m128i *in1) {
-  fdct16_1d_8col_avx2(in0);
-  fdct16_1d_8col_avx2(in1);
+void fdct16_avx2(__m128i *in0, __m128i *in1) {
+  fdct16_8col_avx2(in0);
+  fdct16_8col_avx2(in1);
   array_transpose_16x16_avx2(in0, in1);
 }
 
-void fadst16_1d_avx2(__m128i *in0, __m128i *in1) {
-  fadst16_1d_8col_avx2(in0);
-  fadst16_1d_8col_avx2(in1);
+void fadst16_avx2(__m128i *in0, __m128i *in1) {
+  fadst16_8col_avx2(in0);
+  fadst16_8col_avx2(in1);
   array_transpose_16x16_avx2(in0, in1);
 }
 
@@ -2540,24 +2540,24 @@ void vp9_short_fht16x16_avx2(const int16_t *input, int16_t *output,
   load_buffer_16x16_avx2(input, in0, in1, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct16_1d_avx2(in0, in1);
+      fdct16_avx2(in0, in1);
       right_shift_16x16_avx2(in0, in1);
-      fdct16_1d_avx2(in0, in1);
+      fdct16_avx2(in0, in1);
       break;
     case 1:  // ADST_DCT
-      fadst16_1d_avx2(in0, in1);
+      fadst16_avx2(in0, in1);
       right_shift_16x16_avx2(in0, in1);
-      fdct16_1d_avx2(in0, in1);
+      fdct16_avx2(in0, in1);
       break;
     case 2:  // DCT_ADST
-      fdct16_1d_avx2(in0, in1);
+      fdct16_avx2(in0, in1);
       right_shift_16x16_avx2(in0, in1);
-      fadst16_1d_avx2(in0, in1);
+      fadst16_avx2(in0, in1);
       break;
     case 3:  // ADST_ADST
-      fadst16_1d_avx2(in0, in1);
+      fadst16_avx2(in0, in1);
       right_shift_16x16_avx2(in0, in1);
-      fadst16_1d_avx2(in0, in1);
+      fadst16_avx2(in0, in1);
       break;
     default:
       assert(0);
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 65431bdbf..c876cc273 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -161,7 +161,7 @@ static INLINE void transpose_4x4(__m128i *res) {
   res[3] = _mm_unpackhi_epi64(res[2], res[2]);
 }
 
-void fdct4_1d_sse2(__m128i *in) {
+void fdct4_sse2(__m128i *in) {
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
@@ -194,7 +194,7 @@ void fdct4_1d_sse2(__m128i *in) {
   transpose_4x4(in);
 }
 
-void fadst4_1d_sse2(__m128i *in) {
+void fadst4_sse2(__m128i *in) {
   const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
   const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
   const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
@@ -248,20 +248,20 @@ void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
   load_buffer_4x4(input, in, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct4_1d_sse2(in);
-      fdct4_1d_sse2(in);
+      fdct4_sse2(in);
+      fdct4_sse2(in);
       break;
     case 1:  // ADST_DCT
-      fadst4_1d_sse2(in);
-      fdct4_1d_sse2(in);
+      fadst4_sse2(in);
+      fdct4_sse2(in);
       break;
     case 2:  // DCT_ADST
-      fdct4_1d_sse2(in);
-      fadst4_1d_sse2(in);
+      fdct4_sse2(in);
+      fadst4_sse2(in);
       break;
     case 3:  // ADST_ADST
-      fadst4_1d_sse2(in);
-      fadst4_1d_sse2(in);
+      fadst4_sse2(in);
+      fadst4_sse2(in);
       break;
     default:
       assert(0);
@@ -656,7 +656,7 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
   // 07 17 27 37 47 57 67 77
 }
 
-void fdct8_1d_sse2(__m128i *in) {
+void fdct8_sse2(__m128i *in) {
   // constants
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
@@ -796,7 +796,7 @@ void fdct8_1d_sse2(__m128i *in) {
   array_transpose_8x8(in, in);
 }
 
-void fadst8_1d_sse2(__m128i *in) {
+void fadst8_sse2(__m128i *in) {
   // Constants
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
@@ -1032,20 +1032,20 @@ void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
   load_buffer_8x8(input, in, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct8_1d_sse2(in);
-      fdct8_1d_sse2(in);
+      fdct8_sse2(in);
+      fdct8_sse2(in);
       break;
     case 1:  // ADST_DCT
-      fadst8_1d_sse2(in);
-      fdct8_1d_sse2(in);
+      fadst8_sse2(in);
+      fdct8_sse2(in);
       break;
     case 2:  // DCT_ADST
-      fdct8_1d_sse2(in);
-      fadst8_1d_sse2(in);
+      fdct8_sse2(in);
+      fadst8_sse2(in);
       break;
     case 3:  // ADST_ADST
-      fadst8_1d_sse2(in);
-      fadst8_1d_sse2(in);
+      fadst8_sse2(in);
+      fadst8_sse2(in);
       break;
     default:
       assert(0);
@@ -1214,7 +1214,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
         step1_6 = _mm_sub_epi16(in01, in14);
         step1_7 = _mm_sub_epi16(in00, in15);
       }
-      // Work on the first eight values; fdct8_1d(input, even_results);
+      // Work on the first eight values; fdct8(input, even_results);
       {
         // Add/substract
         const __m128i q0 = _mm_add_epi16(input0, input7);
@@ -1728,7 +1728,7 @@ static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
   right_shift_8x8(res1 + 8, 2);
 }
 
-void fdct16_1d_8col(__m128i *in) {
+void fdct16_8col(__m128i *in) {
   // perform 16x16 1-D DCT for 8 columns
   __m128i i[8], s[8], p[8], t[8], u[16], v[16];
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
@@ -2050,7 +2050,7 @@ void fdct16_1d_8col(__m128i *in) {
   in[15] = _mm_packs_epi32(v[14], v[15]);
 }
 
-void fadst16_1d_8col(__m128i *in) {
+void fadst16_8col(__m128i *in) {
   // perform 16x16 1-D ADST for 8 columns
   __m128i s[16], x[16], u[32], v[32];
   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -2520,15 +2520,15 @@ void fadst16_1d_8col(__m128i *in) {
   in[15] = _mm_sub_epi16(kZero, s[1]);
 }
 
-void fdct16_1d_sse2(__m128i *in0, __m128i *in1) {
-  fdct16_1d_8col(in0);
-  fdct16_1d_8col(in1);
+void fdct16_sse2(__m128i *in0, __m128i *in1) {
+  fdct16_8col(in0);
+  fdct16_8col(in1);
   array_transpose_16x16(in0, in1);
 }
 
-void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
-  fadst16_1d_8col(in0);
-  fadst16_1d_8col(in1);
+void fadst16_sse2(__m128i *in0, __m128i *in1) {
+  fadst16_8col(in0);
+  fadst16_8col(in1);
   array_transpose_16x16(in0, in1);
 }
 
@@ -2538,24 +2538,24 @@ void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
   load_buffer_16x16(input, in0, in1, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct16_1d_sse2(in0, in1);
+      fdct16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
-      fdct16_1d_sse2(in0, in1);
+      fdct16_sse2(in0, in1);
       break;
     case 1:  // ADST_DCT
-      fadst16_1d_sse2(in0, in1);
+      fadst16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
-      fdct16_1d_sse2(in0, in1);
+      fdct16_sse2(in0, in1);
       break;
     case 2:  // DCT_ADST
-      fdct16_1d_sse2(in0, in1);
+      fdct16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
-      fadst16_1d_sse2(in0, in1);
+      fadst16_sse2(in0, in1);
       break;
     case 3:  // ADST_ADST
-      fadst16_1d_sse2(in0, in1);
+      fadst16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
-      fadst16_1d_sse2(in0, in1);
+      fadst16_sse2(in0, in1);
       break;
     default:
       assert(0);