37 files changed, 1285 insertions, 1166 deletions
diff --git a/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm b/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
index 6b20cb9bf..4d85846f0 100644
--- a/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
+++ b/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
@@ -78,7 +78,7 @@
 
     mov             r10, r6                 ; w loop counter
 
-loop_horiz_v
+vp9_convolve8_avg_loop_horiz_v
     vld1.8          {d24}, [r0], r1
     vld1.8          {d25}, [r0], r1
     vld1.8          {d26}, [r0], r1
@@ -101,7 +101,7 @@ loop_horiz_v
 
     add             r0, r0, #3
 
-loop_horiz
+vp9_convolve8_avg_loop_horiz
     add             r5, r0, #64
 
     vld1.32         {d28[]}, [r0], r1
@@ -170,14 +170,14 @@ loop_horiz
     vmov            q9,  q13
 
     subs            r6, r6, #4              ; w -= 4
-    bgt             loop_horiz
+    bgt             vp9_convolve8_avg_loop_horiz
 
     ; outer loop
     mov             r6, r10                 ; restore w counter
     add             r0, r0, r9              ; src += src_stride * 4 - w
     add             r2, r2, r12             ; dst += dst_stride * 4 - w
     subs            r7, r7, #4              ; h -= 4
-    bgt loop_horiz_v
+    bgt vp9_convolve8_avg_loop_horiz_v
 
     pop             {r4-r10, pc}
 
@@ -203,7 +203,7 @@ loop_horiz
     lsl             r1, r1, #1
     lsl             r3, r3, #1
 
-loop_vert_h
+vp9_convolve8_avg_loop_vert_h
     mov             r4, r0
     add             r7, r0, r1, asr #1
     mov             r5, r2
@@ -223,7 +223,7 @@ loop_vert_h
     vmovl.u8        q10, d20
     vmovl.u8        q11, d22
 
-loop_vert
+vp9_convolve8_avg_loop_vert
     ; always process a 4x4 block at a time
     vld1.u32        {d24[0]}, [r7], r1
     vld1.u32        {d26[0]}, [r4], r1
@@ -288,13 +288,13 @@ loop_vert
     vmov            d22, d25
 
     subs            r12, r12, #4            ; h -= 4
-    bgt             loop_vert
+    bgt             vp9_convolve8_avg_loop_vert
 
     ; outer loop
     add             r0, r0, #4
     add             r2, r2, #4
     subs            r6, r6, #4              ; w -= 4
-    bgt             loop_vert_h
+    bgt             vp9_convolve8_avg_loop_vert_h
 
     pop             {r4-r8, pc}
 
diff --git a/vp9/common/arm/neon/vp9_convolve8_neon.asm b/vp9/common/arm/neon/vp9_convolve8_neon.asm
index 45258454c..184c3ad67 100644
--- a/vp9/common/arm/neon/vp9_convolve8_neon.asm
+++ b/vp9/common/arm/neon/vp9_convolve8_neon.asm
@@ -78,7 +78,7 @@
 
     mov             r10, r6                 ; w loop counter
 
-loop_horiz_v
+vp9_convolve8_loop_horiz_v
     vld1.8          {d24}, [r0], r1
     vld1.8          {d25}, [r0], r1
     vld1.8          {d26}, [r0], r1
@@ -101,7 +101,7 @@ loop_horiz_v
 
     add             r0, r0, #3
 
-loop_horiz
+vp9_convolve8_loop_horiz
     add             r5, r0, #64
 
     vld1.32         {d28[]}, [r0], r1
@@ -159,14 +159,14 @@ loop_horiz
     vmov            q9,  q13
 
     subs            r6, r6, #4              ; w -= 4
-    bgt             loop_horiz
+    bgt             vp9_convolve8_loop_horiz
 
     ; outer loop
     mov             r6, r10                 ; restore w counter
     add             r0, r0, r9              ; src += src_stride * 4 - w
     add             r2, r2, r12             ; dst += dst_stride * 4 - w
     subs            r7, r7, #4              ; h -= 4
-    bgt loop_horiz_v
+    bgt vp9_convolve8_loop_horiz_v
 
     pop             {r4-r10, pc}
 
@@ -192,7 +192,7 @@ loop_horiz
     lsl             r1, r1, #1
     lsl             r3, r3, #1
 
-loop_vert_h
+vp9_convolve8_loop_vert_h
     mov             r4, r0
     add             r7, r0, r1, asr #1
     mov             r5, r2
@@ -212,7 +212,7 @@ loop_vert_h
     vmovl.u8        q10, d20
     vmovl.u8        q11, d22
 
-loop_vert
+vp9_convolve8_loop_vert
     ; always process a 4x4 block at a time
     vld1.u32        {d24[0]}, [r7], r1
     vld1.u32        {d26[0]}, [r4], r1
@@ -266,13 +266,13 @@ loop_vert
     vmov            d22, d25
 
     subs            r12, r12, #4            ; h -= 4
-    bgt             loop_vert
+    bgt             vp9_convolve8_loop_vert
 
     ; outer loop
     add             r0, r0, #4
     add             r2, r2, #4
     subs            r6, r6, #4              ; w -= 4
-    bgt             loop_vert_h
+    bgt             vp9_convolve8_loop_vert_h
 
     pop             {r4-r8, pc}
 
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index c3fdeb48a..77a8709f0 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -748,10 +748,10 @@ void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) {
 }
 
 void vp9_default_coef_probs(VP9_COMMON *cm) {
-  vp9_copy(cm->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
-  vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
-  vp9_copy(cm->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
-  vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
+  vp9_copy(cm->fc->coef_probs[TX_4X4], default_coef_probs_4x4);
+  vp9_copy(cm->fc->coef_probs[TX_8X8], default_coef_probs_8x8);
+  vp9_copy(cm->fc->coef_probs[TX_16X16], default_coef_probs_16x16);
+  vp9_copy(cm->fc->coef_probs[TX_32X32], default_coef_probs_32x32);
 }
 
 #define COEF_COUNT_SAT 24
@@ -765,7 +765,7 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
                              unsigned int count_sat,
                              unsigned int update_factor) {
   const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
-  vp9_coeff_probs_model *const probs = cm->fc.coef_probs[tx_size];
+  vp9_coeff_probs_model *const probs = cm->fc->coef_probs[tx_size];
   const vp9_coeff_probs_model *const pre_probs = pre_fc->coef_probs[tx_size];
   vp9_coeff_count_model *counts = cm->counts.coef[tx_size];
   unsigned int (*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 5b00b0082..1a24572ba 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -350,7 +350,7 @@ static void adapt_probs(const vp9_tree_index *tree,
 
 void vp9_adapt_mode_probs(VP9_COMMON *cm) {
   int i, j;
-  FRAME_CONTEXT *fc = &cm->fc;
+  FRAME_CONTEXT *fc = cm->fc;
   const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
   const FRAME_COUNTS *counts = &cm->counts;
 
@@ -451,17 +451,17 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
   lf->last_sharpness_level = -1;
 
   vp9_default_coef_probs(cm);
-  vp9_init_mode_probs(&cm->fc);
+  vp9_init_mode_probs(cm->fc);
   vp9_init_mv_probs(cm);
 
   if (cm->frame_type == KEY_FRAME ||
       cm->error_resilient_mode || cm->reset_frame_context == 3) {
     // Reset all frame contexts.
     for (i = 0; i < FRAME_CONTEXTS; ++i)
-      cm->frame_contexts[i] = cm->fc;
+      cm->frame_contexts[i] = *cm->fc;
   } else if (cm->reset_frame_context == 2) {
     // Reset only the frame context specified in the frame header.
-    cm->frame_contexts[cm->frame_context_idx] = cm->fc;
+    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
   }
 
   if (frame_is_intra_only(cm))
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index 5bb048202..922c03947 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -196,7 +196,7 @@ static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
 void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
   int i, j;
 
-  nmv_context *fc = &cm->fc.nmvc;
+  nmv_context *fc = &cm->fc->nmvc;
   const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc;
   const nmv_context_counts *counts = &cm->counts.mv;
 
@@ -229,5 +229,5 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
 }
 
 void vp9_init_mv_probs(VP9_COMMON *cm) {
-  cm->fc.nmvc = default_nmv_context;
+  cm->fc->nmvc = default_nmv_context;
 }
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index aca8d7b33..43a4fe5b9 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -1625,6 +1625,17 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
                        y_only);
 }
 
+void vp9_loop_filter_data_reset(
+    LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+    struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]) {
+  lf_data->frame_buffer = frame_buffer;
+  lf_data->cm = cm;
+  lf_data->start = 0;
+  lf_data->stop = 0;
+  lf_data->y_only = 0;
+  vpx_memcpy(lf_data->planes, planes, sizeof(lf_data->planes));
+}
+
 int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
   (void)unused;
   vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index 0ede58ae4..4c15e6bd4 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -124,11 +124,12 @@ typedef struct LoopFilterWorkerData {
   int start;
   int stop;
   int y_only;
-
-  struct VP9LfSyncData *lf_sync;
-  int num_lf_workers;
 } LFWorkerData;
 
+void vp9_loop_filter_data_reset(
+    LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+    struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]);
+
 // Operates on the rows described by 'lf_data'.
 int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
 #ifdef __cplusplus
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index f1eda9117..b818ae818 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -169,8 +169,8 @@ typedef struct VP9Common {
   MV_REFERENCE_FRAME comp_var_ref[2];
   REFERENCE_MODE reference_mode;
 
-  FRAME_CONTEXT fc;  /* this frame entropy */
-  FRAME_CONTEXT frame_contexts[FRAME_CONTEXTS];
+  FRAME_CONTEXT *fc;  /* this frame entropy */
+  FRAME_CONTEXT *frame_contexts;   // FRAME_CONTEXTS
   unsigned int  frame_context_idx; /* Context to use/update */
   FRAME_COUNTS counts;
 
@@ -261,7 +261,7 @@ static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
 static INLINE const vp9_prob* get_partition_probs(const VP9_COMMON *cm,
                                                   int ctx) {
   return frame_is_intra_only(cm) ? vp9_kf_partition_probs[ctx]
-                                 : cm->fc.partition_prob[ctx];
+                                 : cm->fc->partition_prob[ctx];
 }
 
 static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) {
diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index 39774f142..cf13e4a91 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -54,7 +54,7 @@ static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) {
 
 static INLINE vp9_prob vp9_get_skip_prob(const VP9_COMMON *cm,
                                          const MACROBLOCKD *xd) {
-  return cm->fc.skip_probs[vp9_get_skip_context(xd)];
+  return cm->fc->skip_probs[vp9_get_skip_context(xd)];
 }
 
 int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
@@ -63,14 +63,14 @@ int vp9_get_intra_inter_context(const MACROBLOCKD *xd);
 
 static INLINE vp9_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm,
                                                 const MACROBLOCKD *xd) {
-  return cm->fc.intra_inter_prob[vp9_get_intra_inter_context(xd)];
+  return cm->fc->intra_inter_prob[vp9_get_intra_inter_context(xd)];
 }
 
 int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd);
 
 static INLINE vp9_prob vp9_get_reference_mode_prob(const VP9_COMMON *cm,
                                                    const MACROBLOCKD *xd) {
-  return cm->fc.comp_inter_prob[vp9_get_reference_mode_context(cm, xd)];
+  return cm->fc->comp_inter_prob[vp9_get_reference_mode_context(cm, xd)];
 }
 
 int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
@@ -79,21 +79,21 @@ int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
 static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm,
                                                     const MACROBLOCKD *xd) {
   const int pred_context = vp9_get_pred_context_comp_ref_p(cm, xd);
-  return cm->fc.comp_ref_prob[pred_context];
+  return cm->fc->comp_ref_prob[pred_context];
 }
 
 int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
 
 static INLINE vp9_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm,
                                                        const MACROBLOCKD *xd) {
-  return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0];
+  return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0];
 }
 
 int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
 
 static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
                                                        const MACROBLOCKD *xd) {
-  return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1];
+  return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1];
 }
 
 int vp9_get_tx_size_context(const MACROBLOCKD *xd);
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index dc712f045..66da63ac6 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -127,7 +127,7 @@ static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm,
 }
 
 static void read_frame_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) {
-  FRAME_CONTEXT *const fc = &cm->fc;
+  FRAME_CONTEXT *const fc = cm->fc;
   int i;
 
   if (cm->reference_mode == REFERENCE_MODE_SELECT)
@@ -902,11 +902,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
     LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
     // Be sure to sync as we might be resuming after a failed frame decode.
     winterface->sync(&pbi->lf_worker);
-    lf_data->frame_buffer = get_frame_new_buffer(cm);
-    lf_data->cm = cm;
-    vp9_copy(lf_data->planes, pbi->mb.plane);
-    lf_data->stop = 0;
-    lf_data->y_only = 0;
+    vp9_loop_filter_data_reset(lf_data, get_frame_new_buffer(cm), cm,
+                               pbi->mb.plane);
     vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
   }
 
@@ -1065,14 +1062,19 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
     // use num_threads - 1 workers.
     CHECK_MEM_ERROR(cm, pbi->tile_workers,
                     vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
+    // Ensure tile data offsets will be properly aligned. The assert below may
+    // fail on platforms without DECLARE_ALIGNED().
+    assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
+    CHECK_MEM_ERROR(cm, pbi->tile_worker_data,
+                    vpx_memalign(32, num_threads *
+                                 sizeof(*pbi->tile_worker_data)));
+    CHECK_MEM_ERROR(cm, pbi->tile_worker_info,
+                    vpx_malloc(num_threads * sizeof(*pbi->tile_worker_info)));
     for (i = 0; i < num_threads; ++i) {
       VP9Worker *const worker = &pbi->tile_workers[i];
       ++pbi->num_tile_workers;
 
       winterface->init(worker);
-      CHECK_MEM_ERROR(cm, worker->data1,
-                      vpx_memalign(32, sizeof(TileWorkerData)));
-      CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
       if (i < num_threads - 1 && !winterface->reset(worker)) {
         vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                            "Tile decoder thread creation failed");
@@ -1082,8 +1084,11 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
 
   // Reset tile decoding hook
   for (n = 0; n < num_workers; ++n) {
-    winterface->sync(&pbi->tile_workers[n]);
-    pbi->tile_workers[n].hook = (VP9WorkerHook)tile_worker_hook;
+    VP9Worker *const worker = &pbi->tile_workers[n];
+    winterface->sync(worker);
+    worker->hook = (VP9WorkerHook)tile_worker_hook;
+    worker->data1 = &pbi->tile_worker_data[n];
+    worker->data2 = &pbi->tile_worker_info[n];
   }
 
   // Note: this memset assumes above_context[0], [1] and [2]
@@ -1386,7 +1391,7 @@ static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data,
                                   size_t partition_size) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  FRAME_CONTEXT *const fc = &cm->fc;
+  FRAME_CONTEXT *const fc = cm->fc;
   vp9_reader r;
   int k;
 
@@ -1540,7 +1545,7 @@ void vp9_decode_frame(VP9Decoder *pbi,
   setup_plane_dequants(cm, xd, cm->base_qindex);
   vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
 
-  cm->fc = cm->frame_contexts[cm->frame_context_idx];
+  *cm->fc = cm->frame_contexts[cm->frame_context_idx];
   vp9_zero(cm->counts);
   vp9_zero(xd->dqcoeff);
 
@@ -1555,7 +1560,9 @@ void vp9_decode_frame(VP9Decoder *pbi,
     if (!xd->corrupted) {
       // If multiple threads are used to decode tiles, then we use those threads
       // to do parallel loopfiltering.
-      vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);
+      vp9_loop_filter_frame_mt(&pbi->lf_row_sync, new_fb, pbi->mb.plane, cm,
+                               pbi->tile_workers, pbi->num_tile_workers,
+                               cm->lf.filter_level, 0);
     }
   } else {
     *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
@@ -1580,5 +1587,5 @@ void vp9_decode_frame(VP9Decoder *pbi,
   }
 
   if (cm->refresh_frame_context)
-    cm->frame_contexts[cm->frame_context_idx] = cm->fc;
+    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
 }
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index a01fe842e..d0e0b76da 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -30,7 +30,7 @@ static PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
 static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r,
                                             int size_group) {
   const PREDICTION_MODE y_mode =
-      read_intra_mode(r, cm->fc.y_mode_prob[size_group]);
+      read_intra_mode(r, cm->fc->y_mode_prob[size_group]);
   if (!cm->frame_parallel_decoding_mode)
     ++cm->counts.y_mode[size_group][y_mode];
   return y_mode;
@@ -39,7 +39,7 @@ static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r,
 static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r,
                                           PREDICTION_MODE y_mode) {
   const PREDICTION_MODE uv_mode = read_intra_mode(r,
-                                         cm->fc.uv_mode_prob[y_mode]);
+                                         cm->fc->uv_mode_prob[y_mode]);
   if (!cm->frame_parallel_decoding_mode)
     ++cm->counts.uv_mode[y_mode][uv_mode];
   return uv_mode;
@@ -47,7 +47,7 @@ static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r,
 
 static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r, int ctx) {
   const int mode = vp9_read_tree(r, vp9_inter_mode_tree,
-                                 cm->fc.inter_mode_probs[ctx]);
+                                 cm->fc->inter_mode_probs[ctx]);
   if (!cm->frame_parallel_decoding_mode)
     ++cm->counts.inter_mode[ctx][mode];
 
@@ -61,7 +61,7 @@ static int read_segment_id(vp9_reader *r, const struct segmentation *seg) {
 static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
                                      TX_SIZE max_tx_size, vp9_reader *r) {
   const int ctx = vp9_get_tx_size_context(xd);
-  const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc.tx_probs);
+  const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs);
   int tx_size = vp9_read(r, tx_probs[0]);
   if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
     tx_size += vp9_read(r, tx_probs[1]);
@@ -150,7 +150,7 @@ static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
     return 1;
   } else {
     const int ctx = vp9_get_skip_context(xd);
-    const int skip = vp9_read(r, cm->fc.skip_probs[ctx]);
+    const int skip = vp9_read(r, cm->fc->skip_probs[ctx]);
     if (!cm->frame_parallel_decoding_mode)
       ++cm->counts.skip[ctx][skip];
     return skip;
@@ -258,7 +258,7 @@ static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm,
   if (cm->reference_mode == REFERENCE_MODE_SELECT) {
     const int ctx = vp9_get_reference_mode_context(cm, xd);
     const REFERENCE_MODE mode =
-        (REFERENCE_MODE)vp9_read(r, cm->fc.comp_inter_prob[ctx]);
+        (REFERENCE_MODE)vp9_read(r, cm->fc->comp_inter_prob[ctx]);
     if (!cm->frame_parallel_decoding_mode)
       ++cm->counts.comp_inter[ctx][mode];
     return mode;  // SINGLE_REFERENCE or COMPOUND_REFERENCE
@@ -271,7 +271,7 @@ static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm,
 static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                             vp9_reader *r,
                             int segment_id, MV_REFERENCE_FRAME ref_frame[2]) {
-  FRAME_CONTEXT *const fc = &cm->fc;
+  FRAME_CONTEXT *const fc = cm->fc;
   FRAME_COUNTS *const counts = &cm->counts;
 
   if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
@@ -317,7 +317,7 @@ static INLINE INTERP_FILTER read_switchable_interp_filter(
   const int ctx = vp9_get_pred_context_switchable_interp(xd);
   const INTERP_FILTER type =
       (INTERP_FILTER)vp9_read_tree(r, vp9_switchable_interp_tree,
-                                   cm->fc.switchable_interp_prob[ctx]);
+                                   cm->fc->switchable_interp_prob[ctx]);
   if (!cm->frame_parallel_decoding_mode)
     ++cm->counts.switchable_interp[ctx][type];
   return type;
@@ -372,7 +372,7 @@ static INLINE int assign_mv(VP9_COMMON *cm, PREDICTION_MODE mode,
       nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ?
                                             NULL : &cm->counts.mv;
       for (i = 0; i < 1 + is_compound; ++i) {
-        read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc.nmvc, mv_counts,
+        read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc->nmvc, mv_counts,
                 allow_hp);
         ret = ret && is_mv_valid(&mv[i].as_mv);
       }
@@ -410,7 +410,7 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
            INTRA_FRAME;
   } else {
     const int ctx = vp9_get_intra_inter_context(xd);
-    const int is_inter = vp9_read(r, cm->fc.intra_inter_prob[ctx]);
+    const int is_inter = vp9_read(r, cm->fc->intra_inter_prob[ctx]);
     if (!cm->frame_parallel_decoding_mode)
       ++cm->counts.intra_inter[ctx][is_inter];
     return is_inter;
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index baf6ab7ef..fa2f01041 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -59,6 +59,13 @@ VP9Decoder *vp9_decoder_create() {
   }
 
   cm->error.setjmp = 1;
+
+  CHECK_MEM_ERROR(cm, cm->fc,
+                  (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(cm, cm->frame_contexts,
+                  (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS,
+                  sizeof(*cm->frame_contexts)));
+
   pbi->need_resync = 1;
   initialize_dec();
 
@@ -88,15 +95,20 @@ void vp9_decoder_remove(VP9Decoder *pbi) {
   VP9_COMMON *const cm = &pbi->common;
   int i;
 
+  vpx_free(cm->fc);
+  cm->fc = NULL;
+  vpx_free(cm->frame_contexts);
+  cm->frame_contexts = NULL;
+
   vp9_get_worker_interface()->end(&pbi->lf_worker);
   vpx_free(pbi->lf_worker.data1);
   vpx_free(pbi->tile_data);
   for (i = 0; i < pbi->num_tile_workers; ++i) {
     VP9Worker *const worker = &pbi->tile_workers[i];
     vp9_get_worker_interface()->end(worker);
-    vpx_free(worker->data1);
-    vpx_free(worker->data2);
   }
+  vpx_free(pbi->tile_worker_data);
+  vpx_free(pbi->tile_worker_info);
   vpx_free(pbi->tile_workers);
 
   if (pbi->num_tile_workers > 0) {
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 4f52bb9c4..25b7339ed 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -46,6 +46,8 @@ typedef struct VP9Decoder {
 
   VP9Worker lf_worker;
   VP9Worker *tile_workers;
+  TileWorkerData *tile_worker_data;
+  TileInfo *tile_worker_info;
   int num_tile_workers;
 
   TileData *tile_data;
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 421229a28..8704fddac 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -58,7 +58,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
                         int ctx, const int16_t *scan, const int16_t *nb,
                         vp9_reader *r) {
   const int max_eob = 16 << (tx_size << 1);
-  const FRAME_CONTEXT *const fc = &cm->fc;
+  const FRAME_CONTEXT *const fc = cm->fc;
   FRAME_COUNTS *const counts = &cm->counts;
   const int ref = is_inter_block(&xd->mi[0].src_mi->mbmi);
   int band, c = 0;
diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c
index 69e4fde85..3d2d0dd2e 100644
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -92,12 +92,12 @@ static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
                                 VP9_COMMON *const cm,
                                 struct macroblockd_plane planes[MAX_MB_PLANE],
                                 int start, int stop, int y_only,
-                                VP9LfSync *const lf_sync, int num_lf_workers) {
+                                VP9LfSync *const lf_sync) {
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
   int r, c;  // SB row and col
   const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
 
-  for (r = start; r < stop; r += num_lf_workers) {
+  for (r = start; r < stop; r += lf_sync->num_workers) {
     const int mi_row = r << MI_BLOCK_SIZE_LOG2;
     MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;
 
@@ -121,35 +121,35 @@ static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
 }
 
 // Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(TileWorkerData *const tile_data,
-                                  void *unused) {
-  LFWorkerData *const lf_data = &tile_data->lfdata;
-  (void)unused;
+static int loop_filter_row_worker(VP9LfSync *const lf_sync,
+                                  LFWorkerData *const lf_data) {
   loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
-                      lf_data->start, lf_data->stop, lf_data->y_only,
-                      lf_data->lf_sync, lf_data->num_lf_workers);
+                      lf_data->start, lf_data->stop, lf_data->y_only, lf_sync);
   return 1;
 }
 
 // VP9 decoder: Implement multi-threaded loopfilter that uses the tile
 // threads.
-void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
-                              VP9Decoder *pbi, VP9_COMMON *cm,
+void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,
+                              YV12_BUFFER_CONFIG *frame,
+                              struct macroblockd_plane planes[MAX_MB_PLANE],
+                              VP9_COMMON *cm,
+                              VP9Worker *workers, int nworkers,
                               int frame_filter_level,
                               int y_only) {
-  VP9LfSync *const lf_sync = &pbi->lf_row_sync;
   const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
   // Number of superblock rows and cols
   const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
   const int tile_cols = 1 << cm->log2_tile_cols;
-  const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);
+  const int num_workers = MIN(nworkers, tile_cols);
   int i;
 
   if (!frame_filter_level) return;
 
-  if (!lf_sync->sync_range || cm->last_height != cm->height) {
+  if (!lf_sync->sync_range || cm->last_height != cm->height ||
+      num_workers > lf_sync->num_workers) {
     vp9_loop_filter_dealloc(lf_sync);
-    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width);
+    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
   }
 
   vp9_loop_filter_frame_init(cm, frame_filter_level);
@@ -158,32 +158,26 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
   vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
 
   // Set up loopfilter thread data.
-  // The decoder is using num_workers instead of pbi->num_tile_workers
-  // because it has been observed that using more threads on the
-  // loopfilter, than there are tile columns in the frame will hurt
-  // performance on Android. This is because the system will only
-  // schedule the tile decode workers on cores equal to the number
-  // of tile columns. Then if the decoder tries to use more threads for the
-  // loopfilter, it will hurt performance because of contention. If the
-  // multithreading code changes in the future then the number of workers
-  // used by the loopfilter should be revisited.
+  // num_workers is capped at the number of tile columns because it has been
+  // observed that using more loopfilter threads than tile columns hurts
+  // performance on Android. This is because the system will only schedule the
+  // tile decode workers on cores equal to the number of tile columns. If the
+  // decoder then uses more threads for the loopfilter, contention hurts
+  // performance. If the multithreading code changes in the future then the
+  // number of workers used by the loopfilter should be revisited.
   for (i = 0; i < num_workers; ++i) {
-    VP9Worker *const worker = &pbi->tile_workers[i];
-    TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
-    LFWorkerData *const lf_data = &tile_data->lfdata;
+    VP9Worker *const worker = &workers[i];
+    LFWorkerData *const lf_data = &lf_sync->lfdata[i];
 
     worker->hook = (VP9WorkerHook)loop_filter_row_worker;
+    worker->data1 = lf_sync;
+    worker->data2 = lf_data;
 
     // Loopfilter data
-    lf_data->frame_buffer = frame;
-    lf_data->cm = cm;
-    vp9_copy(lf_data->planes, pbi->mb.plane);
+    vp9_loop_filter_data_reset(lf_data, frame, cm, planes);
     lf_data->start = i;
     lf_data->stop = sb_rows;
-    lf_data->y_only = y_only;   // always do all planes in decoder
-
-    lf_data->lf_sync = lf_sync;
-    lf_data->num_lf_workers = num_workers;
+    lf_data->y_only = y_only;
 
     // Start loopfiltering
     if (i == num_workers - 1) {
@@ -195,7 +189,7 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
 
   // Wait till all rows are finished
   for (i = 0; i < num_workers; ++i) {
-    winterface->sync(&pbi->tile_workers[i]);
+    winterface->sync(&workers[i]);
   }
 }
 
@@ -215,7 +209,7 @@ static int get_sync_range(int width) {
 
 // Allocate memory for lf row synchronization
 void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
-                           int width) {
+                           int width, int num_workers) {
   lf_sync->rows = rows;
 #if CONFIG_MULTITHREAD
   {
@@ -239,6 +233,10 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
   }
 #endif  // CONFIG_MULTITHREAD
 
+  CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+                  vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));
+  lf_sync->num_workers = num_workers;
+
   CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
                   vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
 
@@ -265,6 +263,7 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
       vpx_free(lf_sync->cond_);
     }
 #endif  // CONFIG_MULTITHREAD
+    vpx_free(lf_sync->lfdata);
     vpx_free(lf_sync->cur_sb_col);
     // clear the structure as the source of this call may be a resize in which
     // case this call will be followed by an _alloc() which may fail.
diff --git a/vp9/decoder/vp9_dthread.h b/vp9/decoder/vp9_dthread.h
index b1fbdeb74..d5810b45b 100644
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/decoder/vp9_dthread.h
@@ -22,9 +22,6 @@ typedef struct TileWorkerData {
   struct VP9Common *cm;
   vp9_reader bit_reader;
   DECLARE_ALIGNED(16, struct macroblockd, xd);
-
-  // Row-based parallel loopfilter data
-  LFWorkerData lfdata;
 } TileWorkerData;
 
 // Loopfilter row synchronization
@@ -39,19 +36,25 @@ typedef struct VP9LfSyncData {
   // determined by testing. Currently, it is chosen to be a power-of-2 number.
   int sync_range;
   int rows;
+
+  // Row-based parallel loopfilter data
+  LFWorkerData *lfdata;
+  int num_workers;
 } VP9LfSync;
 
 // Allocate memory for loopfilter row synchronization.
 void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
-                           int width);
+                           int width, int num_workers);
 
 // Deallocate loopfilter synchronization related mutex and data.
 void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);
 
 // Multi-threaded loopfilter that uses the tile threads.
-void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
-                              struct VP9Decoder *pbi,
+void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,
+                              YV12_BUFFER_CONFIG *frame,
+                              struct macroblockd_plane planes[MAX_MB_PLANE],
                               struct VP9Common *cm,
+                              VP9Worker *workers, int num_workers,
                               int frame_filter_level,
                               int y_only);
 
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 421e04969..7cfd14307 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -84,7 +84,7 @@ static void write_selected_tx_size(const VP9_COMMON *cm,
                                    vp9_writer *w) {
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
   const vp9_prob *const tx_probs = get_tx_probs2(max_tx_size, xd,
-                                                 &cm->fc.tx_probs);
+                                                 &cm->fc->tx_probs);
   vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
   if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
     vp9_write(w, tx_size != TX_8X8, tx_probs[1]);
@@ -108,14 +108,14 @@ static void update_skip_probs(VP9_COMMON *cm, vp9_writer *w) {
   int k;
 
   for (k = 0; k < SKIP_CONTEXTS; ++k)
-    vp9_cond_prob_diff_update(w, &cm->fc.skip_probs[k], cm->counts.skip[k]);
+    vp9_cond_prob_diff_update(w, &cm->fc->skip_probs[k], cm->counts.skip[k]);
 }
 
 static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w) {
   int j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
     prob_diff_update(vp9_switchable_interp_tree,
-                     cm->fc.switchable_interp_prob[j],
+                     cm->fc->switchable_interp_prob[j],
                      cm->counts.switchable_interp[j], SWITCHABLE_FILTERS, w);
 }
 
@@ -237,7 +237,7 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd,
 static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
                                 vp9_writer *w) {
   VP9_COMMON *const cm = &cpi->common;
-  const nmv_context *nmvc = &cm->fc.nmvc;
+  const nmv_context *nmvc = &cm->fc->nmvc;
   const MACROBLOCK *const x = &cpi->mb;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct segmentation *const seg = &cm->seg;
@@ -275,7 +275,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
 
   if (!is_inter) {
     if (bsize >= BLOCK_8X8) {
-      write_intra_mode(w, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]);
+      write_intra_mode(w, mode, cm->fc->y_mode_prob[size_group_lookup[bsize]]);
     } else {
       int idx, idy;
       const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
@@ -283,14 +283,14 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
       for (idy = 0; idy < 2; idy += num_4x4_h) {
         for (idx = 0; idx < 2; idx += num_4x4_w) {
           const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode;
-          write_intra_mode(w, b_mode, cm->fc.y_mode_prob[0]);
+          write_intra_mode(w, b_mode, cm->fc->y_mode_prob[0]);
         }
       }
     }
-    write_intra_mode(w, mbmi->uv_mode, cm->fc.uv_mode_prob[mode]);
+    write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mode]);
   } else {
     const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
-    const vp9_prob *const inter_probs = cm->fc.inter_mode_probs[mode_ctx];
+    const vp9_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
     write_ref_frames(cm, xd, w);
 
     // If segment skip is not enabled code the mode.
@@ -304,7 +304,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
     if (cm->interp_filter == SWITCHABLE) {
       const int ctx = vp9_get_pred_context_switchable_interp(xd);
       vp9_write_token(w, vp9_switchable_interp_tree,
-                      cm->fc.switchable_interp_prob[ctx],
+                      cm->fc->switchable_interp_prob[ctx],
                       &switchable_interp_encodings[mbmi->interp_filter]);
       ++cpi->interp_filter_selected[0][mbmi->interp_filter];
     } else {
@@ -528,7 +528,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
                                      TX_SIZE tx_size,
                                      vp9_coeff_stats *frame_branch_ct,
                                      vp9_coeff_probs_model *new_coef_probs) {
-  vp9_coeff_probs_model *old_coef_probs = cpi->common.fc.coef_probs[tx_size];
+  vp9_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size];
   const vp9_prob upd = DIFF_UPDATE_PROB;
   const int entropy_nodes_update = UNCONSTRAINED_NODES;
   int i, j, k, l, t;
@@ -830,20 +830,20 @@ static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w) {
     for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
       tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], ct_8x8p);
       for (j = 0; j < TX_SIZES - 3; j++)
-        vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], ct_8x8p[j]);
+        vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p8x8[i][j], ct_8x8p[j]);
     }
 
     for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
       tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], ct_16x16p);
       for (j = 0; j < TX_SIZES - 2; j++)
-        vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j],
+        vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p16x16[i][j],
                                   ct_16x16p[j]);
     }
 
     for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
       tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p);
       for (j = 0; j < TX_SIZES - 1; j++)
-        vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j],
+        vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p32x32[i][j],
                                   ct_32x32p[j]);
     }
   }
@@ -929,13 +929,11 @@ static int get_refresh_mask(VP9_COMP *cpi) {
 static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
   VP9_COMMON *const cm = &cpi->common;
   vp9_writer residual_bc;
-
   int tile_row, tile_col;
   TOKENEXTRA *tok[4][1 << 6], *tok_end;
   size_t total_size = 0;
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
-  TileInfo tile[4][1 << 6];
   TOKENEXTRA *pre_tok = cpi->tok;
   int tile_tok = 0;
 
@@ -944,18 +942,16 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
 
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-      vp9_tile_init(&tile[tile_row][tile_col], cm, tile_row, tile_col);
-
+      int tile_idx = tile_row * tile_cols + tile_col;
       tok[tile_row][tile_col] = pre_tok + tile_tok;
       pre_tok = tok[tile_row][tile_col];
-      tile_tok = allocated_tokens(tile[tile_row][tile_col]);
+      tile_tok = allocated_tokens(cpi->tile_data[tile_idx].tile_info);
     }
   }
 
   for (tile_row = 0; tile_row < tile_rows; tile_row++) {
     for (tile_col = 0; tile_col < tile_cols; tile_col++) {
-      const TileInfo * const ptile = &tile[tile_row][tile_col];
-
+      int tile_idx = tile_row * tile_cols + tile_col;
       tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col];
 
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
@@ -963,7 +959,8 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
       else
         vp9_start_encode(&residual_bc, data_ptr + total_size);
 
-      write_modes(cpi, ptile, &residual_bc, &tok[tile_row][tile_col], tok_end);
+      write_modes(cpi, &cpi->tile_data[tile_idx].tile_info,
+                  &residual_bc, &tok[tile_row][tile_col], tok_end);
       assert(tok[tile_row][tile_col] == tok_end);
       vp9_stop_encode(&residual_bc);
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
@@ -1161,7 +1158,7 @@ static void write_uncompressed_header(VP9_COMP *cpi,
 static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  FRAME_CONTEXT *const fc = &cm->fc;
+  FRAME_CONTEXT *const fc = cm->fc;
   vp9_writer header_bc;
 
   vp9_start_encode(&header_bc, data);
@@ -1178,7 +1175,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
     int i;
 
     for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
-      prob_diff_update(vp9_inter_mode_tree, cm->fc.inter_mode_probs[i],
+      prob_diff_update(vp9_inter_mode_tree, cm->fc->inter_mode_probs[i],
                        cm->counts.inter_mode[i], INTER_MODES, &header_bc);
 
     vp9_zero(cm->counts.inter_mode);
@@ -1219,7 +1216,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
                                   cm->counts.comp_ref[i]);
 
     for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
-      prob_diff_update(vp9_intra_mode_tree, cm->fc.y_mode_prob[i],
+      prob_diff_update(vp9_intra_mode_tree, cm->fc->y_mode_prob[i],
                        cm->counts.y_mode[i], INTRA_MODES, &header_bc);
 
     for (i = 0; i < PARTITION_CONTEXTS; ++i)
diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h
index 6b28ee591..47d9580a8 100644
--- a/vp9/encoder/vp9_context_tree.h
+++ b/vp9/encoder/vp9_context_tree.h
@@ -34,6 +34,7 @@ typedef struct {
   int is_coded;
   int num_4x4_blk;
   int skip;
+  int pred_pixel_ready;
   // For current partition, only if all Y, U, and V transform blocks'
   // coefficients are quantized to 0, skippable is set to 0.
   int skippable;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index f4e71aeb3..baa4908d4 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -467,7 +467,6 @@ static void choose_partitioning(VP9_COMP *cpi,
   int sp;
   int dp;
   int pixels_wide = 64, pixels_high = 64;
-  int_mv nearest_mv, near_mv;
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
   const struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
 
@@ -488,11 +487,7 @@ static void choose_partitioning(VP9_COMP *cpi,
 
     xd->mi[0].src_mi->mbmi.ref_frame[0] = LAST_FRAME;
     xd->mi[0].src_mi->mbmi.sb_type = BLOCK_64X64;
-    vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv,
-                          xd->mi[0].src_mi->mbmi.ref_mvs[LAST_FRAME],
-                          &nearest_mv, &near_mv);
-
-    xd->mi[0].src_mi->mbmi.mv[0] = nearest_mv;
+    xd->mi[0].src_mi->mbmi.mv[0].as_int = 0;
     vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_64X64);
 
     d = xd->plane[0].dst.buf;
@@ -750,8 +745,8 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
                      x->e_mbd.plane[i].subsampling_y);
 }
 
-static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, int *rate,
-                                   int64_t *dist, BLOCK_SIZE bsize) {
+static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
+                                   RD_COST *rd_cost, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
   INTERP_FILTER filter_ref;
@@ -777,15 +772,16 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, int *rate,
   xd->mi[0].src_mi->bmi[0].as_mv[0].as_int = 0;
   x->skip = 1;
 
-  *rate = 0;
-  *dist = 0;
+  vp9_rd_cost_init(rd_cost);
 }
 
-static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
+static void rd_pick_sb_modes(VP9_COMP *cpi,
+                             TileDataEnc *tile_data,
                              int mi_row, int mi_col, RD_COST *rd_cost,
                              BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
                              int64_t best_rd) {
   VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi;
@@ -801,7 +797,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
   // Use the lower precision, but faster, 32x32 fdct for mode selection.
   x->use_lp32x32fdct = 1;
 
-  set_offsets(cpi, tile, mi_row, mi_col, bsize);
+  set_offsets(cpi, tile_info, mi_row, mi_col, bsize);
   mbmi = &xd->mi[0].src_mi->mbmi;
   mbmi->sb_type = bsize;
 
@@ -813,6 +809,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
   }
   ctx->is_coded = 0;
   ctx->skippable = 0;
+  ctx->pred_pixel_ready = 0;
   x->skip_recode = 0;
 
   // Set to zero to make sure we do not use the previous encoded frame stats
@@ -872,14 +869,14 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
   } else {
     if (bsize >= BLOCK_8X8) {
       if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
-        vp9_rd_pick_inter_mode_sb_seg_skip(cpi, x, rd_cost, bsize,
+        vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
                                            ctx, best_rd);
       else
-        vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col,
+        vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col,
                                   rd_cost, bsize, ctx, best_rd);
     } else {
-      vp9_rd_pick_inter_mode_sub8x8(cpi, x, tile, mi_row, mi_col, rd_cost,
-                                    bsize, ctx, best_rd);
+      vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col,
+                                    rd_cost, bsize, ctx, best_rd);
     }
   }
 
@@ -1161,79 +1158,6 @@ static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
   }
 }
 
-static void copy_partitioning(VP9_COMMON *cm, MODE_INFO *mi_8x8,
-  MODE_INFO *prev_mi_8x8) {
-  const int mis = cm->mi_stride;
-  int block_row, block_col;
-
-  for (block_row = 0; block_row < 8; ++block_row) {
-    for (block_col = 0; block_col < 8; ++block_col) {
-      MODE_INFO *const prev_mi =
-          prev_mi_8x8[block_row * mis + block_col].src_mi;
-      const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
-
-      if (prev_mi) {
-        const ptrdiff_t offset = prev_mi - cm->prev_mi;
-        mi_8x8[block_row * mis + block_col].src_mi = cm->mi + offset;
-        mi_8x8[block_row * mis + block_col].src_mi->mbmi.sb_type = sb_type;
-      }
-    }
-  }
-}
-
-static void constrain_copy_partitioning(VP9_COMP *const cpi,
-                                        const TileInfo *const tile,
-                                        MODE_INFO *mi_8x8,
-                                        MODE_INFO *prev_mi_8x8,
-                                        int mi_row, int mi_col,
-                                        BLOCK_SIZE bsize) {
-  VP9_COMMON *const cm = &cpi->common;
-  const int mis = cm->mi_stride;
-  const int row8x8_remaining = tile->mi_row_end - mi_row;
-  const int col8x8_remaining = tile->mi_col_end - mi_col;
-  MODE_INFO *const mi_upper_left = cm->mi + mi_row * mis + mi_col;
-  const int bh = num_8x8_blocks_high_lookup[bsize];
-  const int bw = num_8x8_blocks_wide_lookup[bsize];
-  int block_row, block_col;
-
-  assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
-
-  // If the SB64 if it is all "in image".
-  if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
-      (row8x8_remaining >= MI_BLOCK_SIZE)) {
-    for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
-      for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
-        const int index = block_row * mis + block_col;
-        MODE_INFO *prev_mi = prev_mi_8x8[index].src_mi;
-        const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
-        // Use previous partition if block size is not larger than bsize.
-        if (prev_mi && sb_type <= bsize) {
-          int block_row2, block_col2;
-          for (block_row2 = 0; block_row2 < bh; ++block_row2) {
-            for (block_col2 = 0; block_col2 < bw; ++block_col2) {
-              const int index2 = (block_row + block_row2) * mis +
-                  block_col + block_col2;
-              prev_mi = prev_mi_8x8[index2].src_mi;
-              if (prev_mi) {
-                const ptrdiff_t offset = prev_mi - cm->prev_mi;
-                mi_8x8[index2].src_mi = cm->mi + offset;
-                mi_8x8[index2].src_mi->mbmi.sb_type = prev_mi->mbmi.sb_type;
-              }
-            }
-          }
-        } else {
-          // Otherwise, use fixed partition of size bsize.
-          mi_8x8[index].src_mi = mi_upper_left + index;
-          mi_8x8[index].src_mi->mbmi.sb_type = bsize;
-        }
-      }
-    }
-  } else {
-    // Else this is a partial SB64, copy previous partition.
-    copy_partitioning(cm, mi_8x8, prev_mi_8x8);
-  }
-}
-
 const struct {
   int row;
   int col;
@@ -1364,27 +1288,6 @@ static int is_background(const VP9_COMP *cpi, const TileInfo *const tile,
   return this_sad < 2 * threshold;
 }
 
-static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO *prev_mi_8x8,
-                         const int motion_thresh) {
-  const int mis = cm->mi_stride;
-  int block_row, block_col;
-
-  if (cm->prev_mi) {
-    for (block_row = 0; block_row < 8; ++block_row) {
-      for (block_col = 0; block_col < 8; ++block_col) {
-        const MODE_INFO *prev_mi =
-            prev_mi_8x8[block_row * mis + block_col].src_mi;
-        if (prev_mi) {
-          if (abs(prev_mi->mbmi.mv[0].as_mv.row) > motion_thresh ||
-              abs(prev_mi->mbmi.mv[0].as_mv.col) > motion_thresh)
-            return 1;
-        }
-      }
-    }
-  }
-  return 0;
-}
-
 static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
                             int mi_row, int mi_col, int bsize) {
   VP9_COMMON *const cm = &cpi->common;
@@ -1516,12 +1419,15 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
 
-static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
+static void rd_use_partition(VP9_COMP *cpi,
+                             TileDataEnc *tile_data,
                              MODE_INFO *mi_8x8, TOKENEXTRA **tp,
                              int mi_row, int mi_col,
-                             BLOCK_SIZE bsize, int *rate, int64_t *dist,
+                             BLOCK_SIZE bsize,
+                             int *rate, int64_t *dist,
                              int do_recon, PC_TREE *pc_tree) {
   VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int mis = cm->mi_stride;
@@ -1557,7 +1463,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
   save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
   if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) {
-    set_offsets(cpi, tile, mi_row, mi_col, bsize);
+    set_offsets(cpi, tile_info, mi_row, mi_col, bsize);
     x->mb_energy = vp9_block_energy(cpi, x, bsize);
   }
 
@@ -1583,7 +1489,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
         mi_row + (mi_step >> 1) < cm->mi_rows &&
         mi_col + (mi_step >> 1) < cm->mi_cols) {
       pc_tree->partitioning = PARTITION_NONE;
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rdc, bsize,
+      rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &none_rdc, bsize,
                        ctx, INT64_MAX);
 
       pl = partition_plane_context(xd, mi_row, mi_col, bsize);
@@ -1602,11 +1508,11 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
 
   switch (partition) {
     case PARTITION_NONE:
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc,
+      rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &last_part_rdc,
                        bsize, ctx, INT64_MAX);
       break;
     case PARTITION_HORZ:
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc,
+      rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &last_part_rdc,
                        subsize, &pc_tree->horizontal[0],
                        INT64_MAX);
       if (last_part_rdc.rate != INT_MAX &&
@@ -1616,7 +1522,8 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
         vp9_rd_cost_init(&tmp_rdc);
         update_state(cpi, ctx, mi_row, mi_col, subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx);
-        rd_pick_sb_modes(cpi, tile, mi_row + (mi_step >> 1), mi_col, &tmp_rdc,
+        rd_pick_sb_modes(cpi, tile_data,
+                         mi_row + (mi_step >> 1), mi_col, &tmp_rdc,
                          subsize, &pc_tree->horizontal[1], INT64_MAX);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           vp9_rd_cost_reset(&last_part_rdc);
@@ -1628,7 +1535,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
       }
       break;
     case PARTITION_VERT:
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc,
+      rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &last_part_rdc,
                        subsize, &pc_tree->vertical[0], INT64_MAX);
       if (last_part_rdc.rate != INT_MAX &&
           bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) {
@@ -1637,7 +1544,8 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
         vp9_rd_cost_init(&tmp_rdc);
         update_state(cpi, ctx, mi_row, mi_col, subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx);
-        rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
+        rd_pick_sb_modes(cpi, tile_data,
+                         mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
                          subsize, &pc_tree->vertical[bsize > BLOCK_8X8],
                          INT64_MAX);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
@@ -1651,7 +1559,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
       break;
     case PARTITION_SPLIT:
       if (bsize == BLOCK_8X8) {
-        rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc,
+        rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &last_part_rdc,
                          subsize, pc_tree->leaf_split[0], INT64_MAX);
         break;
       }
@@ -1667,7 +1575,8 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
           continue;
 
         vp9_rd_cost_init(&tmp_rdc);
-        rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp,
+        rd_use_partition(cpi, tile_data,
+                         mi_8x8 + jj * bss * mis + ii * bss, tp,
                          mi_row + y_idx, mi_col + x_idx, subsize,
                          &tmp_rdc.rate, &tmp_rdc.dist,
                          i != 3, pc_tree->split[i]);
@@ -1718,7 +1627,8 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
 
       save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
       pc_tree->split[i]->partitioning = PARTITION_NONE;
-      rd_pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+      rd_pick_sb_modes(cpi, tile_data,
+                       mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
                        split_subsize, &pc_tree->split[i]->none, INT64_MAX);
 
       restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -1732,7 +1642,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
       chosen_rdc.dist += tmp_rdc.dist;
 
       if (i != 3)
-        encode_sb(cpi, tile, tp,  mi_row + y_idx, mi_col + x_idx, 0,
+        encode_sb(cpi, tile_info, tp,  mi_row + y_idx, mi_col + x_idx, 0,
                   split_subsize, pc_tree->split[i]);
 
       pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
@@ -1782,7 +1692,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
       vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
                                               chosen_rdc.rate, chosen_rdc.dist);
-    encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize,
+    encode_sb(cpi, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
               pc_tree);
   }
 
@@ -2115,11 +2025,13 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previous rate-distortion optimization
 // results, for encoding speed-up.
-static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
+static void rd_pick_partition(VP9_COMP *cpi,
+                              TileDataEnc *tile_data,
                               TOKENEXTRA **tp, int mi_row, int mi_col,
                               BLOCK_SIZE bsize, RD_COST *rd_cost,
                               int64_t best_rd, PC_TREE *pc_tree) {
   VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2;
@@ -2162,7 +2074,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
   vp9_rd_cost_reset(&best_rdc);
   best_rdc.rdcost = best_rd;
 
-  set_offsets(cpi, tile, mi_row, mi_col, bsize);
+  set_offsets(cpi, tile_info, mi_row, mi_col, bsize);
 
   if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode)
     x->mb_energy = vp9_block_energy(cpi, x, bsize);
@@ -2194,7 +2106,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
 
 #if CONFIG_FP_MB_STATS
   if (cpi->use_fp_mb_stats) {
-    set_offsets(cpi, tile, mi_row, mi_col, bsize);
+    set_offsets(cpi, tile_info, mi_row, mi_col, bsize);
     src_diff_var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src,
                                                   mi_row, mi_col, bsize);
   }
@@ -2253,8 +2165,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
 
   // PARTITION_NONE
   if (partition_none_allowed) {
-    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rdc, bsize, ctx,
-                     best_rdc.rdcost);
+    rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col,
+                     &this_rdc, bsize, ctx, best_rdc.rdcost);
     if (this_rdc.rate != INT_MAX) {
       if (bsize >= BLOCK_8X8) {
         pl = partition_plane_context(xd, mi_row, mi_col, bsize);
@@ -2323,7 +2235,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
           }
           if (skip) {
             if (src_diff_var == UINT_MAX) {
-              set_offsets(cpi, tile, mi_row, mi_col, bsize);
+              set_offsets(cpi, tile_info, mi_row, mi_col, bsize);
               src_diff_var = get_sby_perpixel_diff_variance(
                   cpi, &cpi->mb.plane[0].src, mi_row, mi_col, bsize);
             }
@@ -2353,7 +2265,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
       if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
         pc_tree->leaf_split[0]->pred_interp_filter =
             ctx->mic.mbmi.interp_filter;
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rdc, subsize,
+      rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &sum_rdc, subsize,
                        pc_tree->leaf_split[0], best_rdc.rdcost);
       if (sum_rdc.rate == INT_MAX)
         sum_rdc.rdcost = INT64_MAX;
@@ -2369,7 +2281,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
           load_pred_mv(x, ctx);
 
         pc_tree->split[i]->index = i;
-        rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
+        rd_pick_partition(cpi, tile_data, tp,
+                          mi_row + y_idx, mi_col + x_idx,
                           subsize, &this_rdc,
                           best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
 
@@ -2412,7 +2325,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
         partition_none_allowed)
       pc_tree->horizontal[0].pred_interp_filter =
           ctx->mic.mbmi.interp_filter;
-    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rdc, subsize,
+    rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &sum_rdc, subsize,
                      &pc_tree->horizontal[0], best_rdc.rdcost);
 
     if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows &&
@@ -2427,8 +2340,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
           partition_none_allowed)
         pc_tree->horizontal[1].pred_interp_filter =
             ctx->mic.mbmi.interp_filter;
-      rd_pick_sb_modes(cpi, tile, mi_row + mi_step, mi_col, &this_rdc,
-                       subsize, &pc_tree->horizontal[1],
+      rd_pick_sb_modes(cpi, tile_data, mi_row + mi_step, mi_col,
+                       &this_rdc, subsize, &pc_tree->horizontal[1],
                        best_rdc.rdcost - sum_rdc.rdcost);
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
@@ -2460,7 +2373,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
         partition_none_allowed)
       pc_tree->vertical[0].pred_interp_filter =
           ctx->mic.mbmi.interp_filter;
-    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rdc, subsize,
+    rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &sum_rdc, subsize,
                      &pc_tree->vertical[0], best_rdc.rdcost);
     if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols &&
         bsize > BLOCK_8X8) {
@@ -2474,7 +2387,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
           partition_none_allowed)
         pc_tree->vertical[1].pred_interp_filter =
             ctx->mic.mbmi.interp_filter;
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col + mi_step, &this_rdc, subsize,
+      rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col + mi_step,
+                       &this_rdc, subsize,
                        &pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost);
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
@@ -2520,7 +2434,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
       vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
                                               best_rdc.rate, best_rdc.dist);
 
-    encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree);
+    encode_sb(cpi, tile_info, tp, mi_row, mi_col, output_enabled,
+              bsize, pc_tree);
   }
 
   if (bsize == BLOCK_64X64) {
@@ -2532,9 +2447,12 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
   }
 }
 
-static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
-                             int mi_row, TOKENEXTRA **tp) {
+static void encode_rd_sb_row(VP9_COMP *cpi,
+                             TileDataEnc *tile_data,
+                             int mi_row,
+                             TOKENEXTRA **tp) {
   VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   SPEED_FEATURES *const sf = &cpi->sf;
   int mi_col;
@@ -2544,7 +2462,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
   vpx_memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
 
   // Code each SB in the row
-  for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+  for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
        mi_col += MI_BLOCK_SIZE) {
     int dummy_rate;
     int64_t dummy_dist;
@@ -2553,10 +2471,6 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
 
     const int idx_str = cm->mi_stride * mi_row + mi_col;
     MODE_INFO *mi = cm->mi + idx_str;
-    MODE_INFO *prev_mi = NULL;
-
-    if (cm->frame_type != KEY_FRAME)
-      prev_mi = (cm->prev_mip + cm->mi_stride + 1 + idx_str)->src_mi;
 
     if (sf->adaptive_pred_interp_filter) {
       for (i = 0; i < 64; ++i)
@@ -2573,56 +2487,34 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
     vp9_zero(cpi->mb.pred_mv);
     cpi->pc_root->index = 0;
 
-    // TODO(yunqingwang): use_lastframe_partitioning is no longer used in good-
-    // quality encoding. Need to evaluate it in real-time encoding later to
-    // decide if it can be removed too. And then, do the code cleanup.
     cpi->mb.source_variance = UINT_MAX;
     if (sf->partition_search_type == FIXED_PARTITION) {
-      set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
-      set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col,
+      set_offsets(cpi, tile_info, mi_row, mi_col, BLOCK_64X64);
+      set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col,
                              sf->always_this_block_size);
-      rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                       &dummy_rate, &dummy_dist, 1, cpi->pc_root);
+      rd_use_partition(cpi, tile_data, mi, tp, mi_row, mi_col,
+                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, cpi->pc_root);
     } else if (cpi->partition_search_skippable_frame) {
       BLOCK_SIZE bsize;
-      set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+      set_offsets(cpi, tile_info, mi_row, mi_col, BLOCK_64X64);
       bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col);
-      set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize);
-      rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                       &dummy_rate, &dummy_dist, 1, cpi->pc_root);
+      set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+      rd_use_partition(cpi, tile_data, mi, tp, mi_row, mi_col,
+                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, cpi->pc_root);
     } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
                cm->frame_type != KEY_FRAME ) {
-      choose_partitioning(cpi, tile, mi_row, mi_col);
-      rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                       &dummy_rate, &dummy_dist, 1, cpi->pc_root);
-    } else if (sf->partition_search_type == SEARCH_PARTITION &&
-               sf->use_lastframe_partitioning &&
-               (cpi->rc.frames_since_key %
-                   sf->last_partitioning_redo_frequency) &&
-               cm->prev_mi &&
-               cm->show_frame &&
-               cm->frame_type != KEY_FRAME &&
-               !cpi->rc.is_src_frame_alt_ref &&
-               ((sf->use_lastframe_partitioning !=
-                   LAST_FRAME_PARTITION_LOW_MOTION) ||
-                   !sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))) {
-      if (sf->constrain_copy_partition &&
-          sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))
-        constrain_copy_partitioning(cpi, tile, mi, prev_mi,
-                                    mi_row, mi_col, BLOCK_16X16);
-      else
-        copy_partitioning(cm, mi, prev_mi);
-      rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                       &dummy_rate, &dummy_dist, 1, cpi->pc_root);
+      choose_partitioning(cpi, tile_info, mi_row, mi_col);
+      rd_use_partition(cpi, tile_data, mi, tp, mi_row, mi_col,
+                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, cpi->pc_root);
     } else {
       // If required set upper and lower partition size limits
       if (sf->auto_min_max_partition_size) {
-        set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
-        rd_auto_partition_range(cpi, tile, mi_row, mi_col,
+        set_offsets(cpi, tile_info, mi_row, mi_col, BLOCK_64X64);
+        rd_auto_partition_range(cpi, tile_info, mi_row, mi_col,
                                 &sf->min_partition_size,
                                 &sf->max_partition_size);
       }
-      rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
+      rd_pick_partition(cpi, tile_data, tp, mi_row, mi_col, BLOCK_64X64,
                         &dummy_rdc, INT64_MAX, cpi->pc_root);
     }
   }
@@ -2695,15 +2587,16 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi) {
     return cpi->common.tx_mode;
 }
 
-static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
-                                int mi_row, int mi_col,
-                                int *rate, int64_t *dist,
+static void nonrd_pick_sb_modes(VP9_COMP *cpi,
+                                TileDataEnc *tile_data,
+                                int mi_row, int mi_col, RD_COST *rd_cost,
                                 BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi;
-  set_offsets(cpi, tile, mi_row, mi_col, bsize);
+  set_offsets(cpi, tile_info, mi_row, mi_col, bsize);
   mbmi = &xd->mi[0].src_mi->mbmi;
   mbmi->sb_type = bsize;
 
@@ -2712,11 +2605,15 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
       x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
 
   if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
-    set_mode_info_seg_skip(x, cm->tx_mode, rate, dist, bsize);
+    set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize);
   else
-    vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col, rate, dist, bsize, ctx);
+    vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col,
+                        rd_cost, bsize, ctx);
 
   duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+
+  if (rd_cost->rate == INT_MAX)
+    vp9_rd_cost_reset(rd_cost);
 }
 
 static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x,
@@ -2776,14 +2673,16 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x,
   }
 }
 
-static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
+static void nonrd_pick_partition(VP9_COMP *cpi,
+                                 TileDataEnc *tile_data,
                                  TOKENEXTRA **tp, int mi_row,
-                                 int mi_col, BLOCK_SIZE bsize, int *rate,
-                                 int64_t *dist, int do_recon, int64_t best_rd,
+                                 int mi_col, BLOCK_SIZE bsize, RD_COST *rd_cost,
+                                 int do_recon, int64_t best_rd,
                                  PC_TREE *pc_tree) {
   const SPEED_FEATURES *const sf = &cpi->sf;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int ms = num_8x8_blocks_wide_lookup[bsize] / 2;
@@ -2791,9 +2690,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
   PICK_MODE_CONTEXT *ctx = &pc_tree->none;
   int i;
   BLOCK_SIZE subsize = bsize;
-  int this_rate, sum_rate = 0, best_rate = INT_MAX;
-  int64_t this_dist, sum_dist = 0, best_dist = INT64_MAX;
-  int64_t sum_rd = 0;
+  RD_COST this_rdc, sum_rdc, best_rdc;
   int do_split = bsize >= BLOCK_8X8;
   int do_rect = 1;
   // Override skipping rectangular partition operations for edge blocks
@@ -2812,6 +2709,10 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
   assert(num_8x8_blocks_wide_lookup[bsize] ==
              num_8x8_blocks_high_lookup[bsize]);
 
+  vp9_rd_cost_init(&sum_rdc);
+  vp9_rd_cost_reset(&best_rdc);
+  best_rdc.rdcost = best_rd;
+
   // Determine partition types in search according to the speed features.
   // The threshold set here has to be of square block size.
   if (sf->auto_min_max_partition_size) {
@@ -2832,17 +2733,19 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
 
   // PARTITION_NONE
   if (partition_none_allowed) {
-    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
-                        &this_rate, &this_dist, bsize, ctx);
+    nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col,
+                        &this_rdc, bsize, ctx);
     ctx->mic.mbmi = xd->mi[0].src_mi->mbmi;
     ctx->skip_txfm[0] = x->skip_txfm[0];
     ctx->skip = x->skip;
+    ctx->pred_pixel_ready = 0;
 
-    if (this_rate != INT_MAX) {
+    if (this_rdc.rate != INT_MAX) {
       int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-      this_rate += cpi->partition_cost[pl][PARTITION_NONE];
-      sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
-      if (sum_rd < best_rd) {
+      this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+      this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                              this_rdc.rate, this_rdc.dist);
+      if (this_rdc.rdcost < best_rdc.rdcost) {
         int64_t dist_breakout_thr = sf->partition_search_breakout_dist_thr;
         int64_t rate_breakout_thr = sf->partition_search_breakout_rate_thr;
 
@@ -2851,15 +2754,13 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
 
         rate_breakout_thr *= num_pels_log2_lookup[bsize];
 
-        best_rate = this_rate;
-        best_dist = this_dist;
-        best_rd = sum_rd;
+        best_rdc = this_rdc;
         if (bsize >= BLOCK_8X8)
           pc_tree->partitioning = PARTITION_NONE;
 
         if (!x->e_mbd.lossless &&
-            this_rate < rate_breakout_thr &&
-            this_dist < dist_breakout_thr) {
+            this_rdc.rate < rate_breakout_thr &&
+            this_rdc.dist < dist_breakout_thr) {
           do_split = 0;
           do_rect = 0;
         }
@@ -2871,35 +2772,34 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
   store_pred_mv(x, ctx);
 
   // PARTITION_SPLIT
-  sum_rd = 0;
   if (do_split) {
     int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-    sum_rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+    sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+    sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
     subsize = get_subsize(bsize, PARTITION_SPLIT);
-    for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
+    for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
       const int x_idx = (i & 1) * ms;
       const int y_idx = (i >> 1) * ms;
 
       if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
         continue;
       load_pred_mv(x, ctx);
-      nonrd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
-                           subsize, &this_rate, &this_dist, 0,
-                           best_rd - sum_rd, pc_tree->split[i]);
+      nonrd_pick_partition(cpi, tile_data, tp,
+                           mi_row + y_idx, mi_col + x_idx,
+                           subsize, &this_rdc, 0,
+                           best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
 
-      if (this_rate == INT_MAX) {
-        sum_rd = INT64_MAX;
+      if (this_rdc.rate == INT_MAX) {
+        vp9_rd_cost_reset(&sum_rdc);
       } else {
-        sum_rate += this_rate;
-        sum_dist += this_dist;
-        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
       }
     }
 
-    if (sum_rd < best_rd) {
-      best_rate = sum_rate;
-      best_dist = sum_dist;
-      best_rd = sum_rd;
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
       pc_tree->partitioning = PARTITION_SPLIT;
     } else {
       // skip rectangular partition test when larger block size
@@ -2915,40 +2815,39 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
     if (sf->adaptive_motion_search)
       load_pred_mv(x, ctx);
 
-    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
-                        &this_rate, &this_dist, subsize,
+    nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &sum_rdc, subsize,
                         &pc_tree->horizontal[0]);
 
     pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
     pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
     pc_tree->horizontal[0].skip = x->skip;
+    pc_tree->horizontal[0].pred_pixel_ready = 0;
 
-    sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
-
-    if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) {
+    if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + ms < cm->mi_rows) {
       load_pred_mv(x, ctx);
-      nonrd_pick_sb_modes(cpi, tile, mi_row + ms, mi_col,
-                          &this_rate, &this_dist, subsize,
+      nonrd_pick_sb_modes(cpi, tile_data, mi_row + ms, mi_col,
+                          &this_rdc, subsize,
                           &pc_tree->horizontal[1]);
 
       pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
       pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->horizontal[1].skip = x->skip;
+      pc_tree->horizontal[1].pred_pixel_ready = 0;
 
-      if (this_rate == INT_MAX) {
-        sum_rd = INT64_MAX;
+      if (this_rdc.rate == INT_MAX) {
+        vp9_rd_cost_reset(&sum_rdc);
       } else {
         int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-        this_rate += cpi->partition_cost[pl][PARTITION_HORZ];
-        sum_rate += this_rate;
-        sum_dist += this_dist;
-        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+        this_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                sum_rdc.rate, sum_rdc.dist);
       }
     }
-    if (sum_rd < best_rd) {
-      best_rd = sum_rd;
-      best_rate = sum_rate;
-      best_dist = sum_dist;
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
       pc_tree->partitioning = PARTITION_HORZ;
     }
   }
@@ -2960,55 +2859,54 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
     if (sf->adaptive_motion_search)
       load_pred_mv(x, ctx);
 
-    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
-                        &this_rate, &this_dist, subsize,
+    nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &sum_rdc, subsize,
                         &pc_tree->vertical[0]);
     pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
     pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
     pc_tree->vertical[0].skip = x->skip;
-    sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
-    if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
+    pc_tree->vertical[0].pred_pixel_ready = 0;
+
+    if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + ms < cm->mi_cols) {
       load_pred_mv(x, ctx);
-      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms,
-                          &this_rate, &this_dist, subsize,
+      nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col + ms,
+                          &this_rdc, subsize,
                           &pc_tree->vertical[1]);
       pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
       pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->vertical[1].skip = x->skip;
-      if (this_rate == INT_MAX) {
-        sum_rd = INT64_MAX;
+      pc_tree->vertical[1].pred_pixel_ready = 0;
+
+      if (this_rdc.rate == INT_MAX) {
+        vp9_rd_cost_reset(&sum_rdc);
       } else {
         int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-        this_rate += cpi->partition_cost[pl][PARTITION_VERT];
-        sum_rate += this_rate;
-        sum_dist += this_dist;
-        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+        sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                sum_rdc.rate, sum_rdc.dist);
       }
     }
-    if (sum_rd < best_rd) {
-      best_rate = sum_rate;
-      best_dist = sum_dist;
-      best_rd = sum_rd;
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
       pc_tree->partitioning = PARTITION_VERT;
     }
   }
-  // TODO(JBB): The following line is here just to avoid a static warning
-  // that occurs because at this point we never again reuse best_rd
-  // despite setting it here.  The code should be refactored to avoid this.
-  (void) best_rd;
 
-  *rate = best_rate;
-  *dist = best_dist;
+  *rd_cost = best_rdc;
 
-  if (best_rate == INT_MAX)
+  if (best_rdc.rate == INT_MAX) {
+    vp9_rd_cost_reset(rd_cost);
     return;
+  }
 
   // update mode info array
   subsize = get_subsize(bsize, pc_tree->partitioning);
   fill_mode_info_sb(cm, x, mi_row, mi_col, bsize, subsize,
                     pc_tree);
 
-  if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) {
+  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && do_recon) {
     int output_enabled = (bsize == BLOCK_64X64);
 
     // Check the projected output rate for this SB against it's target
@@ -3016,33 +2914,165 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
     // closer to the target.
     if ((oxcf->aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
       vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled,
-                                    best_rate);
+                                    best_rdc.rate);
     }
 
     if (oxcf->aq_mode == CYCLIC_REFRESH_AQ)
       vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
-                                              best_rate, best_dist);
+                                              best_rdc.rate, best_rdc.dist);
 
-    encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree);
+    encode_sb_rt(cpi, tile_info, tp, mi_row, mi_col, output_enabled,
+                 bsize, pc_tree);
   }
 
   if (bsize == BLOCK_64X64) {
     assert(tp_orig < *tp);
-    assert(best_rate < INT_MAX);
-    assert(best_dist < INT64_MAX);
+    assert(best_rdc.rate < INT_MAX);
+    assert(best_rdc.dist < INT64_MAX);
   } else {
     assert(tp_orig == *tp);
   }
 }
 
+// Adds |part| to |total| unless either one is flagged invalid, i.e. has
+// rate == INT_MAX or dist == INT64_MAX.
+static void nonrd_accumulate_rdc(RD_COST *total, const RD_COST *part) {
+  if (part->rate != INT_MAX && part->dist != INT64_MAX &&
+      total->rate != INT_MAX && total->dist != INT64_MAX) {
+    total->rate += part->rate;
+    total->dist += part->dist;
+  }
+}
+
+// Picks modes for the block at (mi_row, mi_col), starting from the partition
+// recorded in |mi|. A partitioned 32x32 block whose recorded sb_type is at
+// least 16x16, and any partitioned 16x16 block, is searched again by
+// nonrd_pick_partition() after overwriting cpi->sf.{max,min}_partition_size;
+// every other block keeps the recorded partition. Rate and distortion are
+// summed into |rd_cost|; nothing is written to it when the block lies at or
+// beyond cm->mi_rows / cm->mi_cols. A 64x64 block is passed to encode_sb_rt()
+// at the end when |output_enabled| is set.
+static void nonrd_select_partition(VP9_COMP *cpi, TileDataEnc *tile_data,
+                                   MODE_INFO *mi, TOKENEXTRA **tp,
+                                   int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize, int output_enabled,
+                                   RD_COST *rd_cost, PC_TREE *pc_tree) {
+  VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+  const int mis = cm->mi_stride;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+  RD_COST this_rdc;
+
+  vp9_rd_cost_reset(&this_rdc);
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  subsize = (bsize >= BLOCK_8X8) ? mi[0].src_mi->mbmi.sb_type : BLOCK_4X4;
+  partition = partition_lookup[bsl][subsize];
+
+  if (bsize == BLOCK_32X32 && partition != PARTITION_NONE &&
+      subsize >= BLOCK_16X16) {
+    cpi->sf.max_partition_size = BLOCK_32X32;
+    cpi->sf.min_partition_size = BLOCK_8X8;
+    nonrd_pick_partition(cpi, tile_data, tp, mi_row, mi_col, bsize,
+                         rd_cost, 0, INT64_MAX, pc_tree);
+  } else if (bsize == BLOCK_16X16 && partition != PARTITION_NONE) {
+    cpi->sf.max_partition_size = BLOCK_16X16;
+    cpi->sf.min_partition_size = BLOCK_8X8;
+    nonrd_pick_partition(cpi, tile_data, tp, mi_row, mi_col, bsize,
+                         rd_cost, 0, INT64_MAX, pc_tree);
+  } else {
+    switch (partition) {
+      case PARTITION_NONE:
+        nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost,
+                            subsize, &pc_tree->none);
+        pc_tree->none.mic.mbmi = xd->mi[0].src_mi->mbmi;
+        pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
+        pc_tree->none.skip = x->skip;
+        pc_tree->none.pred_pixel_ready = 1;
+        break;
+      case PARTITION_VERT:
+        nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost,
+                            subsize, &pc_tree->vertical[0]);
+        pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
+        pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
+        pc_tree->vertical[0].skip = x->skip;
+        pc_tree->vertical[0].pred_pixel_ready = 1;
+        if (mi_col + hbs < cm->mi_cols) {
+          nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col + hbs,
+                              &this_rdc, subsize, &pc_tree->vertical[1]);
+          pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
+          pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
+          pc_tree->vertical[1].skip = x->skip;
+          pc_tree->vertical[1].pred_pixel_ready = 1;
+          nonrd_accumulate_rdc(rd_cost, &this_rdc);
+        }
+        break;
+      case PARTITION_HORZ:
+        nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost,
+                            subsize, &pc_tree->horizontal[0]);
+        pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
+        pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
+        pc_tree->horizontal[0].skip = x->skip;
+        pc_tree->horizontal[0].pred_pixel_ready = 1;
+        if (mi_row + hbs < cm->mi_rows) {
+          nonrd_pick_sb_modes(cpi, tile_data, mi_row + hbs, mi_col,
+                              &this_rdc, subsize, &pc_tree->horizontal[1]);
+          pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
+          pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
+          pc_tree->horizontal[1].skip = x->skip;
+          pc_tree->horizontal[1].pred_pixel_ready = 1;
+          nonrd_accumulate_rdc(rd_cost, &this_rdc);
+        }
+        break;
+      case PARTITION_SPLIT:
+        subsize = get_subsize(bsize, PARTITION_SPLIT);
+        nonrd_select_partition(cpi, tile_data, mi, tp, mi_row, mi_col,
+                               subsize, output_enabled, rd_cost,
+                               pc_tree->split[0]);
+        nonrd_select_partition(cpi, tile_data, mi + hbs, tp,
+                               mi_row, mi_col + hbs, subsize, output_enabled,
+                               &this_rdc, pc_tree->split[1]);
+        nonrd_accumulate_rdc(rd_cost, &this_rdc);
+        // A child outside the frame returns without writing this_rdc; reset
+        // it first so the previous child's cost is not added a second time.
+        vp9_rd_cost_reset(&this_rdc);
+        nonrd_select_partition(cpi, tile_data, mi + hbs * mis, tp,
+                               mi_row + hbs, mi_col, subsize, output_enabled,
+                               &this_rdc, pc_tree->split[2]);
+        nonrd_accumulate_rdc(rd_cost, &this_rdc);
+        vp9_rd_cost_reset(&this_rdc);
+        nonrd_select_partition(cpi, tile_data, mi + hbs * mis + hbs, tp,
+                               mi_row + hbs, mi_col + hbs, subsize,
+                               output_enabled, &this_rdc, pc_tree->split[3]);
+        nonrd_accumulate_rdc(rd_cost, &this_rdc);
+        break;
+      default:
+        assert(0 && "Invalid partition type.");
+        break;
+    }
+  }
+
+  if (bsize == BLOCK_64X64 && output_enabled) {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
+                                              rd_cost->rate, rd_cost->dist);
+    encode_sb_rt(cpi, tile_info, tp, mi_row, mi_col, 1, bsize, pc_tree);
+  }
+}
+
+
 static void nonrd_use_partition(VP9_COMP *cpi,
-                                const TileInfo *const tile,
+                                TileDataEnc *tile_data,
                                 MODE_INFO *mi,
                                 TOKENEXTRA **tp,
                                 int mi_row, int mi_col,
                                 BLOCK_SIZE bsize, int output_enabled,
-                                int *totrate, int64_t *totdist,
-                                PC_TREE *pc_tree) {
+                                RD_COST *rd_cost, PC_TREE *pc_tree) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -3050,9 +3080,9 @@ static void nonrd_use_partition(VP9_COMP *cpi,
   const int mis = cm->mi_stride;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
-  int rate = INT_MAX;
-  int64_t dist = INT64_MAX;
+  RD_COST this_rdc;
 
+  vp9_rd_cost_reset(&this_rdc);
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
@@ -3061,78 +3091,78 @@ static void nonrd_use_partition(VP9_COMP *cpi,
 
   switch (partition) {
     case PARTITION_NONE:
-      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist,
+      nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost,
                           subsize, &pc_tree->none);
       pc_tree->none.mic.mbmi = xd->mi[0].src_mi->mbmi;
       pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
       pc_tree->none.skip = x->skip;
       break;
     case PARTITION_VERT:
-      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist,
+      nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost,
                           subsize, &pc_tree->vertical[0]);
       pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
       pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->vertical[0].skip = x->skip;
       if (mi_col + hbs < cm->mi_cols) {
-        nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs,
-                            &rate, &dist, subsize, &pc_tree->vertical[1]);
+        nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col + hbs,
+                            &this_rdc, subsize, &pc_tree->vertical[1]);
         pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
         pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
         pc_tree->vertical[1].skip = x->skip;
-        if (rate != INT_MAX && dist != INT64_MAX &&
-            *totrate != INT_MAX && *totdist != INT64_MAX) {
-          *totrate += rate;
-          *totdist += dist;
+        if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+            rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+          rd_cost->rate += this_rdc.rate;
+          rd_cost->dist += this_rdc.dist;
         }
       }
       break;
     case PARTITION_HORZ:
-      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist,
+      nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost,
                           subsize, &pc_tree->horizontal[0]);
       pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
       pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->horizontal[0].skip = x->skip;
       if (mi_row + hbs < cm->mi_rows) {
-        nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col,
-                            &rate, &dist, subsize, &pc_tree->horizontal[0]);
+        nonrd_pick_sb_modes(cpi, tile_data, mi_row + hbs, mi_col,
+                            &this_rdc, subsize, &pc_tree->horizontal[0]);
         pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
         pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
         pc_tree->horizontal[1].skip = x->skip;
-        if (rate != INT_MAX && dist != INT64_MAX &&
-            *totrate != INT_MAX && *totdist != INT64_MAX) {
-          *totrate += rate;
-          *totdist += dist;
+        if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+            rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+          rd_cost->rate += this_rdc.rate;
+          rd_cost->dist += this_rdc.dist;
         }
       }
       break;
     case PARTITION_SPLIT:
       subsize = get_subsize(bsize, PARTITION_SPLIT);
-      nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col,
-                          subsize, output_enabled, totrate, totdist,
+      nonrd_use_partition(cpi, tile_data, mi, tp, mi_row, mi_col,
+                          subsize, output_enabled, rd_cost,
                           pc_tree->split[0]);
-      nonrd_use_partition(cpi, tile, mi + hbs, tp,
+      nonrd_use_partition(cpi, tile_data, mi + hbs, tp,
                           mi_row, mi_col + hbs, subsize, output_enabled,
-                          &rate, &dist, pc_tree->split[1]);
-      if (rate != INT_MAX && dist != INT64_MAX &&
-          *totrate != INT_MAX && *totdist != INT64_MAX) {
-        *totrate += rate;
-        *totdist += dist;
+                          &this_rdc, pc_tree->split[1]);
+      if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+          rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+        rd_cost->rate += this_rdc.rate;
+        rd_cost->dist += this_rdc.dist;
       }
-      nonrd_use_partition(cpi, tile, mi + hbs * mis, tp,
+      nonrd_use_partition(cpi, tile_data, mi + hbs * mis, tp,
                           mi_row + hbs, mi_col, subsize, output_enabled,
-                          &rate, &dist, pc_tree->split[2]);
-      if (rate != INT_MAX && dist != INT64_MAX &&
-          *totrate != INT_MAX && *totdist != INT64_MAX) {
-        *totrate += rate;
-        *totdist += dist;
+                          &this_rdc, pc_tree->split[2]);
+      if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+          rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+        rd_cost->rate += this_rdc.rate;
+        rd_cost->dist += this_rdc.dist;
       }
-      nonrd_use_partition(cpi, tile, mi + hbs * mis + hbs, tp,
+      nonrd_use_partition(cpi, tile_data, mi + hbs * mis + hbs, tp,
                           mi_row + hbs, mi_col + hbs, subsize, output_enabled,
-                          &rate, &dist, pc_tree->split[3]);
-      if (rate != INT_MAX && dist != INT64_MAX &&
-          *totrate != INT_MAX && *totdist != INT64_MAX) {
-        *totrate += rate;
-        *totdist += dist;
+                          &this_rdc, pc_tree->split[3]);
+      if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+          rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+        rd_cost->rate += this_rdc.rate;
+        rd_cost->dist += this_rdc.dist;
       }
       break;
     default:
@@ -3143,15 +3173,19 @@ static void nonrd_use_partition(VP9_COMP *cpi,
   if (bsize == BLOCK_64X64 && output_enabled) {
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
       vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
-                                              *totrate, *totdist);
-    encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, bsize, pc_tree);
+                                              rd_cost->rate, rd_cost->dist);
+    encode_sb_rt(cpi, &tile_data->tile_info, tp, mi_row, mi_col,
+                 1, bsize, pc_tree);
   }
 }
 
-static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
-                                int mi_row, TOKENEXTRA **tp) {
+static void encode_nonrd_sb_row(VP9_COMP *cpi,
+                                TileDataEnc *tile_data,
+                                int mi_row,
+                                TOKENEXTRA **tp) {
   SPEED_FEATURES *const sf = &cpi->sf;
   VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   int mi_col;
@@ -3161,53 +3195,55 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
   vpx_memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
 
   // Code each SB in the row
-  for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+  for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
        mi_col += MI_BLOCK_SIZE) {
-    int dummy_rate = 0;
-    int64_t dummy_dist = 0;
+    RD_COST dummy_rdc;
     const int idx_str = cm->mi_stride * mi_row + mi_col;
     MODE_INFO *mi = cm->mi + idx_str;
     BLOCK_SIZE bsize;
     x->in_static_area = 0;
     x->source_variance = UINT_MAX;
     vp9_zero(x->pred_mv);
+    vp9_rd_cost_init(&dummy_rdc);
 
     // Set the partition type of the 64X64 block
     switch (sf->partition_search_type) {
       case VAR_BASED_PARTITION:
-        choose_partitioning(cpi, tile, mi_row, mi_col);
-        nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                            1, &dummy_rate, &dummy_dist, cpi->pc_root);
+        choose_partitioning(cpi, tile_info, mi_row, mi_col);
+        nonrd_use_partition(cpi, tile_data, mi, tp, mi_row, mi_col,
+                            BLOCK_64X64, 1, &dummy_rdc, cpi->pc_root);
         break;
       case SOURCE_VAR_BASED_PARTITION:
-        set_source_var_based_partition(cpi, tile, mi, mi_row, mi_col);
-        nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                            1, &dummy_rate, &dummy_dist, cpi->pc_root);
+        set_source_var_based_partition(cpi, tile_info, mi, mi_row, mi_col);
+        nonrd_use_partition(cpi, tile_data, mi, tp, mi_row, mi_col,
+                            BLOCK_64X64, 1, &dummy_rdc, cpi->pc_root);
         break;
       case FIXED_PARTITION:
         bsize = sf->partition_search_type == FIXED_PARTITION ?
                 sf->always_this_block_size :
                 get_nonrd_var_based_fixed_partition(cpi, mi_row, mi_col);
-        set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize);
-        nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                            1, &dummy_rate, &dummy_dist, cpi->pc_root);
+        set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+        nonrd_use_partition(cpi, tile_data, mi, tp, mi_row, mi_col,
+                            BLOCK_64X64, 1, &dummy_rdc, cpi->pc_root);
         break;
       case REFERENCE_PARTITION:
-        if (sf->partition_check ||
-            !(x->in_static_area = is_background(cpi, tile, mi_row, mi_col))) {
-          set_modeinfo_offsets(cm, xd, mi_row, mi_col);
-          auto_partition_range(cpi, tile, mi_row, mi_col,
+        set_offsets(cpi, tile_info, mi_row, mi_col, BLOCK_64X64);
+        x->in_static_area = is_background(cpi, tile_info, mi_row, mi_col);
+
+        if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+            xd->mi[0].src_mi->mbmi.segment_id && x->in_static_area) {
+          auto_partition_range(cpi, tile_info, mi_row, mi_col,
                                &sf->min_partition_size,
                                &sf->max_partition_size);
-          nonrd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
-                               &dummy_rate, &dummy_dist, 1, INT64_MAX,
-                               cpi->pc_root);
+          nonrd_pick_partition(cpi, tile_data, tp, mi_row, mi_col,
+                               BLOCK_64X64, &dummy_rdc, 1,
+                               INT64_MAX, cpi->pc_root);
         } else {
-          choose_partitioning(cpi, tile, mi_row, mi_col);
-          nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col,
-                              BLOCK_64X64, 1, &dummy_rate, &dummy_dist,
-                              cpi->pc_root);
+          choose_partitioning(cpi, tile_info, mi_row, mi_col);
+          nonrd_select_partition(cpi, tile_data, mi, tp, mi_row, mi_col,
+                                 BLOCK_64X64, 1, &dummy_rdc, cpi->pc_root);
         }
+
         break;
       default:
         assert(0);
@@ -3343,43 +3379,64 @@ static int get_skip_encode_frame(const VP9_COMMON *cm) {
          cm->show_frame;
 }
 
+static void tile_data_init(TileDataEnc *tile_data) {
+  int i, j;
+  for (i = 0; i < BLOCK_SIZES; ++i) {
+    for (j = 0; j < MAX_MODES; ++j) {
+      tile_data->thresh_freq_fact[i][j] = 32;
+      tile_data->mode_map[i][j] = j;
+    }
+  }
+}
+
 static void encode_tiles(VP9_COMP *cpi) {
-  const VP9_COMMON *const cm = &cpi->common;
+  VP9_COMMON *const cm = &cpi->common;
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
 
   int tile_col, tile_row;
-  TileInfo tile[4][1 << 6];
   TOKENEXTRA *tok[4][1 << 6];
   TOKENEXTRA *pre_tok = cpi->tok;
   int tile_tok = 0;
 
+  if (cpi->tile_data == NULL) {
+    CHECK_MEM_ERROR(cm, cpi->tile_data,
+        vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+    for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+      for (tile_col = 0; tile_col < tile_cols; ++tile_col)
+        tile_data_init(&cpi->tile_data[tile_row * tile_cols + tile_col]);
+  }
+
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-      vp9_tile_init(&tile[tile_row][tile_col], cm, tile_row, tile_col);
+      TileInfo *tile_info =
+          &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info;
+      vp9_tile_init(tile_info, cm, tile_row, tile_col);
 
       tok[tile_row][tile_col] = pre_tok + tile_tok;
       pre_tok = tok[tile_row][tile_col];
-      tile_tok = allocated_tokens(tile[tile_row][tile_col]);
+      tile_tok = allocated_tokens(*tile_info);
     }
   }
 
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-      const TileInfo * const ptile = &tile[tile_row][tile_col];
+      const TileInfo * const tile_info =
+          &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info;
       TOKENEXTRA * const old_tok = tok[tile_row][tile_col];
       int mi_row;
+      TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
 
-      for (mi_row = ptile->mi_row_start; mi_row < ptile->mi_row_end;
+      for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
            mi_row += MI_BLOCK_SIZE) {
         if (cpi->sf.use_nonrd_pick_mode && !frame_is_intra_only(cm))
-          encode_nonrd_sb_row(cpi, ptile, mi_row, &tok[tile_row][tile_col]);
+          encode_nonrd_sb_row(cpi, this_tile, mi_row, &tok[tile_row][tile_col]);
         else
-          encode_rd_sb_row(cpi, ptile, mi_row, &tok[tile_row][tile_col]);
+          encode_rd_sb_row(cpi, this_tile, mi_row, &tok[tile_row][tile_col]);
       }
       cpi->tok_count[tile_row][tile_col] =
           (unsigned int)(tok[tile_row][tile_col] - old_tok);
-      assert(tok[tile_row][tile_col] - old_tok <= allocated_tokens(*ptile));
+      assert(tok[tile_row][tile_col] - old_tok <= allocated_tokens(*tile_info));
     }
   }
 }
@@ -3737,7 +3794,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
       vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
                            &xd->block_refs[ref]->sf);
     }
-    if (!cpi->sf.reuse_inter_pred_sby || seg_skip)
+    if (!(cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready) || seg_skip)
       vp9_build_inter_predictors_sby(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
 
     vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index f5faa7c23..8ce30789f 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -29,12 +29,6 @@ struct optimize_ctx {
   ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
 };
 
-struct encode_b_args {
-  MACROBLOCK *x;
-  struct optimize_ctx *ctx;
-  int8_t *skip;
-};
-
 void vp9_subtract_block_c(int rows, int cols,
                           int16_t *diff, ptrdiff_t diff_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
@@ -802,7 +796,7 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
   }
 }
 
-static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                                TX_SIZE tx_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK *const x = args->x;
@@ -1040,18 +1034,10 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
     *(args->skip) = 0;
 }
 
-void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
-                            BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                            int8_t *skip) {
-  struct encode_b_args arg = {x, NULL, skip};
-  encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
-}
-
-
 void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   struct encode_b_args arg = {x, NULL, &xd->mi[0].src_mi->mbmi.skip};
 
-  vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block_intra,
-                                         &arg);
+  vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
+                                         vp9_encode_block_intra, &arg);
 }
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 54d2b3751..97df8a66b 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -18,6 +18,11 @@
 extern "C" {
 #endif
 
+struct encode_b_args {
+  MACROBLOCK *x;
+  struct optimize_ctx *ctx;
+  int8_t *skip;
+};
 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
 void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
 void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
@@ -29,9 +34,8 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
 
 void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
-void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
-                            BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                            int8_t *skip);
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+                            TX_SIZE tx_size, void *arg);
 
 void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 089839567..f36d76e3d 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -163,7 +163,7 @@ static void write_mv_update(const vp9_tree_index *tree,
 
 void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w) {
   int i, j;
-  nmv_context *const mvc = &cm->fc.nmvc;
+  nmv_context *const mvc = &cm->fc->nmvc;
   nmv_context_counts *const counts = &cm->counts.mv;
 
   write_mv_update(vp9_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index c5e872607..dfc636a41 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -134,7 +134,7 @@ static void setup_frame(VP9_COMP *cpi) {
     cpi->refresh_alt_ref_frame = 1;
     vp9_zero(cpi->interp_filter_selected);
   } else {
-    cm->fc = cm->frame_contexts[cm->frame_context_idx];
+    *cm->fc = cm->frame_contexts[cm->frame_context_idx];
     vp9_zero(cpi->interp_filter_selected[0]);
   }
 }
@@ -160,6 +160,13 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   int i;
 
+  vpx_free(cm->fc);
+  cm->fc = NULL;
+  vpx_free(cm->frame_contexts);
+  cm->frame_contexts = NULL;
+  vpx_free(cpi->tile_data);
+  cpi->tile_data = NULL;
+
   // Delete sementation map
   vpx_free(cpi->segmentation_map);
   cpi->segmentation_map = NULL;
@@ -257,7 +264,7 @@ static void save_coding_context(VP9_COMP *cpi) {
   vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
   vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
 
-  cc->fc = cm->fc;
+  cc->fc = *cm->fc;
 }
 
 static void restore_coding_context(VP9_COMP *cpi) {
@@ -286,7 +293,7 @@ static void restore_coding_context(VP9_COMP *cpi) {
   vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
   vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
 
-  cm->fc = cc->fc;
+  *cm->fc = cc->fc;
 }
 
 static void configure_static_seg_features(VP9_COMP *cpi) {
@@ -1374,6 +1381,12 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
 
   cm->error.setjmp = 1;
 
+  CHECK_MEM_ERROR(cm, cm->fc,
+                  (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(cm, cm->frame_contexts,
+                  (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS,
+                  sizeof(*cm->frame_contexts)));
+
   cpi->use_svc = 0;
 
   init_config(cpi, oxcf);
@@ -1381,6 +1394,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
 
   cm->current_video_frame = 0;
   cpi->partition_search_skippable_frame = 0;
+  cpi->tile_data = NULL;
 
   // Create the encoder segmentation map and set all entries to 0
   CHECK_MEM_ERROR(cm, cpi->segmentation_map,
@@ -1435,6 +1449,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
 #endif
 
   cpi->refresh_alt_ref_frame = 0;
+  cpi->multi_arf_last_grp_enabled = 0;
 
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
@@ -2397,30 +2412,37 @@ void vp9_scale_references(VP9_COMP *cpi) {
   const VP9_REFFRAME ref_mask[3] = {VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG};
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
-    const YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf;
-
     // Need to convert from VP9_REFFRAME to index into ref_mask (subtract 1).
-    if ((cpi->ref_frame_flags & ref_mask[ref_frame - 1]) &&
-        (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)) {
-      const int new_fb = get_free_fb(cm);
-      vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf,
-                               cm->width, cm->height,
-                               cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
-                               cm->use_highbitdepth,
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+    if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) {
+      const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
+      const YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf;
+
 #if CONFIG_VP9_HIGHBITDEPTH
-      scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf,
-                             (int)cm->bit_depth);
+      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+        const int new_fb = get_free_fb(cm);
+        vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf,
+                                 cm->width, cm->height,
+                                 cm->subsampling_x, cm->subsampling_y,
+                                 cm->use_highbitdepth,
+                                 VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+        scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf,
+                               (int)cm->bit_depth);
 #else
-      scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf);
+      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+        const int new_fb = get_free_fb(cm);
+        vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf,
+                                 cm->width, cm->height,
+                                 cm->subsampling_x, cm->subsampling_y,
+                                 VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+        scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-      cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+        cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+      } else {
+        cpi->scaled_ref_idx[ref_frame - 1] = idx;
+        ++cm->frame_bufs[idx].ref_count;
+      }
     } else {
-      cpi->scaled_ref_idx[ref_frame - 1] = idx;
-      cm->frame_bufs[idx].ref_count++;
+      cpi->scaled_ref_idx[ref_frame - 1] = INVALID_REF_BUFFER_IDX;
     }
   }
 }
@@ -2428,9 +2450,13 @@ void vp9_scale_references(VP9_COMP *cpi) {
 static void release_scaled_references(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   int i;
-
-  for (i = 0; i < 3; i++)
-    cm->frame_bufs[cpi->scaled_ref_idx[i]].ref_count--;
+  for (i = 0; i < MAX_REF_FRAMES; ++i) {
+    const int idx = cpi->scaled_ref_idx[i];
+    RefCntBuffer *const buf =
+        idx != INVALID_REF_BUFFER_IDX ? &cm->frame_bufs[idx] : NULL;
+    if (buf != NULL)
+      --buf->ref_count;
+  }
 }
 
 static void full_to_model_count(unsigned int *model_count,
@@ -2515,10 +2541,181 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
 }
 #endif
 
-static void encode_without_recode_loop(VP9_COMP *cpi,
-                                       int q) {
+static void set_mv_search_params(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const unsigned int max_mv_def = MIN(cm->width, cm->height);
+
+  // Default based on max resolution.
+  cpi->mv_step_param = vp9_init_search_range(max_mv_def);
+
+  if (cpi->sf.mv.auto_mv_step_size) {
+    if (frame_is_intra_only(cm)) {
+      // Initialize max_mv_magnitude for use in the first INTER frame
+      // after a key/intra-only frame.
+      cpi->max_mv_magnitude = max_mv_def;
+    } else {
+      if (cm->show_frame) {
+        // Allow mv_steps to correspond to twice the max mv magnitude found
+        // in the previous frame, capped by the default max_mv_magnitude based
+        // on resolution.
+        cpi->mv_step_param =
+            vp9_init_search_range(MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+      }
+      cpi->max_mv_magnitude = 0;
+    }
+  }
+}
+
+static void set_size_dependent_vars(VP9_COMP *cpi, int *q,
+                                    int *bottom_index, int *top_index) {
+  VP9_COMMON *const cm = &cpi->common;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+
+  // Set up variables that depend on the dimensions of the frame.
+  set_mv_search_params(cpi);
+
+  // Configure experimental use of segmentation for enhanced coding of
+  // static regions if indicated.
+  // Only allowed in the second pass of a two pass encode, as it requires
+  // lagged coding, and if the relevant speed feature flag is set.
+  if (oxcf->pass == 2 && cpi->sf.static_segmentation)
+    configure_static_seg_features(cpi);
+
+#if CONFIG_VP9_POSTPROC
+  if (oxcf->noise_sensitivity > 0) {
+    int l = 0;
+    switch (oxcf->noise_sensitivity) {
+      case 1:
+        l = 20;
+        break;
+      case 2:
+        l = 40;
+        break;
+      case 3:
+        l = 60;
+        break;
+      case 4:
+      case 5:
+        l = 100;
+        break;
+      case 6:
+        l = 150;
+        break;
+    }
+    vp9_denoise(cpi->Source, cpi->Source, l);
+  }
+#endif  // CONFIG_VP9_POSTPROC
+
+  vp9_set_speed_features(cpi);
+
+  vp9_set_rd_speed_thresholds(cpi);
+  vp9_set_rd_speed_thresholds_sub8x8(cpi);
+
+  // Decide q and q bounds.
+  *q = vp9_rc_pick_q_and_bounds(cpi, bottom_index, top_index);
+
+  if (!frame_is_intra_only(cm)) {
+    cm->interp_filter = cpi->sf.default_interp_filter;
+    vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH);
+  }
+}
+
+static void init_motion_estimation(VP9_COMP *cpi) {
+  int y_stride = cpi->scaled_source.y_stride;
+
+  if (cpi->sf.mv.search_method == NSTEP) {
+    vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride);
+  } else if (cpi->sf.mv.search_method == DIAMOND) {
+    vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
+  }
+}
+
+extern void vbr_rate_correction(VP9_COMP *cpi,
+                                int * this_frame_target,
+                                const int64_t vbr_bits_off_target);
+
+void set_frame_size(VP9_COMP *cpi) {
+  int ref_frame;
+  VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+
+  if ((oxcf->pass == 2) &&
+      (!cpi->use_svc ||
+          (is_two_pass_svc(cpi) &&
+              cpi->svc.encode_empty_frame_state != ENCODING))) {
+    int target_rate = rc->base_frame_target;
+    if (oxcf->rc_mode == VPX_VBR)
+      vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target);
+    vp9_rc_set_frame_target(cpi, target_rate);
+  }
+
+  if (oxcf->pass == 2 &&
+      cm->current_video_frame == 0 &&
+      oxcf->allow_spatial_resampling &&
+      oxcf->rc_mode == VPX_VBR) {
+    // Internal scaling is triggered on the first frame.
+    vp9_set_size_literal(cpi, oxcf->scaled_frame_width,
+                         oxcf->scaled_frame_height);
+  }
+
+  // Reset the frame pointers to the current frame size.
+  vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
+                           cm->width, cm->height,
+                           cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                           cm->use_highbitdepth,
+#endif
+                           VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+
+  alloc_util_frame_buffers(cpi);
+  init_motion_estimation(cpi);
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
+    YV12_BUFFER_CONFIG *const buf = &cm->frame_bufs[idx].buf;
+    RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
+    ref_buf->buf = buf;
+    ref_buf->idx = idx;
+#if CONFIG_VP9_HIGHBITDEPTH
+    vp9_setup_scale_factors_for_frame(&ref_buf->sf,
+                                      buf->y_crop_width, buf->y_crop_height,
+                                      cm->width, cm->height,
+                                      (buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
+                                          1 : 0);
+#else
+    vp9_setup_scale_factors_for_frame(&ref_buf->sf,
+                                      buf->y_crop_width, buf->y_crop_height,
+                                      cm->width, cm->height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    if (vp9_is_scaled(&ref_buf->sf))
+      vp9_extend_frame_borders(buf);
+  }
+
+  set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+}
+
+static void encode_without_recode_loop(VP9_COMP *cpi) {
+  int q;
+  int bottom_index, top_index;  // Unused outputs of set_size_dependent_vars().
   VP9_COMMON *const cm = &cpi->common;
+
   vp9_clear_system_state();
+
+  set_frame_size(cpi);
+
+  cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
+                                      &cpi->scaled_source);
+
+  if (cpi->unscaled_last_source != NULL)
+    cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
+                                             &cpi->scaled_last_source);
+
+  vp9_scale_references(cpi);
+
+  set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
   vp9_set_quantizer(cm, q);
   setup_frame(cpi);
   // Variance adaptive and in frame q adjustment experiments are mutually
@@ -2541,28 +2738,45 @@ static void encode_without_recode_loop(VP9_COMP *cpi,
 
 static void encode_with_recode_loop(VP9_COMP *cpi,
                                     size_t *size,
-                                    uint8_t *dest,
-                                    int q,
-                                    int bottom_index,
-                                    int top_index) {
+                                    uint8_t *dest) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  int q;
+  int q_low, q_high;
+  int bottom_index, top_index;
   int loop_count = 0;
   int loop = 0;
   int overshoot_seen = 0;
   int undershoot_seen = 0;
-  int q_low = bottom_index, q_high = top_index;
   int frame_over_shoot_limit;
   int frame_under_shoot_limit;
 
-  // Decide frame size bounds
-  vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
-                                   &frame_under_shoot_limit,
-                                   &frame_over_shoot_limit);
-
   do {
     vp9_clear_system_state();
 
+    if (loop_count == 0) {
+      set_frame_size(cpi);
+
+      // Decide frame size bounds
+      vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+                                       &frame_under_shoot_limit,
+                                       &frame_over_shoot_limit);
+
+      cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
+                                        &cpi->scaled_source);
+
+      if (cpi->unscaled_last_source != NULL)
+        cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
+                                                 &cpi->scaled_last_source);
+
+      vp9_scale_references(cpi);
+
+      set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
+      q_low = bottom_index;
+      q_high = top_index;
+    }
+
     vp9_set_quantizer(cm, q);
 
     if (loop_count == 0)
@@ -2804,25 +3018,6 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
   }
 }
 
-static int is_skippable_frame(const VP9_COMP *cpi) {
-  // If the current frame does not have non-zero motion vector detected in the
-  // first  pass, and so do its previous and forward frames, then this frame
-  // can be skipped for partition check, and the partition size is assigned
-  // according to the variance
-  const SVC *const svc = &cpi->svc;
-  const TWO_PASS *const twopass = is_two_pass_svc(cpi) ?
-      &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
-
-  return (!frame_is_intra_only(&cpi->common) &&
-    twopass->stats_in - 2 > twopass->stats_in_start &&
-    twopass->stats_in < twopass->stats_in_end &&
-    (twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion
-    == 1 &&
-    (twopass->stats_in - 2)->pcnt_inter - (twopass->stats_in - 2)->pcnt_motion
-    == 1 &&
-    twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
-}
-
 static void set_arf_sign_bias(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   int arf_sign_bias;
@@ -2839,31 +3034,6 @@ static void set_arf_sign_bias(VP9_COMP *cpi) {
   cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias;
 }
 
-static void set_mv_search_params(VP9_COMP *cpi) {
-  const VP9_COMMON *const cm = &cpi->common;
-  const unsigned int max_mv_def = MIN(cm->width, cm->height);
-
-  // Default based on max resolution.
-  cpi->mv_step_param = vp9_init_search_range(max_mv_def);
-
-  if (cpi->sf.mv.auto_mv_step_size) {
-    if (frame_is_intra_only(cm)) {
-      // Initialize max_mv_magnitude for use in the first INTER frame
-      // after a key/intra-only frame.
-      cpi->max_mv_magnitude = max_mv_def;
-    } else {
-      if (cm->show_frame)
-        // Allow mv_steps to correspond to twice the max mv magnitude found
-        // in the previous frame, capped by the default max_mv_magnitude based
-        // on resolution.
-        cpi->mv_step_param =
-            vp9_init_search_range(MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
-      cpi->max_mv_magnitude = 0;
-    }
-  }
-}
-
-
 int setup_interp_filter_search_mask(VP9_COMP *cpi) {
   INTERP_FILTER ifilter;
   int ref_total[MAX_REF_FRAMES] = {0};
@@ -2898,21 +3068,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   struct segmentation *const seg = &cm->seg;
   TX_SIZE t;
-  int q;
-  int top_index;
-  int bottom_index;
 
   set_ext_overrides(cpi);
 
-  cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
-                                      &cpi->scaled_source);
-
-  if (cpi->unscaled_last_source != NULL)
-    cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
-                                             &cpi->scaled_last_source);
-
-  vp9_scale_references(cpi);
-
   vp9_clear_system_state();
 
   // Enable or disable mode based tweaking of the zbin.
@@ -2927,14 +3085,11 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   // Set default state for segment based loop filter update flags.
   cm->lf.mode_ref_delta_update = 0;
 
-  set_mv_search_params(cpi);
-
   if (cpi->oxcf.pass == 2 &&
       cpi->sf.adaptive_interp_filter_search)
     cpi->sf.interp_filter_search_mask =
         setup_interp_filter_search_mask(cpi);
 
-
   // Set various flags etc to special state if it is a key frame.
   if (frame_is_intra_only(cm)) {
     // Reset the loop filter deltas and segmentation map.
@@ -2994,20 +3149,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     }
   }
 
-  // Configure experimental use of segmentation for enhanced coding of
-  // static regions if indicated.
-  // Only allowed in second pass of two pass (as requires lagged coding)
-  // and if the relevant speed feature flag is set.
-  if (oxcf->pass == 2 && cpi->sf.static_segmentation)
-    configure_static_seg_features(cpi);
-
-  // Check if the current frame is skippable for the partition search in the
-  // second pass according to the first pass stats
-  if (cpi->sf.allow_partition_search_skip && oxcf->pass == 2 &&
-      (!cpi->use_svc || is_two_pass_svc(cpi))) {
-    cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
-  }
-
   // For 1 pass CBR, check if we are dropping this frame.
   // Never drop on key frame.
   if (oxcf->pass == 0 &&
@@ -3022,31 +3163,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 
   vp9_clear_system_state();
 
-#if CONFIG_VP9_POSTPROC
-  if (oxcf->noise_sensitivity > 0) {
-    int l = 0;
-    switch (oxcf->noise_sensitivity) {
-      case 1:
-        l = 20;
-        break;
-      case 2:
-        l = 40;
-        break;
-      case 3:
-        l = 60;
-        break;
-      case 4:
-      case 5:
-        l = 100;
-        break;
-      case 6:
-        l = 150;
-        break;
-    }
-    vp9_denoise(cpi->Source, cpi->Source, l);
-  }
-#endif
-
 #if CONFIG_INTERNAL_STATS
   {
     int i;
@@ -3055,24 +3171,10 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   }
 #endif
 
-  vp9_set_speed_features(cpi);
-
-  vp9_set_rd_speed_thresholds(cpi);
-  vp9_set_rd_speed_thresholds_sub8x8(cpi);
-
-  // Decide q and q bounds.
-  q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index);
-
-  if (!frame_is_intra_only(cm)) {
-    cm->interp_filter = cpi->sf.default_interp_filter;
-    /* TODO: Decide this more intelligently */
-    vp9_set_high_precision_mv(cpi, q < HIGH_PRECISION_MV_QTHRESH);
-  }
-
   if (cpi->sf.recode_loop == DISALLOW_RECODE) {
-    encode_without_recode_loop(cpi, q);
+    encode_without_recode_loop(cpi);
   } else {
-    encode_with_recode_loop(cpi, size, dest, q, bottom_index, top_index);
+    encode_with_recode_loop(cpi, size, dest);
   }
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
@@ -3215,16 +3317,6 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size,
     vp9_twopass_postencode_update(cpi);
 }
 
-static void init_motion_estimation(VP9_COMP *cpi) {
-  int y_stride = cpi->scaled_source.y_stride;
-
-  if (cpi->sf.mv.search_method == NSTEP) {
-    vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride);
-  } else if (cpi->sf.mv.search_method == DIAMOND) {
-    vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
-  }
-}
-
 static void check_initial_width(VP9_COMP *cpi,
 #if CONFIG_VP9_HIGHBITDEPTH
                                 int use_highbitdepth,
@@ -3243,10 +3335,11 @@ static void check_initial_width(VP9_COMP *cpi,
     alloc_ref_frame_buffers(cpi);
     alloc_util_frame_buffers(cpi);
 
-    init_motion_estimation(cpi);
+    init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
 
     cpi->initial_width = cm->width;
     cpi->initial_height = cm->height;
+    cpi->initial_mbs = cm->MBs;
   }
 }
 
@@ -3388,14 +3481,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
                             int64_t *time_stamp, int64_t *time_end, int flush) {
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   RATE_CONTROL *const rc = &cpi->rc;
   struct vpx_usec_timer  cmptimer;
   YV12_BUFFER_CONFIG *force_src_buffer = NULL;
   struct lookahead_entry *last_source = NULL;
   struct lookahead_entry *source = NULL;
-  MV_REFERENCE_FRAME ref_frame;
   int arf_src_index;
+  int i;
 
   if (is_two_pass_svc(cpi)) {
 #if CONFIG_SPATIAL_SVC
@@ -3416,11 +3508,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   // Note that at the moment multi_arf is only configured for 2 pass VBR and
   // will not work properly with svc.
   if ((oxcf->pass == 2) && !cpi->use_svc &&
-      (cpi->oxcf.enable_auto_arf > 1) && (cpi->oxcf.rc_mode == VPX_VBR))
+      (cpi->oxcf.enable_auto_arf > 1))
     cpi->multi_arf_allowed = 1;
   else
     cpi->multi_arf_allowed = 0;
-  cpi->multi_arf_last_grp_enabled = 0;
 
   // Normal defaults
   cm->reset_frame_context = 0;
@@ -3535,24 +3626,11 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
     vp9_restore_layer_context(cpi);
   }
 
-  // start with a 0 size frame
-  *size = 0;
-
-  /* find a free buffer for the new frame, releasing the reference previously
-   * held.
-   */
+  // Find a free buffer for the new frame, releasing the reference previously
+  // held.
   cm->frame_bufs[cm->new_fb_idx].ref_count--;
   cm->new_fb_idx = get_free_fb(cm);
 
-  // For two pass encodes analyse the first pass stats and determine
-  // the bit allocation and other parameters for this frame / group of frames.
-  if ((oxcf->pass == 2) &&
-      (!cpi->use_svc ||
-       (is_two_pass_svc(cpi) &&
-        cpi->svc.encode_empty_frame_state != ENCODING))) {
-    vp9_rc_get_second_pass_params(cpi);
-  }
-
   if (!cpi->use_svc && cpi->multi_arf_allowed) {
     if (cm->frame_type == KEY_FRAME) {
       init_buffer_indices(cpi);
@@ -3562,56 +3640,27 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
     }
   }
 
-  cpi->frame_flags = *frame_flags;
-
-  if (oxcf->pass == 2 &&
-      cm->current_video_frame == 0 &&
-      oxcf->allow_spatial_resampling &&
-      oxcf->rc_mode == VPX_VBR) {
-    // Internal scaling is triggered on the first frame.
-    vp9_set_size_literal(cpi, oxcf->scaled_frame_width,
-                         oxcf->scaled_frame_height);
-  }
-
-  // Reset the frame pointers to the current frame size
-  vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
-                           cm->width, cm->height,
-                           cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
-                           cm->use_highbitdepth,
-#endif
-                           VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+  // Start with a 0 size frame.
+  *size = 0;
 
-  alloc_util_frame_buffers(cpi);
-  init_motion_estimation(cpi);
+  cpi->frame_flags = *frame_flags;
 
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
-    YV12_BUFFER_CONFIG *const buf = &cm->frame_bufs[idx].buf;
-    RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
-    ref_buf->buf = buf;
-    ref_buf->idx = idx;
-#if CONFIG_VP9_HIGHBITDEPTH
-    vp9_setup_scale_factors_for_frame(&ref_buf->sf,
-                                      buf->y_crop_width, buf->y_crop_height,
-                                      cm->width, cm->height,
-                                      (buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
-                                          1 : 0);
-#else
-    vp9_setup_scale_factors_for_frame(&ref_buf->sf,
-                                      buf->y_crop_width, buf->y_crop_height,
-                                      cm->width, cm->height);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-    if (vp9_is_scaled(&ref_buf->sf))
-      vp9_extend_frame_borders(buf);
+  if ((oxcf->pass == 2) &&
+      (!cpi->use_svc ||
+          (is_two_pass_svc(cpi) &&
+              cpi->svc.encode_empty_frame_state != ENCODING))) {
+    vp9_rc_get_second_pass_params(cpi);
+  } else {
+    set_frame_size(cpi);
   }
 
-  set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
-
   if (oxcf->aq_mode == VARIANCE_AQ) {
     vp9_vaq_init();
   }
 
+  for (i = 0; i < MAX_REF_FRAMES; ++i)
+    cpi->scaled_ref_idx[i] = INVALID_REF_BUFFER_IDX;
+
   if (oxcf->pass == 1 &&
       (!cpi->use_svc || is_two_pass_svc(cpi))) {
     const int lossless = is_lossless_requested(oxcf);
@@ -3638,9 +3687,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   }
 
   if (cm->refresh_frame_context)
-    cm->frame_contexts[cm->frame_context_idx] = cm->fc;
+    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
 
-  // Frame was dropped, release scaled references.
+  // No frame encoded, or frame was dropped, release scaled references.
   if (*size == 0) {
     release_scaled_references(cpi);
   }
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 1e6047464..0e112f2ff 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -44,6 +44,7 @@ extern "C" {
 #endif
 
 #define DEFAULT_GF_INTERVAL         10
+#define INVALID_REF_BUFFER_IDX      -1  // Marks an invalid reference buffer id.
 
 typedef struct {
   int nmvjointcost[MV_JOINTS];
@@ -122,7 +123,12 @@ typedef struct VP9EncoderConfig {
   int noise_sensitivity;  // pre processing blur: recommendation 0
   int sharpness;  // sharpening output: recommendation 0:
   int speed;
+  // maximum allowed bitrate for any intra frame in % of bitrate target.
   unsigned int rc_max_intra_bitrate_pct;
+  // maximum allowed bitrate for any inter frame in % of bitrate target.
+  unsigned int rc_max_inter_bitrate_pct;
+  // percent of rate boost for golden frame in CBR mode.
+  unsigned int gf_cbr_boost_pct;
 
   MODE mode;
   int pass;
@@ -223,6 +229,13 @@ static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
   return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
 }
 
+// TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
+typedef struct TileDataEnc {
+  TileInfo tile_info;
+  int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
+  int mode_map[BLOCK_SIZES][MAX_MODES];
+} TileDataEnc;
+
 typedef struct VP9_COMP {
   QUANTS quants;
   MACROBLOCK mb;
@@ -238,10 +251,12 @@ typedef struct VP9_COMP {
   YV12_BUFFER_CONFIG *unscaled_last_source;
   YV12_BUFFER_CONFIG scaled_last_source;
 
+  TileDataEnc *tile_data;
+
   // For a still frame, this flag is set to 1 to skip partition search.
   int partition_search_skippable_frame;
 
-  int scaled_ref_idx[3];
+  int scaled_ref_idx[MAX_REF_FRAMES];
   int lst_fb_idx;
   int gld_fb_idx;
   int alt_fb_idx;
@@ -374,6 +389,10 @@ typedef struct VP9_COMP {
 
   int initial_width;
   int initial_height;
+  int initial_mbs;  // Number of MBs in the full-size frame; to be used to
+                    // normalize the firstpass stats. This will differ from the
+                    // number of MBs in the current frame when the frame is
+                    // scaled.
 
   int use_svc;
 
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index f1baf8323..c8c784b73 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -138,7 +138,7 @@ static void output_fpmb_stats(uint8_t *this_frame_mb_stats, VP9_COMMON *cm,
   struct vpx_codec_cx_pkt pkt;
   pkt.kind = VPX_CODEC_FPMB_STATS_PKT;
   pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats;
-  pkt.data.firstpass_mb_stats.sz = cm->MBs * sizeof(uint8_t);
+  pkt.data.firstpass_mb_stats.sz = cm->initial_mbs * sizeof(uint8_t);
   vpx_codec_pkt_list_add(pktlist, &pkt);
 }
 #endif
@@ -483,7 +483,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
 
 #if CONFIG_FP_MB_STATS
   if (cpi->use_fp_mb_stats) {
-    vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->MBs);
+    vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cpi->initial_mbs);
   }
 #endif
 
@@ -934,12 +934,14 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   vp9_clear_system_state();
   {
     FIRSTPASS_STATS fps;
-    // The minimum error here insures some bit alocation to frames even
+    // The minimum error here ensures some bit allocation to frames even
     // in static regions. The allocation per MB declines for larger formats
     // where the typical "real" energy per MB also falls.
     // Initial estimate here uses sqrt(mbs) to define the min_err, where the
-    // number of mbs is propotional to image area.
-    const double min_err = 200 * sqrt(cm->MBs);
+    // number of mbs is proportional to the image area.
+    const int num_mbs =
+        cpi->oxcf.allow_spatial_resampling ? cpi->initial_mbs : cpi->common.MBs;
+    const double min_err = 200 * sqrt(num_mbs);
 
     fps.frame = cm->current_video_frame;
     fps.spatial_layer_id = cpi->svc.spatial_layer_id;
@@ -947,9 +949,9 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
     fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err;
     fps.intra_error = (double)(intra_error >> 8) + min_err;
     fps.count = 1.0;
-    fps.pcnt_inter = (double)intercount / cm->MBs;
-    fps.pcnt_second_ref = (double)second_ref_count / cm->MBs;
-    fps.pcnt_neutral = (double)neutral_count / cm->MBs;
+    fps.pcnt_inter = (double)intercount / num_mbs;
+    fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
+    fps.pcnt_neutral = (double)neutral_count / num_mbs;
 
     if (mvcount > 0) {
       fps.MVr = (double)sum_mvr / mvcount;
@@ -960,7 +962,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
       fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / mvcount)) / mvcount;
       fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
       fps.new_mv_count = new_mv_count;
-      fps.pcnt_motion = (double)mvcount / cm->MBs;
+      fps.pcnt_motion = (double)mvcount / num_mbs;
     } else {
       fps.MVr = 0.0;
       fps.mvr_abs = 0.0;
@@ -1074,7 +1076,8 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
   if (section_target_bandwidth <= 0) {
     return rc->worst_quality;  // Highest value allowed
   } else {
-    const int num_mbs = cpi->common.MBs;
+    const int num_mbs =
+        cpi->oxcf.allow_spatial_resampling ? cpi->initial_mbs : cpi->common.MBs;
     const double section_err = stats->coded_error / stats->count;
     const double err_per_mb = section_err / num_mbs;
     const double speed_term = 1.0 + 0.04 * oxcf->speed;
@@ -1188,9 +1191,12 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
 #define LOW_SR_DIFF_TRHESH 0.1
 #define SR_DIFF_MAX 128.0
 
-static double get_sr_decay_rate(const VP9_COMMON *cm,
+static double get_sr_decay_rate(const VP9_COMP *cpi,
                                 const FIRSTPASS_STATS *frame) {
-  double sr_diff = (frame->sr_coded_error - frame->coded_error) / cm->MBs;
+  const int num_mbs =
+      cpi->oxcf.allow_spatial_resampling ? cpi->initial_mbs : cpi->common.MBs;
+  double sr_diff =
+      (frame->sr_coded_error - frame->coded_error) / num_mbs;
   double sr_decay = 1.0;
   const double motion_amplitude_factor =
     frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
@@ -1207,19 +1213,19 @@ static double get_sr_decay_rate(const VP9_COMMON *cm,
 
 // This function gives an estimate of how badly we believe the prediction
 // quality is decaying from frame to frame.
-static double get_zero_motion_factor(const VP9_COMMON *cm,
+static double get_zero_motion_factor(const VP9_COMP *cpi,
                                      const FIRSTPASS_STATS *frame) {
   const double zero_motion_pct = frame->pcnt_inter -
                                  frame->pcnt_motion;
-  double sr_decay = get_sr_decay_rate(cm, frame);
+  double sr_decay = get_sr_decay_rate(cpi, frame);
   return MIN(sr_decay, zero_motion_pct);
 }
 
 #define ZM_POWER_FACTOR 0.75
 
-static double get_prediction_decay_rate(const VP9_COMMON *cm,
+static double get_prediction_decay_rate(const VP9_COMP *cpi,
                                         const FIRSTPASS_STATS *next_frame) {
-  const double sr_decay_rate = get_sr_decay_rate(cm, next_frame);
+  const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame);
   const double zero_motion_factor =
     (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion),
                 ZM_POWER_FACTOR));
@@ -1314,9 +1320,11 @@ static double calc_frame_boost(VP9_COMP *cpi,
     vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
                             cpi->common.bit_depth);
   const double boost_correction = MIN((0.5 + (lq * 0.015)), 1.5);
+  const int num_mbs =
+      cpi->oxcf.allow_spatial_resampling ? cpi->initial_mbs : cpi->common.MBs;
 
   // Underlying boost factor is based on inter error ratio.
-  frame_boost = (BASELINE_ERR_PER_MB * cpi->common.MBs) /
+  frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
                 DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
   frame_boost = frame_boost * BOOST_FACTOR * boost_correction;
 
@@ -1365,7 +1373,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
 
     // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(&cpi->common, this_frame);
+      decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                           ? MIN_DECAY_FACTOR : decay_accumulator;
     }
@@ -1404,7 +1412,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
 
     // Cumulative effect of prediction quality decay.
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(&cpi->common, this_frame);
+      decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                               ? MIN_DECAY_FACTOR : decay_accumulator;
     }
@@ -1723,7 +1731,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     gf_group_err -= gf_first_frame_err;
 
   // Motion breakout threshold for loop below depends on image size.
-  mv_ratio_accumulator_thresh = (cpi->common.width + cpi->common.height) / 4.0;
+  mv_ratio_accumulator_thresh =
+      (cpi->common.height + cpi->common.width) / 4.0;
 
   // Set a maximum and minimum interval for the GF group.
   // If the image appears almost completely static we can extend beyond this.
@@ -1775,14 +1784,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
       last_loop_decay_rate = loop_decay_rate;
-      loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
+      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
 
       decay_accumulator = decay_accumulator * loop_decay_rate;
 
       // Monitor for static sections.
       zero_motion_accumulator =
         MIN(zero_motion_accumulator,
-            get_zero_motion_factor(&cpi->common, &next_frame));
+            get_zero_motion_factor(cpi, &next_frame));
 
       // Break clause to detect very still sections after motion. For example,
       // a static image after a fade or other transition.
@@ -2048,8 +2057,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
         break;
 
       // How fast is the prediction quality decaying?
-      loop_decay_rate = get_prediction_decay_rate(&cpi->common,
-                                                  twopass->stats_in);
+      loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in);
 
       // We want to know something about the recent past... rather than
       // as used elsewhere where we are concerned with decay in prediction
@@ -2160,7 +2168,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     // Monitor for static sections.
     zero_motion_accumulator =
       MIN(zero_motion_accumulator,
-          get_zero_motion_factor(&cpi->common, &next_frame));
+          get_zero_motion_factor(cpi, &next_frame));
 
     // Not all frames in the group are necessarily used in calculating boost.
     if ((i <= rc->max_gf_interval) ||
@@ -2171,7 +2179,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       // How fast is prediction quality decaying.
       if (!detect_flash(twopass, 0)) {
         const double loop_decay_rate =
-          get_prediction_decay_rate(&cpi->common, &next_frame);
+          get_prediction_decay_rate(cpi, &next_frame);
         decay_accumulator *= loop_decay_rate;
         decay_accumulator = MAX(decay_accumulator, MIN_DECAY_FACTOR);
         av_decay_accumulator += decay_accumulator;
@@ -2295,6 +2303,24 @@ void configure_buffer_updates(VP9_COMP *cpi) {
   }
 }
 
+int is_skippable_frame(const VP9_COMP *cpi) {
+  // Returns nonzero when the first-pass stats show no non-zero motion vector
+  // for the current frame or its previous and forward frames, in which case
+  // the partition search can be skipped and sizes assigned from the variance.
+  const SVC *const svc = &cpi->svc;
+  const TWO_PASS *const twopass = is_two_pass_svc(cpi) ?
+      &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
+
+  // Bounds are tested by offset so stats_in - 2 is formed only when valid.
+  return (!frame_is_intra_only(&cpi->common) &&
+    twopass->stats_in - twopass->stats_in_start > 2 &&
+    twopass->stats_in < twopass->stats_in_end &&
+    (twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion
+    == 1 &&
+    (twopass->stats_in - 2)->pcnt_inter - (twopass->stats_in - 2)->pcnt_motion
+    == 1 &&
+    twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
+}
 
 void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
@@ -2329,11 +2355,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
     target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
     rc->base_frame_target = target_rate;
 
-    // Correction to rate target based on prior over or under shoot.
-    if (cpi->oxcf.rc_mode == VPX_VBR)
-      vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target);
-
-    vp9_rc_set_frame_target(cpi, target_rate);
     cm->frame_type = INTER_FRAME;
 
     if (lc != NULL) {
@@ -2347,6 +2368,13 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
       }
     }
 
+    // Do the firstpass stats indicate that this frame is skippable for the
+    // partition search?
+    if (cpi->sf.allow_partition_search_skip &&
+        cpi->oxcf.pass == 2 && (!cpi->use_svc || is_two_pass_svc(cpi))) {
+      cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+    }
+
     return;
   }
 
@@ -2377,8 +2405,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   this_frame_copy = this_frame;
 
   // Keyframe and section processing.
-  if (rc->frames_to_key == 0 ||
-      (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+  if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
     // Define next KF group and assign bits to it.
     find_next_key_frame(cpi, &this_frame_copy);
   } else {
@@ -2431,6 +2458,13 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
 
   configure_buffer_updates(cpi);
 
+  // Do the firstpass stats indicate that this frame is skippable for the
+  // partition search?
+  if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 &&
+      (!cpi->use_svc || is_two_pass_svc(cpi))) {
+    cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+  }
+
   target_rate = gf_group->bit_allocation[gf_group->index];
   if (cpi->common.frame_type == KEY_FRAME)
     target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
@@ -2439,18 +2473,11 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
 
   rc->base_frame_target = target_rate;
 
-  // Correction to rate target based on prior over or under shoot.
-  if (cpi->oxcf.rc_mode == VPX_VBR)
-    vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target);
-
-  vp9_rc_set_frame_target(cpi, target_rate);
-
   // Update the total stats remaining structure.
   subtract_stats(&twopass->total_left_stats, &this_frame);
 }
 
 #define MINQ_ADJ_LIMIT 32
-#define Q_LIMIT_STEP 1
 void vp9_twopass_postencode_update(VP9_COMP *cpi) {
   TWO_PASS *const twopass = &cpi->twopass;
   RATE_CONTROL *const rc = &cpi->rc;
@@ -2495,16 +2522,22 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
     if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
       --twopass->extend_maxq;
       if (rc->rolling_target_bits >= rc->rolling_actual_bits)
-        twopass->extend_minq += Q_LIMIT_STEP;
+        ++twopass->extend_minq;
     // Overshoot.
     } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
       --twopass->extend_minq;
       if (rc->rolling_target_bits < rc->rolling_actual_bits)
-        twopass->extend_maxq += Q_LIMIT_STEP;
+        ++twopass->extend_maxq;
     } else {
+      // Adjustment for extreme local overshoot.
+      if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+          rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+        ++twopass->extend_maxq;
+
+      // Unwind undershoot or overshoot adjustment.
       if (rc->rolling_target_bits < rc->rolling_actual_bits)
         --twopass->extend_minq;
-      if (rc->rolling_target_bits > rc->rolling_actual_bits)
+      else if (rc->rolling_target_bits > rc->rolling_actual_bits)
         --twopass->extend_maxq;
     }
     twopass->extend_minq = clamp(twopass->extend_minq, 0, MINQ_ADJ_LIMIT);
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index ae9ed66cd..28f12916e 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -452,7 +452,8 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
   args->dist += dist;
 }
 
-static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][INTER_MODES] = {
+static const THR_MODES mode_idx[MAX_REF_FRAMES][4] = {
+  {THR_DC, THR_H_PRED, THR_V_PRED},
   {THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV},
   {THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG},
   {THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA},
@@ -461,13 +462,11 @@ static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][INTER_MODES] = {
 // TODO(jingning) placeholder for inter-frame non-RD mode decision.
 // this needs various further optimizations. to be continued..
 void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                         const TileInfo *const tile,
-                         int mi_row, int mi_col,
-                         int *returnrate,
-                         int64_t *returndistortion,
-                         BLOCK_SIZE bsize,
-                         PICK_MODE_CONTEXT *ctx) {
+                         TileDataEnc *tile_data,
+                         int mi_row, int mi_col, RD_COST *rd_cost,
+                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
   struct macroblockd_plane *const pd = &xd->plane[0];
@@ -480,11 +479,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
-  int64_t best_rd = INT64_MAX;
-  int64_t this_rd = INT64_MAX;
+  RD_COST this_rdc, best_rdc;
   uint8_t skip_txfm = 0;
-  int rate = INT_MAX;
-  int64_t dist = INT64_MAX;
   // var_y and sse_y are saved to be used in skipping checking
   unsigned int var_y = UINT_MAX;
   unsigned int sse_y = UINT_MAX;
@@ -500,8 +496,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   const int8_t segment_id = mbmi->segment_id;
   const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
-  const int *const rd_thresh_freq_fact = cpi->rd.thresh_freq_fact[bsize];
-  INTERP_FILTER filter_ref = cm->interp_filter;
+  const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
+  INTERP_FILTER filter_ref;
   const int bsl = mi_width_log2_lookup[bsize];
   const int pred_filter_search = cm->interp_filter == SWITCHABLE ?
       (((mi_row + mi_col) >> bsl) +
@@ -544,9 +540,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
   x->skip = 0;
 
+  if (xd->up_available)
+    filter_ref = xd->mi[-xd->mi_stride].src_mi->mbmi.interp_filter;
+  else if (xd->left_available)
+    filter_ref = xd->mi[-1].src_mi->mbmi.interp_filter;
+  else
+    filter_ref = cm->interp_filter;
+
   // initialize mode decisions
-  *returnrate = INT_MAX;
-  *returndistortion = INT64_MAX;
+  vp9_rd_cost_reset(&best_rdc);
+  vp9_rd_cost_reset(&this_rdc);
+  vp9_rd_cost_reset(rd_cost);
   vpx_memset(mbmi, 0, sizeof(MB_MODE_INFO));
   mbmi->sb_type = bsize;
   mbmi->ref_frame[0] = NONE;
@@ -557,17 +561,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                         EIGHTTAP : cm->interp_filter;
   mbmi->segment_id = segment_id;
 
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+  for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
     PREDICTION_MODE this_mode;
     x->pred_mv_sad[ref_frame] = INT_MAX;
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
 
-    if (xd->up_available)
-      filter_ref = xd->mi[-xd->mi_stride].src_mi->mbmi.interp_filter;
-    else if (xd->left_available)
-      filter_ref = xd->mi[-1].src_mi->mbmi.interp_filter;
-
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
       int_mv *const candidates = mbmi->ref_mvs[ref_frame];
@@ -576,10 +575,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                            sf, sf);
 
       if (!cm->error_resilient_mode)
-        vp9_find_mv_refs(cm, xd, tile, xd->mi[0].src_mi, ref_frame,
+        vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0].src_mi, ref_frame,
                          candidates, mi_row, mi_col);
       else
-        const_motion[ref_frame] = mv_refs_rt(cm, xd, tile, xd->mi[0].src_mi,
+        const_motion[ref_frame] = mv_refs_rt(cm, xd, tile_info,
+                                             xd->mi[0].src_mi,
                                              ref_frame, candidates,
                                              mi_row, mi_col);
 
@@ -606,27 +606,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       int rate_mv = 0;
       int mode_rd_thresh;
 
-      if (const_motion[ref_frame] &&
-          (this_mode == NEARMV || this_mode == ZEROMV))
+      if (const_motion[ref_frame] && this_mode == NEARMV)
         continue;
 
       if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
         continue;
 
       mode_rd_thresh =
-          rd_threshes[mode_idx[ref_frame -
-                               LAST_FRAME][INTER_OFFSET(this_mode)]];
-      if (rd_less_than_thresh(best_rd, mode_rd_thresh,
+          rd_threshes[mode_idx[ref_frame][INTER_OFFSET(this_mode)]];
+      if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
                               rd_thresh_freq_fact[this_mode]))
         continue;
 
       if (this_mode == NEWMV) {
+        if (ref_frame > LAST_FRAME)
+          continue;
         if (cpi->sf.partition_search_type != VAR_BASED_PARTITION &&
-            this_rd < (int64_t)(1 << num_pels_log2_lookup[bsize]))
+            this_rdc.rdcost < (int64_t)(1 << num_pels_log2_lookup[bsize]))
           continue;
         if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
                                     &frame_mv[NEWMV][ref_frame],
-                                    &rate_mv, best_rd))
+                                    &rate_mv, best_rdc.rdcost))
           continue;
       }
 
@@ -642,7 +642,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       // motion vector is at sub-pixel accuracy level for luma component, i.e.,
       // the last three bits are all zeros.
       if (cpi->sf.reuse_inter_pred_sby) {
-        if (this_mode == NEARESTMV) {
+        if (!this_mode_pred) {
           this_mode_pred = &tmp[3];
         } else {
           this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
@@ -699,30 +699,34 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
         mbmi->interp_filter = best_filter;
         mbmi->tx_size = pf_tx_size[mbmi->interp_filter];
-        rate = pf_rate[mbmi->interp_filter];
-        dist = pf_dist[mbmi->interp_filter];
+        this_rdc.rate = pf_rate[mbmi->interp_filter];
+        this_rdc.dist = pf_dist[mbmi->interp_filter];
         var_y = pf_var[mbmi->interp_filter];
         sse_y = pf_sse[mbmi->interp_filter];
         x->skip_txfm[0] = skip_txfm;
       } else {
         mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP: filter_ref;
         vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
-        model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
+        model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+                          &var_y, &sse_y);
       }
 
-      rate += rate_mv;
-      rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
+      this_rdc.rate += rate_mv;
+      this_rdc.rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
                                   [INTER_OFFSET(this_mode)];
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+      this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                               this_rdc.rate, this_rdc.dist);
 
       // Skipping checking: test to see if this block can be reconstructed by
       // prediction only.
       if (cpi->allow_encode_breakout) {
         encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame,
-                             this_mode, var_y, sse_y, yv12_mb, &rate, &dist);
+                             this_mode, var_y, sse_y, yv12_mb,
+                             &this_rdc.rate, &this_rdc.dist);
         if (x->skip) {
-          rate += rate_mv;
-          this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+          this_rdc.rate += rate_mv;
+          this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                   this_rdc.rate, this_rdc.dist);
         }
       }
 
@@ -734,10 +738,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       (void)ctx;
 #endif
 
-      if (this_rd < best_rd || x->skip) {
-        best_rd = this_rd;
-        *returnrate = rate;
-        *returndistortion = dist;
+      if (this_rdc.rdcost < best_rdc.rdcost || x->skip) {
+        best_rdc = this_rdc;
         best_mode = this_mode;
         best_pred_filter = mbmi->interp_filter;
         best_tx_size = mbmi->tx_size;
@@ -757,10 +759,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       if (x->skip)
         break;
     }
-    // If the current reference frame is valid and we found a usable mode,
-    // we are done.
-    if (best_rd < INT64_MAX)
-      break;
+
+    // Check that a prediction mode has been selected.
+    assert(best_rdc.rdcost < INT64_MAX);
   }
 
   // If best prediction is not in dst buf, then copy the prediction block from
@@ -792,7 +793,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   // Perform intra prediction search, if the best SAD is above a certain
   // threshold.
-  if (!x->skip && best_rd > inter_mode_thresh &&
+  if (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
       bsize <= cpi->sf.max_intra_bsize) {
     PREDICTION_MODE this_mode;
     struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
@@ -814,16 +815,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
                                              estimate_block_intra, &args);
       mbmi->tx_size = saved_tx_size;
-      rate = args.rate;
-      dist = args.dist;
-      rate += cpi->mbmode_cost[this_mode];
-      rate += intra_cost_penalty;
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
-
-      if (this_rd + intra_mode_cost < best_rd) {
-        best_rd = this_rd;
-        *returnrate = rate;
-        *returndistortion = dist;
+      this_rdc.rate = args.rate;
+      this_rdc.dist = args.dist;
+      this_rdc.rate += cpi->mbmode_cost[this_mode];
+      this_rdc.rate += intra_cost_penalty;
+      this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                               this_rdc.rate, this_rdc.dist);
+
+      if (this_rdc.rdcost + intra_mode_cost < best_rdc.rdcost) {
+        best_rdc = this_rdc;
         mbmi->mode = this_mode;
         mbmi->tx_size = intra_tx_size;
         mbmi->ref_frame[0] = INTRA_FRAME;
@@ -836,4 +836,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     if (cpi->sf.reuse_inter_pred_sby)
       pd->dst = orig_dst;
   }
+
+  if (is_inter_block(mbmi))
+    vp9_update_rd_thresh_fact(cpi, tile_data, bsize,
+                              mode_idx[ref_frame][INTER_OFFSET(mbmi->mode)]);
+  else
+    vp9_update_rd_thresh_fact(cpi, tile_data, bsize,
+                              mode_idx[ref_frame][mbmi->mode]);
+
+  *rd_cost = best_rdc;
 }
diff --git a/vp9/encoder/vp9_pickmode.h b/vp9/encoder/vp9_pickmode.h
index 97aeca76a..23d347d94 100644
--- a/vp9/encoder/vp9_pickmode.h
+++ b/vp9/encoder/vp9_pickmode.h
@@ -18,10 +18,8 @@ extern "C" {
 #endif
 
 void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                         const struct TileInfo *const tile,
-                         int mi_row, int mi_col,
-                         int *returnrate,
-                         int64_t *returndistortion,
+                         TileDataEnc *tile_data,
+                         int mi_row, int mi_col, RD_COST *rd_cost,
                          BLOCK_SIZE bsize,
                          PICK_MODE_CONTEXT *ctx);
 
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 65bca669a..8a5b6114c 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -196,6 +196,7 @@ static int estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
 
 int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
   const RATE_CONTROL *rc = &cpi->rc;
+  const VP9EncoderConfig *oxcf = &cpi->oxcf;
   const int min_frame_target = MAX(rc->min_frame_bandwidth,
                                    rc->avg_frame_bandwidth >> 5);
   if (target < min_frame_target)
@@ -210,6 +211,11 @@ int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
   // Clip the frame target to the maximum allowed value.
   if (target > rc->max_frame_bandwidth)
     target = rc->max_frame_bandwidth;
+  if (oxcf->rc_max_inter_bitrate_pct) {
+    const int max_rate = rc->avg_frame_bandwidth *
+                         oxcf->rc_max_inter_bitrate_pct / 100;
+    target = MIN(target, max_rate);
+  }
   return target;
 }
 
@@ -971,7 +977,13 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
       if (!cpi->refresh_alt_ref_frame) {
         active_best_quality = cq_level;
       } else {
-        active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+       const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+       active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+
+        // Modify best quality for second level arfs. For mode VPX_Q this
+        // becomes the baseline frame q.
+        if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
+          active_best_quality = (active_best_quality + cq_level + 1) / 2;
       }
     } else {
       active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
@@ -1327,7 +1339,18 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
   const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
   const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
   int min_frame_target = MAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
-  int target = rc->avg_frame_bandwidth;
+  int target;
+
+  if (oxcf->gf_cbr_boost_pct) {
+    const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
+    target =  cpi->refresh_golden_frame ?
+      (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio_pct) /
+      (rc->baseline_gf_interval * 100 + af_ratio_pct - 100) :
+      (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
+      (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+  } else {
+    target = rc->avg_frame_bandwidth;
+  }
   if (svc->number_temporal_layers > 1 &&
       oxcf->rc_mode == VPX_CBR) {
     // Note that for layers, avg_frame_bandwidth is the cumulative
@@ -1347,6 +1370,11 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
     const int pct_high = (int)MIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
     target += (target * pct_high) / 200;
   }
+  if (oxcf->rc_max_inter_bitrate_pct) {
+    const int max_rate = rc->avg_frame_bandwidth *
+                         oxcf->rc_max_inter_bitrate_pct / 100;
+    target = MIN(target, max_rate);
+  }
   return MAX(min_frame_target, target);
 }
 
@@ -1436,15 +1464,25 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) {
     rc->frames_to_key = cpi->oxcf.key_freq;
     rc->kf_boost = DEFAULT_KF_BOOST;
     rc->source_alt_ref_active = 0;
-    target = calc_iframe_target_size_one_pass_cbr(cpi);
   } else {
     cm->frame_type = INTER_FRAME;
-    target = calc_pframe_target_size_one_pass_cbr(cpi);
   }
+  if (rc->frames_till_gf_update_due == 0) {
+    rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+    // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+    if (rc->frames_till_gf_update_due > rc->frames_to_key)
+      rc->frames_till_gf_update_due = rc->frames_to_key;
+    cpi->refresh_golden_frame = 1;
+    rc->gfu_boost = DEFAULT_GF_BOOST;
+  }
+
+  if (cm->frame_type == KEY_FRAME)
+    target = calc_iframe_target_size_one_pass_cbr(cpi);
+  else
+    target = calc_pframe_target_size_one_pass_cbr(cpi);
+
   vp9_rc_set_frame_target(cpi, target);
-  // Don't use gf_update by default in CBR mode.
-  rc->frames_till_gf_update_due = INT_MAX;
-  rc->baseline_gf_interval = INT_MAX;
 }
 
 int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 7f526fc42..13e317d6d 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -65,7 +65,7 @@ static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = {
 };
 
 static void fill_mode_costs(VP9_COMP *cpi) {
-  const FRAME_CONTEXT *const fc = &cpi->common.fc;
+  const FRAME_CONTEXT *const fc = cpi->common.fc;
   int i, j;
 
   for (i = 0; i < INTRA_MODES; ++i)
@@ -280,7 +280,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
   set_block_thresholds(cm, rd);
 
   if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME) {
-    fill_token_costs(x->token_costs, cm->fc.coef_probs);
+    fill_token_costs(x->token_costs, cm->fc->coef_probs);
 
     for (i = 0; i < PARTITION_CONTEXTS; ++i)
       vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(cm, i),
@@ -295,11 +295,11 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
       vp9_build_nmv_cost_table(x->nmvjointcost,
                                cm->allow_high_precision_mv ? x->nmvcost_hp
                                                            : x->nmvcost,
-                               &cm->fc.nmvc, cm->allow_high_precision_mv);
+                               &cm->fc->nmvc, cm->allow_high_precision_mv);
 
       for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
         vp9_cost_tokens((int *)cpi->inter_mode_cost[i],
-                        cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
+                        cm->fc->inter_mode_probs[i], vp9_inter_mode_tree);
     }
   }
 }
@@ -594,21 +594,38 @@ void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
   const SPEED_FEATURES *const sf = &cpi->sf;
   RD_OPT *const rd = &cpi->rd;
   int i;
+  static const int thresh_mult[2][MAX_REFS] =
+      {{2500, 2500, 2500, 4500, 4500, 2500},
+       {2000, 2000, 2000, 4000, 4000, 2000}};
+
+  for (i = 0; i < MAX_REFS; ++i) {
+    rd->thresh_mult_sub8x8[i] =
+        (sf->disable_split_mask & (1 << i)) ?
+            INT_MAX : thresh_mult[cpi->oxcf.mode == BEST][i];
+  }
+}
 
-  for (i = 0; i < MAX_REFS; ++i)
-    rd->thresh_mult_sub8x8[i] = cpi->oxcf.mode == BEST ? -500 : 0;
-
-  rd->thresh_mult_sub8x8[THR_LAST] += 2500;
-  rd->thresh_mult_sub8x8[THR_GOLD] += 2500;
-  rd->thresh_mult_sub8x8[THR_ALTR] += 2500;
-  rd->thresh_mult_sub8x8[THR_INTRA] += 2500;
-  rd->thresh_mult_sub8x8[THR_COMP_LA] += 4500;
-  rd->thresh_mult_sub8x8[THR_COMP_GA] += 4500;
-
-  // Check for masked out split cases.
-  for (i = 0; i < MAX_REFS; ++i)
-    if (sf->disable_split_mask & (1 << i))
-      rd->thresh_mult_sub8x8[i] = INT_MAX;
+// When sf.adaptive_rd_thresh > 0, adapts thresh_freq_fact for block sizes
+// near bsize: the best mode decays by 1/16, all others grow up to a cap.
+// TODO(jingning) Refactor this function. Use targeted smaller struct as inputs.
+void vp9_update_rd_thresh_fact(VP9_COMP *cpi, TileDataEnc *tile_data,
+                               int bsize, int best_mode_index) {
+  const int fact_cap = cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT;
+  const int num_modes = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
+  const BLOCK_SIZE first_bs = MAX(bsize - 1, BLOCK_4X4);
+  const BLOCK_SIZE last_bs = MIN(bsize + 2, BLOCK_64X64);
+  BLOCK_SIZE bs;
+  int mode;
+
+  if (cpi->sf.adaptive_rd_thresh <= 0) return;
+
+  for (mode = 0; mode < num_modes; ++mode) {
+    for (bs = first_bs; bs <= last_bs; ++bs) {
+      int *const fact = &tile_data->thresh_freq_fact[bs][mode];
+      *fact = (mode == best_mode_index) ? *fact - (*fact >> 4)
+                                        : MIN(*fact + RD_THRESH_INC, fact_cap);
+    }
+  }
 }
 
 int vp9_get_intra_cost_penalty(int qindex, int qdelta,
diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h
index 1aa52663a..aecca0b43 100644
--- a/vp9/encoder/vp9_rd.h
+++ b/vp9/encoder/vp9_rd.h
@@ -36,6 +36,9 @@ extern "C" {
 #define MAX_MODES 30
 #define MAX_REFS  6
 
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC      1
+
 // This enumerator type needs to be kept aligned with the mode order in
 // const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
 typedef enum {
@@ -129,6 +132,7 @@ void vp9_rd_cost_reset(RD_COST *rd_cost);
 void vp9_rd_cost_init(RD_COST *rd_cost);
 
 struct TileInfo;
+struct TileDataEnc;
 struct VP9_COMP;
 struct macroblock;
 
@@ -158,6 +162,10 @@ void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi);
 
 void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi);
 
+void vp9_update_rd_thresh_fact(struct VP9_COMP *cpi,
+                               struct TileDataEnc *tile_data,
+                               int bsize, int best_mode_index);
+
 static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
                                       int thresh_fact) {
     return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index eca8e5880..e80f345e8 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -38,9 +38,6 @@
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_variance.h"
 
-#define RD_THRESH_MAX_FACT 64
-#define RD_THRESH_INC      1
-
 #define LAST_FRAME_MODE_MASK    ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
                                  (1 << INTRA_FRAME))
 #define GOLDEN_FRAME_MODE_MASK  ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | \
@@ -478,7 +475,8 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
     return;
 
   if (!is_inter_block(mbmi)) {
-    vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
+    struct encode_b_args arg = {x, NULL, &mbmi->skip};
+    vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       dist_block(plane, block, tx_size, args, xd->bd);
@@ -639,7 +637,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t best_rd = INT64_MAX;
   TX_SIZE best_tx = max_tx_size;
 
-  const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
+  const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
   assert(skip_prob > 0);
   s0 = vp9_cost_bit(skip_prob, 0);
   s1 = vp9_cost_bit(skip_prob, 1);
@@ -2765,35 +2763,15 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
 }
 
-static void update_rd_thresh_fact(VP9_COMP *cpi, int bsize,
-                                  int best_mode_index) {
-  if (cpi->sf.adaptive_rd_thresh > 0) {
-    const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
-    int mode;
-    for (mode = 0; mode < top_mode; ++mode) {
-      const BLOCK_SIZE min_size = MAX(bsize - 1, BLOCK_4X4);
-      const BLOCK_SIZE max_size = MIN(bsize + 2, BLOCK_64X64);
-      BLOCK_SIZE bs;
-      for (bs = min_size; bs <= max_size; ++bs) {
-        int *const fact = &cpi->rd.thresh_freq_fact[bs][mode];
-        if (mode == best_mode_index) {
-          *fact -= (*fact >> 4);
-        } else {
-          *fact = MIN(*fact + RD_THRESH_INC,
-                      cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
-        }
-      }
-    }
-  }
-}
-
-void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                               const TileInfo *const tile,
+void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
+                               TileDataEnc *tile_data,
+                               MACROBLOCK *x,
                                int mi_row, int mi_col,
                                RD_COST *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx,
                                int64_t best_rd_so_far) {
   VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
   RD_OPT *const rd_opt = &cpi->rd;
   SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -2836,9 +2814,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
   int mode_skip_start = sf->mode_skip_start + 1;
   const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
-  const int *const rd_thresh_freq_fact = rd_opt->thresh_freq_fact[bsize];
+  const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
   int64_t mode_threshold[MAX_MODES];
-  int *mode_map = rd_opt->mode_map[bsize];
+  int *mode_map = tile_data->mode_map[bsize];
   const int mode_search_skip_flags = sf->mode_search_skip_flags;
   vp9_zero(best_mbmode);
 
@@ -2869,7 +2847,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      setup_buffer_inter(cpi, x, tile, ref_frame, bsize, mi_row, mi_col,
+      setup_buffer_inter(cpi, x, tile_info, ref_frame, bsize, mi_row, mi_col,
                          frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
@@ -3023,9 +3001,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
     if (sf->motion_field_mode_search) {
       const int mi_width  = MIN(num_8x8_blocks_wide_lookup[bsize],
-                                tile->mi_col_end - mi_col);
+                                tile_info->mi_col_end - mi_col);
       const int mi_height = MIN(num_8x8_blocks_high_lookup[bsize],
-                                tile->mi_row_end - mi_row);
+                                tile_info->mi_row_end - mi_row);
       const int bsl = mi_width_log2_lookup[bsize];
       int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl)
           + get_chessboard_index(cm->current_video_frame)) & 0x1;
@@ -3036,7 +3014,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       int_mv ref_mv;
       ref_mv.as_int = INVALID_MV;
 
-      if ((mi_row - 1) >= tile->mi_row_start) {
+      if ((mi_row - 1) >= tile_info->mi_row_start) {
         ref_mv = xd->mi[-xd->mi_stride].src_mi->mbmi.mv[0];
         rf = xd->mi[-xd->mi_stride].src_mi->mbmi.ref_frame[0];
         for (i = 0; i < mi_width; ++i) {
@@ -3047,7 +3025,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         }
       }
 
-      if ((mi_col - 1) >= tile->mi_col_start) {
+      if ((mi_col - 1) >= tile_info->mi_col_start) {
         if (ref_mv.as_int == INVALID_MV)
           ref_mv = xd->mi[-1].src_mi->mbmi.mv[0];
         if (rf == NONE)
@@ -3420,7 +3398,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
          !is_inter_block(&best_mbmode));
 
   if (!cpi->rc.is_src_frame_alt_ref)
-    update_rd_thresh_fact(cpi, bsize, best_mode_index);
+    vp9_update_rd_thresh_fact(cpi, tile_data, bsize, best_mode_index);
 
   // macroblock modes
   *mbmi = best_mbmode;
@@ -3479,7 +3457,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                        best_tx_diff, best_filter_diff, best_mode_skippable);
 }
 
-void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
+void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi,
+                                        TileDataEnc *tile_data,
+                                        MACROBLOCK *x,
                                         RD_COST *rd_cost,
                                         BLOCK_SIZE bsize,
                                         PICK_MODE_CONTEXT *ctx,
@@ -3573,7 +3553,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
   assert((cm->interp_filter == SWITCHABLE) ||
          (cm->interp_filter == mbmi->interp_filter));
 
-  update_rd_thresh_fact(cpi, bsize, THR_ZEROMV);
+  vp9_update_rd_thresh_fact(cpi, tile_data, bsize, THR_ZEROMV);
 
   vp9_zero(best_pred_diff);
   vp9_zero(best_filter_diff);
@@ -3585,14 +3565,16 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
                        best_pred_diff, best_tx_diff, best_filter_diff, 0);
 }
 
-void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
-                                   const TileInfo *const tile,
+void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
+                                   TileDataEnc *tile_data,
+                                   MACROBLOCK *x,
                                    int mi_row, int mi_col,
                                    RD_COST *rd_cost,
                                    BLOCK_SIZE bsize,
                                    PICK_MODE_CONTEXT *ctx,
                                    int64_t best_rd_so_far) {
   VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
   RD_OPT *const rd_opt = &cpi->rd;
   SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -3651,10 +3633,10 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      setup_buffer_inter(cpi, x, tile,
-                             ref_frame, bsize, mi_row, mi_col,
-                             frame_mv[NEARESTMV], frame_mv[NEARMV],
-                             yv12_mb);
+      setup_buffer_inter(cpi, x, tile_info,
+                         ref_frame, bsize, mi_row, mi_col,
+                         frame_mv[NEARESTMV], frame_mv[NEARMV],
+                         yv12_mb);
     } else {
       ref_frame_skip_mask[0] |= (1 << ref_frame);
       ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
@@ -3712,7 +3694,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     // Test best rd so far against threshold for trying this mode.
     if (rd_less_than_thresh(best_rd,
                             rd_opt->threshes[segment_id][bsize][ref_index],
-                            rd_opt->thresh_freq_fact[bsize][ref_index]))
+                            tile_data->thresh_freq_fact[bsize][ref_index]))
       continue;
 
     comp_pred = second_ref_frame > INTRA_FRAME;
@@ -3845,7 +3827,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
             int newbest, rs;
             int64_t rs_rd;
             mbmi->interp_filter = switchable_filter_index;
-            tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile,
+            tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile_info,
                                               &mbmi->ref_mvs[ref_frame][0],
                                               second_ref, best_yrd, &rate,
                                               &rate_y, &distortion,
@@ -3911,7 +3893,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
       if (!pred_exists) {
         // Handles the special case when a filter that is not in the
         // switchable list (bilinear, 6-tap) is indicated at the frame level
-        tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile,
+        tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile_info,
                                           &mbmi->ref_mvs[ref_frame][0],
                                           second_ref, best_yrd, &rate, &rate_y,
                                           &distortion, &skippable, &total_sse,
@@ -4146,7 +4128,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
          (cm->interp_filter == best_mbmode.interp_filter) ||
          !is_inter_block(&best_mbmode));
 
-  update_rd_thresh_fact(cpi, bsize, best_ref_index);
+  vp9_update_rd_thresh_fact(cpi, tile_data, bsize, best_ref_index);
 
   // macroblock modes
   *mbmi = best_mbmode;
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index ed38ce81a..7bbc3c89a 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -29,14 +29,16 @@ void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
                                struct RD_COST *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd);
 
-void vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
-                               const struct TileInfo *const tile,
+void vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi,
+                               struct TileDataEnc *tile_data,
+                               struct macroblock *x,
                                int mi_row, int mi_col,
                                struct RD_COST *rd_cost,
                                BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
                                int64_t best_rd_so_far);
 
 void vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi,
+                                        struct TileDataEnc *tile_data,
                                         struct macroblock *x,
                                         struct RD_COST *rd_cost,
                                         BLOCK_SIZE bsize,
@@ -44,8 +46,8 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi,
                                         int64_t best_rd_so_far);
 
 void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
+                                   struct TileDataEnc *tile_data,
                                    struct macroblock *x,
-                                   const struct TileInfo *const tile,
                                    int mi_row, int mi_col,
                                    struct RD_COST *rd_cost,
                                    BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 9e3ee2c94..3315aa6a1 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -142,8 +142,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
 
   if (speed >= 5) {
     int i;
-
-    sf->partition_search_type = FIXED_PARTITION;
     sf->optimize_coefficients = 0;
     sf->mv.search_method = HEX;
     sf->disable_filter_search_var_thresh = 500;
@@ -151,8 +149,7 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
       sf->intra_y_mode_mask[i] = INTRA_DC;
       sf->intra_uv_mode_mask[i] = INTRA_DC;
     }
-  }
-  if (speed >= 6) {
+    sf->partition_search_breakout_rate_thr = 500;
     sf->mv.reduce_first_step_size = 1;
   }
 }
@@ -205,7 +202,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
     sf->disable_filter_search_var_thresh = 50;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
     sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
-    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
     sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
     sf->adjust_partitioning_from_last_frame = 1;
     sf->last_partitioning_redo_frequency = 3;
@@ -217,8 +213,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
   if (speed >= 3) {
     sf->use_square_partition_only = 1;
     sf->disable_filter_search_var_thresh = 100;
-    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
-    sf->constrain_copy_partition = 1;
     sf->use_uv_intra_rd_estimate = 1;
     sf->skip_encode_sb = 1;
     sf->mv.subpel_iters_per_step = 1;
@@ -263,8 +257,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
                                                   : STRICT_NEIGHBORING_MIN_MAX;
     sf->max_partition_size = BLOCK_32X32;
     sf->min_partition_size = BLOCK_8X8;
-    sf->partition_check =
-        (frames_since_key % sf->last_partitioning_redo_frequency == 1);
     sf->force_frame_boost = is_keyframe ||
         (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
     sf->max_delta_qindex = is_keyframe ? 20 : 15;
@@ -275,6 +267,9 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
     sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO;
     sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO;
     sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO;
+    sf->adaptive_rd_thresh = 2;
+    // This feature is only enabled when partition search is disabled.
+    sf->reuse_inter_pred_sby = 1;
 
     if (MIN(cm->width, cm->height) >= 720)
       sf->partition_search_breakout_dist_thr = (1 << 25);
@@ -295,15 +290,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
     sf->partition_search_type = VAR_BASED_PARTITION;
     sf->search_type_check_frequency = 50;
     sf->mv.search_method = NSTEP;
-
     sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
-
-    // This feature is only enabled when partition search is disabled.
-    sf->reuse_inter_pred_sby = 1;
-
-    // Increase mode checking threshold for NEWMV.
-    sf->elevate_newmv_thresh = 1000;
-
     sf->mv.reduce_first_step_size = 1;
   }
 
@@ -348,7 +335,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->mv.fullpel_search_step_param = 6;
   sf->comp_inter_joint_search_thresh = BLOCK_4X4;
   sf->adaptive_rd_thresh = 0;
-  sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF;
   sf->tx_size_search_method = USE_FULL_RD;
   sf->use_lp32x32fdct = 0;
   sf->adaptive_motion_search = 0;
@@ -368,7 +354,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->min_partition_size = BLOCK_4X4;
   sf->adjust_partitioning_from_last_frame = 0;
   sf->last_partitioning_redo_frequency = 4;
-  sf->constrain_copy_partition = 0;
   sf->disable_split_mask = 0;
   sf->mode_search_skip_flags = 0;
   sf->force_frame_boost = 0;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 951b4af22..a314f6040 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -93,12 +93,6 @@ typedef enum {
 } MOTION_THRESHOLD;
 
 typedef enum {
-  LAST_FRAME_PARTITION_OFF = 0,
-  LAST_FRAME_PARTITION_LOW_MOTION = 1,
-  LAST_FRAME_PARTITION_ALL = 2
-} LAST_FRAME_PARTITION_METHOD;
-
-typedef enum {
   USE_FULL_RD = 0,
   USE_LARGESTALL,
   USE_TX_8X8
@@ -242,15 +236,6 @@ typedef struct SPEED_FEATURES {
   // level within a frame.
   int allow_skip_recode;
 
-  // This variable allows us to reuse the last frames partition choices
-  // (64x64 v 32x32 etc) for this frame. It can be set to only use the last
-  // frame as a starting point in low motion scenes or always use it. If set
-  // we use last partitioning_redo frequency to determine how often to redo
-  // the partitioning from scratch. Adjust_partitioning_from_last_frame
-  // enables us to adjust up or down one partitioning from the last frames
-  // partitioning.
-  LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning;
-
   // The threshold is to determine how slow the motino is, it is used when
   // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION
   MOTION_THRESHOLD lf_motion_threshold;
@@ -264,8 +249,6 @@ typedef struct SPEED_FEATURES {
   // precise but significantly faster than the non lp version.
   int use_lp32x32fdct;
 
-  // TODO(JBB): remove this as its no longer used.
-
   // After looking at the first set of modes (set by index here), skip
   // checking modes for reference frames that don't match the reference frame
   // of the best so far.
@@ -303,12 +286,6 @@ typedef struct SPEED_FEATURES {
   // use_lastframe_partitioning is set.
   int last_partitioning_redo_frequency;
 
-  // This enables constrained copy partitioning, which, given an input block
-  // size bsize, will copy previous partition for partitions less than bsize,
-  // otherwise bsize partition is used. bsize is currently set to 16x16.
-  // Used for the case where motion is detected in superblock.
-  int constrain_copy_partition;
-
   // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
   // it always, to allow it for only Last frame and Intra, disable it for all
   // inter modes or to enable it always.
@@ -342,10 +319,6 @@ typedef struct SPEED_FEATURES {
   // Fast quantization process path
   int use_quant_fp;
 
-  // Search through variable block partition types in non-RD mode decision
-  // encoding process for RTC.
-  int partition_check;
-
   // Use finer quantizer in every other few frames that run variable block
   // partition type search.
   int force_frame_boost;
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index adf01bf35..0166a50a0 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -313,7 +313,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
   unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
       cpi->coef_counts[tx_size][type][ref];
   vp9_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
-      cpi->common.fc.coef_probs[tx_size][type][ref];
+      cpi->common.fc->coef_probs[tx_size][type][ref];
   unsigned int (*const eob_branch)[COEFF_CONTEXTS] =
       cpi->common.counts.eob_branch[tx_size][type][ref];
   const uint8_t *const band = get_band_translate(tx_size);
diff --git a/vp9/encoder/x86/vp9_denoiser_sse2.c b/vp9/encoder/x86/vp9_denoiser_sse2.c
index bf5fa889f..4ddee7b74 100644
--- a/vp9/encoder/x86/vp9_denoiser_sse2.c
+++ b/vp9/encoder/x86/vp9_denoiser_sse2.c
@@ -23,18 +23,17 @@
 // Compute the sum of all pixel differences of this MB.
 static INLINE int sum_diff_16x1(__m128i acc_diff) {
   const __m128i k_1 = _mm_set1_epi16(1);
-  const __m128i acc_diff_lo = _mm_srai_epi16(
-        _mm_unpacklo_epi8(acc_diff, acc_diff), 8);
-  const __m128i acc_diff_hi = _mm_srai_epi16(
-        _mm_unpackhi_epi8(acc_diff, acc_diff), 8);
+  const __m128i acc_diff_lo =
+      _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
+  const __m128i acc_diff_hi =
+      _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
   const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
   const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
-  const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,
-                                          _mm_srli_si128(hg_fe_dc_ba, 8));
-  const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
-                                         _mm_srli_si128(hgfe_dcba, 4));
-  int sum_diff = _mm_cvtsi128_si32(hgfedcba);
-  return sum_diff;
+  const __m128i hgfe_dcba =
+      _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
+  const __m128i hgfedcba =
+      _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
+  return _mm_cvtsi128_si32(hgfedcba);
 }
 
 // Denoise a 16x1 vector.
@@ -51,8 +50,8 @@ static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig,
                                              __m128i acc_diff) {
   // Calculate differences
   const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
-  const __m128i v_mc_running_avg_y = _mm_loadu_si128(
-                                     (const __m128i *)(&mc_running_avg_y[0]));
+  const __m128i v_mc_running_avg_y =
+      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
   __m128i v_running_avg_y;
   const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
   const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
@@ -60,8 +59,8 @@ static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig,
   const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
   // Clamp absolute difference to 16 to be used to get mask. Doing this
   // allows us to use _mm_cmpgt_epi8, which operates on signed byte.
-  const __m128i clamped_absdiff = _mm_min_epu8(
-                                  _mm_or_si128(pdiff, ndiff), *k_16);
+  const __m128i clamped_absdiff =
+      _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
   // Get masks for l2 l1 and l0 adjustments.
   const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
   const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
@@ -95,24 +94,22 @@ static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig,
 }
 
 // Denoise a 16x1 vector with a weaker filter.
-static INLINE __m128i vp9_denoiser_adj_16x1_sse2(const uint8_t *sig,
-                                              const uint8_t *mc_running_avg_y,
-                                              uint8_t *running_avg_y,
-                                              const __m128i k_0,
-                                              const __m128i k_delta,
-                                              __m128i acc_diff) {
+static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
+    const uint8_t *sig, const uint8_t *mc_running_avg_y,
+    uint8_t *running_avg_y, const __m128i k_0,
+    const __m128i k_delta, __m128i acc_diff) {
   __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
   // Calculate differences.
   const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
   const __m128i v_mc_running_avg_y =
-                _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
   const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
   const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
   // Obtain the sign. FF if diff is negative.
   const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
   // Clamp absolute difference to delta to get the adjustment.
   const __m128i adj =
-                _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+      _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
   // Restore the sign and get positive and negative adjustments.
   __m128i padj, nadj;
   padj = _mm_andnot_si128(diff_sign, adj);
@@ -128,19 +125,16 @@ static INLINE __m128i vp9_denoiser_adj_16x1_sse2(const uint8_t *sig,
   return acc_diff;
 }
 
-static int vp9_denoiser_4xM_sse2(const uint8_t *sig, int sig_stride,
-                                 const uint8_t *mc_running_avg_y,
-                                 int mc_avg_y_stride,
-                                 uint8_t *running_avg_y, int avg_y_stride,
-                                 int increase_denoising,
-                                 BLOCK_SIZE bs,
-                                 int motion_magnitude) {
-  int sum_diff_thresh;
-  int r;
-  int shift_inc  = (increase_denoising &&
-                   motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
-  unsigned char sig_buffer[2][16], mc_running_buffer[2][16],
-                running_buffer[2][16];
+// Denoiser for 4xM and 8xM blocks; |width| is the block width (4 or 8).
+static int vp9_denoiser_NxM_sse2_small(
+    const uint8_t *sig, int sig_stride, const uint8_t *mc_running_avg_y,
+    int mc_avg_y_stride, uint8_t *running_avg_y, int avg_y_stride,
+    int increase_denoising, BLOCK_SIZE bs, int motion_magnitude, int width) {
+  int sum_diff_thresh, r, sum_diff = 0;
+  const int shift_inc  = (increase_denoising &&
+                          motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
+                         1 : 0;
+  uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
   __m128i acc_diff = _mm_setzero_si128();
   const __m128i k_0 = _mm_setzero_si128();
   const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
@@ -148,145 +142,51 @@ static int vp9_denoiser_4xM_sse2(const uint8_t *sig, int sig_stride,
   const __m128i k_16 = _mm_set1_epi8(16);
   // Modify each level's adjustment according to motion_magnitude.
   const __m128i l3 = _mm_set1_epi8(
-                     (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
-                     7 + shift_inc : 6);
+      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
   // Difference between level 3 and level 2 is 2.
   const __m128i l32 = _mm_set1_epi8(2);
   // Difference between level 2 and level 1 is 1.
   const __m128i l21 = _mm_set1_epi8(1);
-  int sum_diff = 0;
-
-  for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 2); ++r) {
-    vpx_memcpy(sig_buffer[r], sig, 4);
-    vpx_memcpy(sig_buffer[r] + 4, sig + sig_stride, 4);
-    vpx_memcpy(sig_buffer[r] + 8, sig + sig_stride * 2, 4);
-    vpx_memcpy(sig_buffer[r] + 12, sig + sig_stride * 3, 4);
-    vpx_memcpy(mc_running_buffer[r], mc_running_avg_y, 4);
-    vpx_memcpy(mc_running_buffer[r] + 4, mc_running_avg_y +
-               mc_avg_y_stride, 4);
-    vpx_memcpy(mc_running_buffer[r] + 8, mc_running_avg_y +
-               mc_avg_y_stride * 2, 4);
-    vpx_memcpy(mc_running_buffer[r] + 12, mc_running_avg_y +
-               mc_avg_y_stride * 3, 4);
-    vpx_memcpy(running_buffer[r], running_avg_y, 4);
-    vpx_memcpy(running_buffer[r] + 4, running_avg_y +
-               avg_y_stride, 4);
-    vpx_memcpy(running_buffer[r] + 8, running_avg_y +
-               avg_y_stride * 2, 4);
-    vpx_memcpy(running_buffer[r] + 12, running_avg_y +
-               avg_y_stride * 3, 4);
-    acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r],
-                                      mc_running_buffer[r],
-                                      running_buffer[r],
-                                      &k_0, &k_4, &k_8, &k_16,
-                                      &l3, &l32, &l21, acc_diff);
-    vpx_memcpy(running_avg_y, running_buffer[r], 4);
-    vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 4, 4);
-    vpx_memcpy(running_avg_y + avg_y_stride * 2,
-               running_buffer[r] + 8, 4);
-    vpx_memcpy(running_avg_y + avg_y_stride * 3,
-               running_buffer[r] + 12, 4);
-    // Update pointers for next iteration.
-    sig += (sig_stride << 2);
-    mc_running_avg_y += (mc_avg_y_stride << 2);
-    running_avg_y += (avg_y_stride << 2);
-  }
-
-  {
-    sum_diff = sum_diff_16x1(acc_diff);
-    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
-    if (abs(sum_diff) > sum_diff_thresh) {
-      // Before returning to copy the block (i.e., apply no denoising),
-      // checK if we can still apply some (weaker) temporal filtering to
-      // this block, that would otherwise not be denoised at all. Simplest
-      // is to apply an additional adjustment to running_avg_y to bring it
-      // closer to sig. The adjustment is capped by a maximum delta, and
-      // chosen such that in most cases the resulting sum_diff will be
-      // within the accceptable range given by sum_diff_thresh.
+  const uint8_t shift = (width == 4) ? 2 : 1;
 
-      // The delta is set by the excess of absolute pixel diff over the
-      // threshold.
-      int delta = ((abs(sum_diff) - sum_diff_thresh)
-                  >> num_pels_log2_lookup[bs]) + 1;
-      // Only apply the adjustment for max delta up to 3.
-      if (delta < 4) {
-        const __m128i k_delta = _mm_set1_epi8(delta);
-        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
-        sum_diff = 0;
-        for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 2); ++r) {
-          acc_diff = vp9_denoiser_adj_16x1_sse2(
-                             sig_buffer[r], mc_running_buffer[r],
-                             running_buffer[r], k_0, k_delta,
-                             acc_diff);
-          vpx_memcpy(running_avg_y, running_buffer[r], 4);
-          vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 4, 4);
-          vpx_memcpy(running_avg_y + avg_y_stride * 2,
-                     running_buffer[r] + 8, 4);
-          vpx_memcpy(running_avg_y + avg_y_stride * 3,
-                     running_buffer[r] + 12, 4);
-          // Update pointers for next iteration.
-          running_avg_y += (avg_y_stride << 2);
-        }
-        sum_diff = sum_diff_16x1(acc_diff);
-        if (abs(sum_diff) > sum_diff_thresh) {
-          return COPY_BLOCK;
-        }
-      } else {
-        return COPY_BLOCK;
-      }
+  for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) {
+    vpx_memcpy(sig_buffer[r], sig, width);
+    vpx_memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+    vpx_memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+    vpx_memcpy(mc_running_buffer[r] + width,
+               mc_running_avg_y + mc_avg_y_stride, width);
+    vpx_memcpy(running_buffer[r], running_avg_y, width);
+    vpx_memcpy(running_buffer[r] + width,
+               running_avg_y + avg_y_stride, width);
+    if (width == 4) {
+      vpx_memcpy(sig_buffer[r] + width * 2, sig + sig_stride * 2, width);
+      vpx_memcpy(sig_buffer[r] + width * 3, sig + sig_stride * 3, width);
+      vpx_memcpy(mc_running_buffer[r] + width * 2,
+                 mc_running_avg_y + mc_avg_y_stride * 2, width);
+      vpx_memcpy(mc_running_buffer[r] + width * 3,
+                 mc_running_avg_y + mc_avg_y_stride * 3, width);
+      vpx_memcpy(running_buffer[r] + width * 2,
+                 running_avg_y + avg_y_stride * 2, width);
+      vpx_memcpy(running_buffer[r] + width * 3,
+                 running_avg_y + avg_y_stride * 3, width);
     }
-  }
-  return FILTER_BLOCK;
-}
-
-static int vp9_denoiser_8xM_sse2(const uint8_t *sig, int sig_stride,
-                                 const uint8_t *mc_running_avg_y,
-                                 int mc_avg_y_stride,
-                                 uint8_t *running_avg_y, int avg_y_stride,
-                                 int increase_denoising,
-                                 BLOCK_SIZE bs,
-                                 int motion_magnitude) {
-  int sum_diff_thresh;
-  int r;
-  int shift_inc  = (increase_denoising &&
-                  motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
-  unsigned char sig_buffer[8][16], mc_running_buffer[8][16],
-                running_buffer[8][16];
-  __m128i acc_diff = _mm_setzero_si128();
-  const __m128i k_0 = _mm_setzero_si128();
-  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
-  const __m128i k_8 = _mm_set1_epi8(8);
-  const __m128i k_16 = _mm_set1_epi8(16);
-  // Modify each level's adjustment according to motion_magnitude.
-  const __m128i l3 = _mm_set1_epi8(
-                     (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
-                      7 + shift_inc : 6);
-  // Difference between level 3 and level 2 is 2.
-  const __m128i l32 = _mm_set1_epi8(2);
-  // Difference between level 2 and level 1 is 1.
-  const __m128i l21 = _mm_set1_epi8(1);
-  int sum_diff = 0;
-
-  for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) {
-    vpx_memcpy(sig_buffer[r], sig, 8);
-    vpx_memcpy(sig_buffer[r] + 8, sig + sig_stride, 8);
-    vpx_memcpy(mc_running_buffer[r], mc_running_avg_y, 8);
-    vpx_memcpy(mc_running_buffer[r] + 8, mc_running_avg_y +
-               mc_avg_y_stride, 8);
-    vpx_memcpy(running_buffer[r], running_avg_y, 8);
-    vpx_memcpy(running_buffer[r] + 8, running_avg_y +
-               avg_y_stride, 8);
     acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r],
                                       mc_running_buffer[r],
                                       running_buffer[r],
                                       &k_0, &k_4, &k_8, &k_16,
                                       &l3, &l32, &l21, acc_diff);
-    vpx_memcpy(running_avg_y, running_buffer[r], 8);
-    vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 8, 8);
+    vpx_memcpy(running_avg_y, running_buffer[r], width);
+    vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
+    if (width == 4) {
+      vpx_memcpy(running_avg_y + avg_y_stride * 2,
+                 running_buffer[r] + width * 2, width);
+      vpx_memcpy(running_avg_y + avg_y_stride * 3,
+                 running_buffer[r] + width * 3, width);
+    }
     // Update pointers for next iteration.
-    sig += (sig_stride << 1);
-    mc_running_avg_y += (mc_avg_y_stride << 1);
-    running_avg_y += (avg_y_stride << 1);
+    sig += (sig_stride << shift);
+    mc_running_avg_y += (mc_avg_y_stride << shift);
+    running_avg_y += (avg_y_stride << shift);
   }
 
   {
@@ -294,54 +194,61 @@ static int vp9_denoiser_8xM_sse2(const uint8_t *sig, int sig_stride,
     sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
     if (abs(sum_diff) > sum_diff_thresh) {
       // Before returning to copy the block (i.e., apply no denoising),
-      // checK if we can still apply some (weaker) temporal filtering to
+      // check if we can still apply some (weaker) temporal filtering to
       // this block, that would otherwise not be denoised at all. Simplest
       // is to apply an additional adjustment to running_avg_y to bring it
       // closer to sig. The adjustment is capped by a maximum delta, and
       // chosen such that in most cases the resulting sum_diff will be
-      // within the accceptable range given by sum_diff_thresh.
+      // within the acceptable range given by sum_diff_thresh.
 
       // The delta is set by the excess of absolute pixel diff over the
       // threshold.
-      int delta = ((abs(sum_diff) - sum_diff_thresh)
-                  >> num_pels_log2_lookup[bs]) + 1;
+      const int delta = ((abs(sum_diff) - sum_diff_thresh) >>
+                         num_pels_log2_lookup[bs]) + 1;
       // Only apply the adjustment for max delta up to 3.
       if (delta < 4) {
         const __m128i k_delta = _mm_set1_epi8(delta);
         running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
-        for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) {
+        for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) {
           acc_diff = vp9_denoiser_adj_16x1_sse2(
-                           sig_buffer[r], mc_running_buffer[r],
-                           running_buffer[r], k_0, k_delta,
-                           acc_diff);
-          vpx_memcpy(running_avg_y, running_buffer[r], 8);
-          vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 8, 8);
+              sig_buffer[r], mc_running_buffer[r], running_buffer[r],
+              k_0, k_delta, acc_diff);
+          vpx_memcpy(running_avg_y, running_buffer[r], width);
+          vpx_memcpy(running_avg_y + avg_y_stride,
+                     running_buffer[r] + width, width);
+          if (width == 4) {
+            vpx_memcpy(running_avg_y + avg_y_stride * 2,
+                       running_buffer[r] + width * 2, width);
+            vpx_memcpy(running_avg_y + avg_y_stride * 3,
+                       running_buffer[r] + width * 3, width);
+          }
           // Update pointers for next iteration.
-          running_avg_y += (avg_y_stride << 1);
+          running_avg_y += (avg_y_stride << shift);
         }
         sum_diff = sum_diff_16x1(acc_diff);
         if (abs(sum_diff) > sum_diff_thresh) {
           return COPY_BLOCK;
         }
       } else {
-          return COPY_BLOCK;
+        return COPY_BLOCK;
       }
     }
   }
   return FILTER_BLOCK;
 }
 
-static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
-                                        const uint8_t *mc_running_avg_y,
-                                        int mc_avg_y_stride,
-                                        uint8_t *running_avg_y,
-                                        int avg_y_stride,
-                                        int increase_denoising, BLOCK_SIZE bs,
-                                        int motion_magnitude) {
-  int sum_diff_thresh;
-  int r, c;
-  int shift_inc  = (increase_denoising &&
-                   motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
+// Denoiser for 16xM, 32xM and 64xM blocks.
+static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
+                                     const uint8_t *mc_running_avg_y,
+                                     int mc_avg_y_stride,
+                                     uint8_t *running_avg_y,
+                                     int avg_y_stride,
+                                     int increase_denoising, BLOCK_SIZE bs,
+                                     int motion_magnitude) {
+  int sum_diff_thresh, r, c, sum_diff = 0;
+  const int shift_inc  = (increase_denoising &&
+                          motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
+                         1 : 0;
   __m128i acc_diff[4][4];
   const __m128i k_0 = _mm_setzero_si128();
   const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
@@ -349,13 +256,11 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
   const __m128i k_16 = _mm_set1_epi8(16);
   // Modify each level's adjustment according to motion_magnitude.
   const __m128i l3 = _mm_set1_epi8(
-                     (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
-                     7 + shift_inc : 6);
+      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
   // Difference between level 3 and level 2 is 2.
   const __m128i l32 = _mm_set1_epi8(2);
   // Difference between level 2 and level 1 is 1.
   const __m128i l21 = _mm_set1_epi8(1);
-  int sum_diff = 0;
 
   for (c = 0; c < 4; ++c) {
     for (r = 0; r < 4; ++r) {
@@ -363,13 +268,11 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
     }
   }
 
-  for (r = 0; r < (4 << b_height_log2_lookup[bs]); r++) {
+  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
     for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
       acc_diff[c>>4][r>>4] = vp9_denoiser_16x1_sse2(
-                               sig, mc_running_avg_y,
-                               running_avg_y,
-                               &k_0, &k_4, &k_8, &k_16,
-                               &l3, &l32, &l21, acc_diff[c>>4][r>>4]);
+          sig, mc_running_avg_y, running_avg_y, &k_0, &k_4,
+          &k_8, &k_16, &l3, &l32, &l21, acc_diff[c>>4][r>>4]);
       // Update pointers for next iteration.
       sig += 16;
       mc_running_avg_y += 16;
@@ -385,8 +288,8 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
     // Update pointers for next iteration.
     sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
     mc_running_avg_y = mc_running_avg_y -
-                      16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                      mc_avg_y_stride;
+                       16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
+                       mc_avg_y_stride;
     running_avg_y = running_avg_y -
                     16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
                     avg_y_stride;
@@ -395,8 +298,8 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
   {
     sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
     if (abs(sum_diff) > sum_diff_thresh) {
-      int delta = ((abs(sum_diff) - sum_diff_thresh)
-                  >> num_pels_log2_lookup[bs]) + 1;
+      const int delta = ((abs(sum_diff) - sum_diff_thresh) >>
+                         num_pels_log2_lookup[bs]) + 1;
 
       // Only apply the adjustment for max delta up to 3.
       if (delta < 4) {
@@ -408,9 +311,8 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
         for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
           for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
             acc_diff[c>>4][r>>4] = vp9_denoiser_adj_16x1_sse2(
-                                             sig, mc_running_avg_y,
-                                             running_avg_y, k_0,
-                                             k_delta, acc_diff[c>>4][r>>4]);
+                sig, mc_running_avg_y, running_avg_y, k_0,
+                k_delta, acc_diff[c>>4][r>>4]);
             // Update pointers for next iteration.
             sig += 16;
             mc_running_avg_y += 16;
@@ -449,25 +351,25 @@ int vp9_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
                              BLOCK_SIZE bs,
                              int motion_magnitude) {
   if (bs == BLOCK_4X4 || bs == BLOCK_4X8) {
-    return vp9_denoiser_4xM_sse2(sig, sig_stride,
-                                 mc_avg, mc_avg_stride,
-                                 avg, avg_stride,
-                                 increase_denoising,
-                                 bs, motion_magnitude);
+    return vp9_denoiser_NxM_sse2_small(sig, sig_stride,
+                                       mc_avg, mc_avg_stride,
+                                       avg, avg_stride,
+                                       increase_denoising,
+                                       bs, motion_magnitude, 4);
   } else if (bs == BLOCK_8X4 || bs == BLOCK_8X8 || bs == BLOCK_8X16) {
-    return vp9_denoiser_8xM_sse2(sig, sig_stride,
-                                 mc_avg, mc_avg_stride,
-                                 avg, avg_stride,
-                                 increase_denoising,
-                                 bs, motion_magnitude);
+    return vp9_denoiser_NxM_sse2_small(sig, sig_stride,
+                                       mc_avg, mc_avg_stride,
+                                       avg, avg_stride,
+                                       increase_denoising,
+                                       bs, motion_magnitude, 8);
   } else if (bs == BLOCK_16X8 || bs == BLOCK_16X16 || bs == BLOCK_16X32 ||
              bs == BLOCK_32X16|| bs == BLOCK_32X32 || bs == BLOCK_32X64 ||
              bs == BLOCK_64X32 || bs == BLOCK_64X64) {
-    return vp9_denoiser_64_32_16xM_sse2(sig, sig_stride,
-                                        mc_avg, mc_avg_stride,
-                                        avg, avg_stride,
-                                        increase_denoising,
-                                        bs, motion_magnitude);
+    return vp9_denoiser_NxM_sse2_big(sig, sig_stride,
+                                     mc_avg, mc_avg_stride,
+                                     avg, avg_stride,
+                                     increase_denoising,
+                                     bs, motion_magnitude);
   } else {
     return COPY_BLOCK;
   }
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index adae18b48..d3c2a138c 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -33,6 +33,8 @@ struct vp9_extracfg {
   vp8e_tuning                 tuning;
   unsigned int                cq_level;  // constrained quality level
   unsigned int                rc_max_intra_bitrate_pct;
+  unsigned int                rc_max_inter_bitrate_pct;
+  unsigned int                gf_cbr_boost_pct;
   unsigned int                lossless;
   unsigned int                frame_parallel_decoding_mode;
   AQ_MODE                     aq_mode;
@@ -54,6 +56,8 @@ static struct vp9_extracfg default_extra_cfg = {
   VP8_TUNE_PSNR,              // tuning
   10,                         // cq_level
   0,                          // rc_max_intra_bitrate_pct
+  0,                          // rc_max_inter_bitrate_pct
+  0,                          // gf_cbr_boost_pct
   0,                          // lossless
   0,                          // frame_parallel_decoding_mode
   NO_AQ,                      // aq_mode
@@ -380,6 +384,8 @@ static vpx_codec_err_t set_encoder_config(
   // Convert target bandwidth from Kbit/s to Bit/s
   oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate;
   oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
+  oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct;
+  oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct;
 
   oxcf->best_allowed_q =
       extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_min_quantizer);
@@ -649,6 +655,22 @@ static vpx_codec_err_t ctrl_set_rc_max_intra_bitrate_pct(
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_rc_max_inter_bitrate_pct(
+    vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.rc_max_inter_bitrate_pct =
+      CAST(VP8E_SET_MAX_INTER_BITRATE_PCT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static vpx_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(
+    vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.gf_cbr_boost_pct =
+      CAST(VP8E_SET_GF_CBR_BOOST_PCT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static vpx_codec_err_t ctrl_set_lossless(vpx_codec_alg_priv_t *ctx,
                                          va_list args) {
   struct vp9_extracfg extra_cfg = ctx->extra_cfg;
@@ -1266,6 +1288,8 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   {VP8E_SET_TUNING,                   ctrl_set_tuning},
   {VP8E_SET_CQ_LEVEL,                 ctrl_set_cq_level},
   {VP8E_SET_MAX_INTRA_BITRATE_PCT,    ctrl_set_rc_max_intra_bitrate_pct},
+  {VP8E_SET_MAX_INTER_BITRATE_PCT,    ctrl_set_rc_max_inter_bitrate_pct},
+  {VP8E_SET_GF_CBR_BOOST_PCT,         ctrl_set_rc_gf_cbr_boost_pct},
   {VP9E_SET_LOSSLESS,                 ctrl_set_lossless},
   {VP9E_SET_FRAME_PARALLEL_DECODING,  ctrl_set_frame_parallel_decoding_mode},
   {VP9E_SET_AQ_MODE,                  ctrl_set_aq_mode},