16 files changed, 437 insertions, 137 deletions
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 8b3b9dbe0..5b8b2a9ec 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -87,13 +87,14 @@ void vp9_free_ref_frame_buffers(VP9_COMMON *cm) {
   int i;
 
   for (i = 0; i < FRAME_BUFFERS; ++i) {
-    vp9_free_frame_buffer(&cm->frame_bufs[i].buf);
-
     if (cm->frame_bufs[i].ref_count > 0 &&
         cm->frame_bufs[i].raw_frame_buffer.data != NULL) {
       cm->release_fb_cb(cm->cb_priv, &cm->frame_bufs[i].raw_frame_buffer);
       cm->frame_bufs[i].ref_count = 0;
     }
+    vpx_free(cm->frame_bufs[i].mvs);
+    cm->frame_bufs[i].mvs = NULL;
+    vp9_free_frame_buffer(&cm->frame_bufs[i].buf);
   }
 
   vp9_free_frame_buffer(&cm->post_proc_buffer);
@@ -166,6 +167,16 @@ int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) {
 #endif
                                VP9_ENC_BORDER_IN_PIXELS) < 0)
       goto fail;
+    if (cm->frame_bufs[i].mvs == NULL) {
+      cm->frame_bufs[i].mvs =
+          (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+                               sizeof(*cm->frame_bufs[i].mvs));
+      if (cm->frame_bufs[i].mvs == NULL)
+        goto fail;
+
+      cm->frame_bufs[i].mi_rows = cm->mi_rows;
+      cm->frame_bufs[i].mi_cols = cm->mi_cols;
+    }
   }
 
   init_frame_bufs(cm);
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index b310eb44d..561201ffe 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -20,13 +20,11 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                              int block, int mi_row, int mi_col) {
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
-  const MODE_INFO *prev_mi = !cm->error_resilient_mode && cm->prev_mi
-        ? cm->prev_mi[mi_row * xd->mi_stride + mi_col].src_mi
-        : NULL;
-  const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->src_mi->mbmi : NULL;
   const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
   int different_ref_found = 0;
   int context_counter = 0;
+  const MV_REF *const  prev_frame_mvs = cm->use_prev_frame_mvs ?
+      cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL;
 
   // Blank the reference vector list
   vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
@@ -71,11 +69,12 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
   }
 
   // Check the last frame's mode and mv info.
-  if (prev_mbmi) {
-    if (prev_mbmi->ref_frame[0] == ref_frame)
-      ADD_MV_REF_LIST(prev_mbmi->mv[0], refmv_count, mv_ref_list, Done);
-    else if (prev_mbmi->ref_frame[1] == ref_frame)
-      ADD_MV_REF_LIST(prev_mbmi->mv[1], refmv_count, mv_ref_list, Done);
+  if (cm->use_prev_frame_mvs) {
+    if (prev_frame_mvs->ref_frame[0] == ref_frame) {
+      ADD_MV_REF_LIST(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done);
+    } else if (prev_frame_mvs->ref_frame[1] == ref_frame) {
+      ADD_MV_REF_LIST(prev_frame_mvs->mv[1], refmv_count, mv_ref_list, Done);
+    }
   }
 
   // Since we couldn't find 2 mvs from the same reference frame
@@ -96,9 +95,30 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
   }
 
   // Since we still don't have a candidate we'll try the last frame.
-  if (prev_mbmi)
-    IF_DIFF_REF_FRAME_ADD_MV(prev_mbmi, ref_frame, ref_sign_bias, refmv_count,
-                             mv_ref_list, Done);
+  if (cm->use_prev_frame_mvs) {
+    if (prev_frame_mvs->ref_frame[0] != ref_frame &&
+        prev_frame_mvs->ref_frame[0] > INTRA_FRAME) {
+      int_mv mv = prev_frame_mvs->mv[0];
+      if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] !=
+          ref_sign_bias[ref_frame]) {
+        mv.as_mv.row *= -1;
+        mv.as_mv.col *= -1;
+      }
+      ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done);
+    }
+
+    if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME &&
+        prev_frame_mvs->ref_frame[1] != ref_frame &&
+        prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int) {
+      int_mv mv = prev_frame_mvs->mv[1];
+      if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] !=
+          ref_sign_bias[ref_frame]) {
+        mv.as_mv.row *= -1;
+        mv.as_mv.col *= -1;
+      }
+      ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done);
+    }
+  }
 
  Done:
 
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index b818ae818..b3a6590b2 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -56,9 +56,16 @@ typedef enum {
   REFERENCE_MODES       = 3,
 } REFERENCE_MODE;
 
+typedef struct {
+  int_mv mv[2];                     // Copy of a block's mbmi.mv[0..1].
+  MV_REFERENCE_FRAME ref_frame[2];  // Copy of mbmi.ref_frame[0..1].
+} MV_REF;
 
 typedef struct {
   int ref_count;
+  MV_REF *mvs;  // Motion-vector array, vpx_calloc'ed as mi_rows * mi_cols.
+  int mi_rows;  // cm->mi_rows recorded when mvs was allocated.
+  int mi_cols;  // cm->mi_cols recorded when mvs was allocated.
   vpx_codec_frame_buffer_t raw_frame_buffer;
   YV12_BUFFER_CONFIG buf;
 } RefCntBuffer;
@@ -91,6 +98,10 @@ typedef struct VP9Common {
   YV12_BUFFER_CONFIG *frame_to_show;
 
   RefCntBuffer frame_bufs[FRAME_BUFFERS];
+  RefCntBuffer *prev_frame;
+
+  // TODO(hkuang): Combine this with cur_buf in macroblockd.
+  RefCntBuffer *cur_frame;
 
   int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */
 
@@ -149,6 +160,10 @@ typedef struct VP9Common {
   MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
   MODE_INFO *prev_mi;  /* 'mi' from last frame (points into prev_mip) */
 
+
+  // Whether to use previous frame's motion vectors for prediction.
+  int use_prev_frame_mvs;
+
   // Persistent mb segment id map used in prediction.
   unsigned char *last_frame_seg_map;
 
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
index 439c028f2..0cb0912ad 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
@@ -9,6 +9,7 @@
  */
 
 #include <immintrin.h>  /* AVX2 */
+#include "vpx_ports/mem.h"
 
 static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
         const unsigned char *_blimit, const unsigned char *_limit,
@@ -392,6 +393,11 @@ static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
     }
 }
 
+DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
+  0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
+  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
+};
+
 static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
         const unsigned char *_blimit, const unsigned char *_limit,
         const unsigned char *_thresh) {
@@ -401,6 +407,9 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
     __m128i p7, p6, p5;
     __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
     __m128i q5, q6, q7;
+    __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
+            q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
+            p256_0, q256_0;
 
     const __m128i thresh = _mm_broadcastb_epi8(
             _mm_cvtsi32_si128((int) _thresh[0]));
@@ -409,16 +418,37 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
     const __m128i blimit = _mm_broadcastb_epi8(
             _mm_cvtsi32_si128((int) _blimit[0]));
 
-    p4 = _mm_loadu_si128((__m128i *) (s - 5 * p));
-    p3 = _mm_loadu_si128((__m128i *) (s - 4 * p));
-    p2 = _mm_loadu_si128((__m128i *) (s - 3 * p));
-    p1 = _mm_loadu_si128((__m128i *) (s - 2 * p));
-    p0 = _mm_loadu_si128((__m128i *) (s - 1 * p));
-    q0 = _mm_loadu_si128((__m128i *) (s - 0 * p));
-    q1 = _mm_loadu_si128((__m128i *) (s + 1 * p));
-    q2 = _mm_loadu_si128((__m128i *) (s + 2 * p));
-    q3 = _mm_loadu_si128((__m128i *) (s + 3 * p));
-    q4 = _mm_loadu_si128((__m128i *) (s + 4 * p));
+    p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 5 * p)));
+    p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 4 * p)));
+    p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 3 * p)));
+    p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 2 * p)));
+    p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 1 * p)));
+    q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 0 * p)));
+    q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 1 * p)));
+    q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 2 * p)));
+    q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 3 * p)));
+    q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 4 * p)));
+
+    p4 = _mm256_castsi256_si128(p256_4);
+    p3 = _mm256_castsi256_si128(p256_3);
+    p2 = _mm256_castsi256_si128(p256_2);
+    p1 = _mm256_castsi256_si128(p256_1);
+    p0 = _mm256_castsi256_si128(p256_0);
+    q0 = _mm256_castsi256_si128(q256_0);
+    q1 = _mm256_castsi256_si128(q256_1);
+    q2 = _mm256_castsi256_si128(q256_2);
+    q3 = _mm256_castsi256_si128(q256_3);
+    q4 = _mm256_castsi256_si128(q256_4);
 
     {
         const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
@@ -534,23 +564,35 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
             flat = _mm_cmpeq_epi8(flat, zero);
             flat = _mm_and_si128(flat, mask);
 
-            p5 = _mm_loadu_si128((__m128i *) (s - 6 * p));
-            q5 = _mm_loadu_si128((__m128i *) (s + 5 * p));
+            p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s - 6 * p)));
+            q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s + 5 * p)));
+            p5 = _mm256_castsi256_si128(p256_5);
+            q5 = _mm256_castsi256_si128(q256_5);
             flat2 = _mm_max_epu8(
                     _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
                     _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
 
             flat2 = _mm_max_epu8(work, flat2);
-            p6 = _mm_loadu_si128((__m128i *) (s - 7 * p));
-            q6 = _mm_loadu_si128((__m128i *) (s + 6 * p));
+            p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s - 7 * p)));
+            q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s + 6 * p)));
+            p6 = _mm256_castsi256_si128(p256_6);
+            q6 = _mm256_castsi256_si128(q256_6);
             work = _mm_max_epu8(
                     _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
                     _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
 
             flat2 = _mm_max_epu8(work, flat2);
 
-            p7 = _mm_loadu_si128((__m128i *) (s - 8 * p));
-            q7 = _mm_loadu_si128((__m128i *) (s + 7 * p));
+            p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s - 8 * p)));
+            q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s + 7 * p)));
+            p7 = _mm256_castsi256_si128(p256_7);
+            q7 = _mm256_castsi256_si128(q256_7);
             work = _mm_max_epu8(
                     _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
                     _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
@@ -566,29 +608,28 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
         {
             const __m256i eight = _mm256_set1_epi16(8);
             const __m256i four = _mm256_set1_epi16(4);
-            __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
-                    q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
-                    p256_0, q256_0;
             __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
                     pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
                     res_q;
 
-            p256_7 = _mm256_cvtepu8_epi16(p7);
-            p256_6 = _mm256_cvtepu8_epi16(p6);
-            p256_5 = _mm256_cvtepu8_epi16(p5);
-            p256_4 = _mm256_cvtepu8_epi16(p4);
-            p256_3 = _mm256_cvtepu8_epi16(p3);
-            p256_2 = _mm256_cvtepu8_epi16(p2);
-            p256_1 = _mm256_cvtepu8_epi16(p1);
-            p256_0 = _mm256_cvtepu8_epi16(p0);
-            q256_0 = _mm256_cvtepu8_epi16(q0);
-            q256_1 = _mm256_cvtepu8_epi16(q1);
-            q256_2 = _mm256_cvtepu8_epi16(q2);
-            q256_3 = _mm256_cvtepu8_epi16(q3);
-            q256_4 = _mm256_cvtepu8_epi16(q4);
-            q256_5 = _mm256_cvtepu8_epi16(q5);
-            q256_6 = _mm256_cvtepu8_epi16(q6);
-            q256_7 = _mm256_cvtepu8_epi16(q7);
+            const __m256i filter = _mm256_load_si256(
+                                  (__m256i const *)filt_loopfilter_avx2);
+            p256_7 = _mm256_shuffle_epi8(p256_7, filter);
+            p256_6 = _mm256_shuffle_epi8(p256_6, filter);
+            p256_5 = _mm256_shuffle_epi8(p256_5, filter);
+            p256_4 = _mm256_shuffle_epi8(p256_4, filter);
+            p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+            p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+            p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+            p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+            q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+            q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+            q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+            q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+            q256_4 = _mm256_shuffle_epi8(q256_4, filter);
+            q256_5 = _mm256_shuffle_epi8(q256_5, filter);
+            q256_6 = _mm256_shuffle_epi8(q256_6, filter);
+            q256_7 = _mm256_shuffle_epi8(q256_7, filter);
 
             pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
                     _mm256_add_epi16(p256_4, p256_3));
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 66da63ac6..a088325df 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -667,6 +667,14 @@ static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
     vp9_read_frame_size(rb, &cm->display_width, &cm->display_height);
 }
 
+static void resize_mv_buffer(VP9_COMMON *cm) {
+  vpx_free(cm->cur_frame->mvs);  // Old MV data is dropped, not copied over.
+  cm->cur_frame->mi_rows = cm->mi_rows;  // Set even if the calloc below fails.
+  cm->cur_frame->mi_cols = cm->mi_cols;
+  cm->cur_frame->mvs = (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+                                            sizeof(*cm->cur_frame->mvs));
+}  // NOTE(review): vpx_calloc() failure is not checked; mvs may be NULL.
+
 static void resize_context_buffers(VP9_COMMON *cm, int width, int height) {
 #if CONFIG_SIZE_LIMIT
   if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
@@ -692,6 +700,10 @@ static void resize_context_buffers(VP9_COMMON *cm, int width, int height) {
     cm->width = width;
     cm->height = height;
   }
+  if (cm->cur_frame->mvs == NULL || cm->mi_rows > cm->cur_frame->mi_rows ||
+      cm->mi_cols > cm->cur_frame->mi_cols) {
+    resize_mv_buffer(cm);
+  }
 }
 
 static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
@@ -1537,10 +1549,11 @@ void vp9_decode_frame(VP9Decoder *pbi,
 
   init_macroblockd(cm, &pbi->mb);
 
-  if (!cm->error_resilient_mode)
-    set_prev_mi(cm);
-  else
-    cm->prev_mi = NULL;
+  cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
+                           cm->width == cm->last_width &&
+                           cm->height == cm->last_height &&
+                           !cm->intra_only &&
+                           cm->last_show_frame;
 
   setup_plane_dequants(cm, xd, cm->base_qindex);
   vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index d0e0b76da..7ca812f40 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -425,7 +425,6 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
-
   int_mv nearestmv[2], nearmv[2];
   int inter_mode_ctx, ref, is_compound;
 
@@ -544,8 +543,27 @@ static void read_inter_frame_mode_info(VP9_COMMON *const cm,
 void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
                         const TileInfo *const tile,
                         int mi_row, int mi_col, vp9_reader *r) {
+  MODE_INFO *const mi = xd->mi[0].src_mi;
+  const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+  const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+  const int x_mis = MIN(bw, cm->mi_cols - mi_col);  // Clip to cm->mi_cols.
+  const int y_mis = MIN(bh, cm->mi_rows - mi_row);  // Clip to cm->mi_rows.
+  MV_REF *const frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+  int w, h;
+
   if (frame_is_intra_only(cm))
     read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
   else
     read_inter_frame_mode_info(cm, xd, tile, mi_row, mi_col, r);
+  // Mirror the block's refs and MVs into each cur_frame->mvs cell it covers.
+  for (h = 0; h < y_mis; ++h) {
+    MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+    for (w = 0; w < x_mis; ++w) {
+      MV_REF *const mv = frame_mv + w;
+      mv->ref_frame[0] = mi->src_mi->mbmi.ref_frame[0];
+      mv->ref_frame[1] = mi->src_mi->mbmi.ref_frame[1];
+      mv->mv[0].as_int = mi->src_mi->mbmi.mv[0].as_int;
+      mv->mv[1].as_int = mi->src_mi->mbmi.mv[1].as_int;
+    }
+  }
 }
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index fa2f01041..196816531 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -252,6 +252,9 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
                       &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer);
   cm->new_fb_idx = get_free_fb(cm);
 
+  // cur_frame tracks the new buffer; decoded MVs are stored in its mvs array.
+  cm->cur_frame = &cm->frame_bufs[cm->new_fb_idx];
+
   if (setjmp(cm->error.jmp)) {
     pbi->need_resync = 1;
     cm->error.setjmp = 0;
@@ -284,14 +287,13 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
   cm->last_width = cm->width;
   cm->last_height = cm->height;
 
-  if (!cm->show_existing_frame)
+  if (!cm->show_existing_frame) {
     cm->last_show_frame = cm->show_frame;
-  if (cm->show_frame) {
-    if (!cm->show_existing_frame)
-      vp9_swap_mi_and_prev_mi(cm);
+    cm->prev_frame = cm->cur_frame;
+  }
 
+  if (cm->show_frame)
     cm->current_video_frame++;
-  }
 
   cm->error.setjmp = 0;
   return retcode;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index baa4908d4..b87a28332 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -611,6 +611,13 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
   MODE_INFO *mi_addr = &xd->mi[0];
   const struct segmentation *const seg = &cm->seg;
+  const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+  const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+  MV_REF *const frame_mvs =
+      cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+  int w, h;
 
   const int mis = cm->mi_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
@@ -728,6 +735,17 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
       rd_opt->filter_diff[i] += ctx->best_filter_diff[i];
   }
+
+  for (h = 0; h < y_mis; ++h) {
+    MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+    for (w = 0; w < x_mis; ++w) {
+      MV_REF *const mv = frame_mv + w;
+      mv->ref_frame[0] = mi->src_mi->mbmi.ref_frame[0];
+      mv->ref_frame[1] = mi->src_mi->mbmi.ref_frame[1];
+      mv->mv[0].as_int = mi->src_mi->mbmi.mv[0].as_int;
+      mv->mv[1].as_int = mi->src_mi->mbmi.mv[1].as_int;
+    }
+  }
 }
 
 void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
@@ -1293,8 +1311,16 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0].src_mi;
   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
   const struct segmentation *const seg = &cm->seg;
+  const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+  const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+  MV_REF *const frame_mvs =
+      cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+  int w, h;
 
   *(xd->mi[0].src_mi) = ctx->mic;
   xd->mi[0].src_mi = &xd->mi[0];
@@ -1323,6 +1349,17 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
     }
   }
 
+  for (h = 0; h < y_mis; ++h) {
+    MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+    for (w = 0; w < x_mis; ++w) {
+      MV_REF *const mv = frame_mv + w;
+      mv->ref_frame[0] = mi->src_mi->mbmi.ref_frame[0];
+      mv->ref_frame[1] = mi->src_mi->mbmi.ref_frame[1];
+      mv->mv[0].as_int = mi->src_mi->mbmi.mv[0].as_int;
+      mv->mv[1].as_int = mi->src_mi->mbmi.mv[1].as_int;
+    }
+  }
+
   x->skip = ctx->skip;
   x->skip_txfm[0] = mbmi->segment_id ? 0 : ctx->skip_txfm[0];
 }
@@ -2673,6 +2710,22 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x,
   }
 }
 
+// Recursively clears pred_pixel_ready on all contexts in the pc_tree subtree.
+static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
+  pc_tree->none.pred_pixel_ready = 0;
+  pc_tree->horizontal[0].pred_pixel_ready = 0;
+  pc_tree->horizontal[1].pred_pixel_ready = 0;
+  pc_tree->vertical[0].pred_pixel_ready = 0;
+  pc_tree->vertical[1].pred_pixel_ready = 0;
+  // Blocks larger than 8x8 recurse into the four split children.
+  if (bsize > BLOCK_8X8) {
+    BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+    int i;
+    for (i = 0; i < 4; ++i)
+      pred_pixel_ready_reset(pc_tree->split[i], subsize);
+  }
+}
+
 static void nonrd_pick_partition(VP9_COMP *cpi,
                                  TileDataEnc *tile_data,
                                  TOKENEXTRA **tp, int mi_row,
@@ -2731,6 +2784,10 @@ static void nonrd_pick_partition(VP9_COMP *cpi,
     partition_vert_allowed &= force_vert_split;
   }
 
+  ctx->pred_pixel_ready = !(partition_vert_allowed ||
+                            partition_horz_allowed ||
+                            do_split);
+
   // PARTITION_NONE
   if (partition_none_allowed) {
     nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col,
@@ -2738,7 +2795,6 @@ static void nonrd_pick_partition(VP9_COMP *cpi,
     ctx->mic.mbmi = xd->mi[0].src_mi->mbmi;
     ctx->skip_txfm[0] = x->skip_txfm[0];
     ctx->skip = x->skip;
-    ctx->pred_pixel_ready = 0;
 
     if (this_rdc.rate != INT_MAX) {
       int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
@@ -2814,17 +2870,17 @@ static void nonrd_pick_partition(VP9_COMP *cpi,
     subsize = get_subsize(bsize, PARTITION_HORZ);
     if (sf->adaptive_motion_search)
       load_pred_mv(x, ctx);
-
+    pc_tree->horizontal[0].pred_pixel_ready = 1;
     nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &sum_rdc, subsize,
                         &pc_tree->horizontal[0]);
 
     pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
     pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
     pc_tree->horizontal[0].skip = x->skip;
-    pc_tree->horizontal[0].pred_pixel_ready = 0;
 
     if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + ms < cm->mi_rows) {
       load_pred_mv(x, ctx);
+      pc_tree->horizontal[1].pred_pixel_ready = 1;
       nonrd_pick_sb_modes(cpi, tile_data, mi_row + ms, mi_col,
                           &this_rdc, subsize,
                           &pc_tree->horizontal[1]);
@@ -2832,7 +2888,6 @@ static void nonrd_pick_partition(VP9_COMP *cpi,
       pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
       pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->horizontal[1].skip = x->skip;
-      pc_tree->horizontal[1].pred_pixel_ready = 0;
 
       if (this_rdc.rate == INT_MAX) {
         vp9_rd_cost_reset(&sum_rdc);
@@ -2849,32 +2904,32 @@ static void nonrd_pick_partition(VP9_COMP *cpi,
     if (sum_rdc.rdcost < best_rdc.rdcost) {
       best_rdc = sum_rdc;
       pc_tree->partitioning = PARTITION_HORZ;
+    } else {
+      pred_pixel_ready_reset(pc_tree, bsize);
     }
   }
 
   // PARTITION_VERT
   if (partition_vert_allowed && do_rect) {
     subsize = get_subsize(bsize, PARTITION_VERT);
-
     if (sf->adaptive_motion_search)
       load_pred_mv(x, ctx);
-
+    pc_tree->vertical[0].pred_pixel_ready = 1;
     nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &sum_rdc, subsize,
                         &pc_tree->vertical[0]);
     pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
     pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
     pc_tree->vertical[0].skip = x->skip;
-    pc_tree->vertical[0].pred_pixel_ready = 0;
 
     if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + ms < cm->mi_cols) {
       load_pred_mv(x, ctx);
+      pc_tree->vertical[1].pred_pixel_ready = 1;
       nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col + ms,
                           &this_rdc, subsize,
                           &pc_tree->vertical[1]);
       pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
       pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->vertical[1].skip = x->skip;
-      pc_tree->vertical[1].pred_pixel_ready = 0;
 
       if (this_rdc.rate == INT_MAX) {
         vp9_rd_cost_reset(&sum_rdc);
@@ -2891,6 +2946,8 @@ static void nonrd_pick_partition(VP9_COMP *cpi,
     if (sum_rdc.rdcost < best_rdc.rdcost) {
       best_rdc = sum_rdc;
       pc_tree->partitioning = PARTITION_VERT;
+    } else {
+      pred_pixel_ready_reset(pc_tree, bsize);
     }
   }
 
@@ -2972,27 +3029,27 @@ static void nonrd_select_partition(VP9_COMP *cpi,
   } else {
     switch (partition) {
       case PARTITION_NONE:
+        pc_tree->none.pred_pixel_ready = 1;
         nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost,
                             subsize, &pc_tree->none);
         pc_tree->none.mic.mbmi = xd->mi[0].src_mi->mbmi;
         pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
         pc_tree->none.skip = x->skip;
-        pc_tree->none.pred_pixel_ready = 1;
         break;
       case PARTITION_VERT:
+        pc_tree->vertical[0].pred_pixel_ready = 1;
         nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost,
                             subsize, &pc_tree->vertical[0]);
         pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
         pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
         pc_tree->vertical[0].skip = x->skip;
-        pc_tree->vertical[0].pred_pixel_ready = 1;
         if (mi_col + hbs < cm->mi_cols) {
+          pc_tree->vertical[1].pred_pixel_ready = 1;
           nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col + hbs,
                               &this_rdc, subsize, &pc_tree->vertical[1]);
           pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
           pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
           pc_tree->vertical[1].skip = x->skip;
-          pc_tree->vertical[1].pred_pixel_ready = 1;
           if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
               rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
             rd_cost->rate += this_rdc.rate;
@@ -3001,19 +3058,19 @@ static void nonrd_select_partition(VP9_COMP *cpi,
         }
         break;
       case PARTITION_HORZ:
+        pc_tree->horizontal[0].pred_pixel_ready = 1;
         nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost,
                             subsize, &pc_tree->horizontal[0]);
         pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
         pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
         pc_tree->horizontal[0].skip = x->skip;
-        pc_tree->horizontal[0].pred_pixel_ready = 1;
         if (mi_row + hbs < cm->mi_rows) {
+          pc_tree->horizontal[1].pred_pixel_ready = 1;
           nonrd_pick_sb_modes(cpi, tile_data, mi_row + hbs, mi_col,
                               &this_rdc, subsize, &pc_tree->horizontal[0]);
           pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
           pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
           pc_tree->horizontal[1].skip = x->skip;
-          pc_tree->horizontal[1].pred_pixel_ready = 1;
           if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
               rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
             rd_cost->rate += this_rdc.rate;
@@ -3091,6 +3148,7 @@ static void nonrd_use_partition(VP9_COMP *cpi,
 
   switch (partition) {
     case PARTITION_NONE:
+      pc_tree->none.pred_pixel_ready = 1;
       nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost,
                           subsize, &pc_tree->none);
       pc_tree->none.mic.mbmi = xd->mi[0].src_mi->mbmi;
@@ -3098,12 +3156,14 @@ static void nonrd_use_partition(VP9_COMP *cpi,
       pc_tree->none.skip = x->skip;
       break;
     case PARTITION_VERT:
+      pc_tree->vertical[0].pred_pixel_ready = 1;
       nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost,
                           subsize, &pc_tree->vertical[0]);
       pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
       pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->vertical[0].skip = x->skip;
       if (mi_col + hbs < cm->mi_cols) {
+        pc_tree->vertical[1].pred_pixel_ready = 1;
         nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col + hbs,
                             &this_rdc, subsize, &pc_tree->vertical[1]);
         pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
@@ -3117,12 +3177,14 @@ static void nonrd_use_partition(VP9_COMP *cpi,
       }
       break;
     case PARTITION_HORZ:
+      pc_tree->horizontal[0].pred_pixel_ready = 1;
       nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost,
                           subsize, &pc_tree->horizontal[0]);
       pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
       pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->horizontal[0].skip = x->skip;
       if (mi_row + hbs < cm->mi_rows) {
+        pc_tree->horizontal[1].pred_pixel_ready = 1;
         nonrd_pick_sb_modes(cpi, tile_data, mi_row + hbs, mi_col,
                             &this_rdc, subsize, &pc_tree->horizontal[0]);
         pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
@@ -3504,6 +3566,11 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   vp9_initialize_me_consts(cpi, cm->base_qindex);
   init_encode_frame_mb_context(cpi);
   set_prev_mi(cm);
+  cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
+                           cm->width == cm->last_width &&
+                           cm->height == cm->last_height &&
+                           !cm->intra_only &&
+                           cm->last_show_frame;
 
   x->quant_fp = cpi->sf.use_quant_fp;
   vp9_zero(x->skip_txfm);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index dfc636a41..1d9fe5e92 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -584,7 +584,7 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
   if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
       ((cpi->svc.number_temporal_layers > 1 ||
         cpi->svc.number_spatial_layers > 1) &&
-       cpi->oxcf.pass == 2)) {
+       cpi->oxcf.pass != 1)) {
     vp9_init_layer_context(cpi);
   }
 
@@ -1285,7 +1285,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
       cpi->oxcf.rc_mode == VPX_CBR) ||
       ((cpi->svc.number_temporal_layers > 1 ||
         cpi->svc.number_spatial_layers > 1) &&
-       cpi->oxcf.pass == 2)) {
+       cpi->oxcf.pass != 1)) {
     vp9_update_layer_context_change_config(cpi,
                                            (int)cpi->oxcf.target_bandwidth);
   }
@@ -2420,6 +2420,7 @@ void vp9_scale_references(VP9_COMP *cpi) {
 #if CONFIG_VP9_HIGHBITDEPTH
       if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
         const int new_fb = get_free_fb(cm);
+        cm->cur_frame = &cm->frame_bufs[new_fb];
         vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf,
                                  cm->width, cm->height,
                                  cm->subsampling_x, cm->subsampling_y,
@@ -2437,6 +2438,15 @@ void vp9_scale_references(VP9_COMP *cpi) {
         scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
         cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+        if (cm->frame_bufs[new_fb].mvs == NULL ||
+            cm->frame_bufs[new_fb].mi_rows < cm->mi_rows ||
+            cm->frame_bufs[new_fb].mi_cols < cm->mi_cols) {
+          vpx_free(cm->frame_bufs[new_fb].mvs);  // drop undersized buffer
+          cm->frame_bufs[new_fb].mvs = (MV_REF *)vpx_calloc(
+              cm->mi_rows * cm->mi_cols, sizeof(*cm->frame_bufs[new_fb].mvs));
+          cm->frame_bufs[new_fb].mi_rows = cm->mi_rows;
+          cm->frame_bufs[new_fb].mi_cols = cm->mi_cols;
+        }
       } else {
         cpi->scaled_ref_idx[ref_frame - 1] = idx;
         ++cm->frame_bufs[idx].ref_count;
@@ -2973,7 +2983,9 @@ static int get_ref_frame_flags(const VP9_COMP *cpi) {
   if (gold_is_last)
     flags &= ~VP9_GOLD_FLAG;
 
-  if (cpi->rc.frames_till_gf_update_due == INT_MAX && !is_two_pass_svc(cpi))
+  if (cpi->rc.frames_till_gf_update_due == INT_MAX &&
+      (cpi->svc.number_temporal_layers == 1 &&
+       cpi->svc.number_spatial_layers == 1))
     flags &= ~VP9_GOLD_FLAG;
 
   if (alt_is_last)
@@ -3279,13 +3291,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 
   if (cm->show_frame) {
     vp9_swap_mi_and_prev_mi(cm);
-
     // Don't increment frame counters if this was an altref buffer
     // update not a real frame
     ++cm->current_video_frame;
     if (cpi->use_svc)
       vp9_inc_frame_in_layer(cpi);
   }
+  cm->prev_frame = cm->cur_frame;
 
   if (is_two_pass_svc(cpi))
     cpi->svc.layer_context[cpi->svc.spatial_layer_id].last_frame_type =
@@ -3630,6 +3642,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   // held.
   cm->frame_bufs[cm->new_fb_idx].ref_count--;
   cm->new_fb_idx = get_free_fb(cm);
+  cm->cur_frame = &cm->frame_bufs[cm->new_fb_idx];
 
   if (!cpi->use_svc && cpi->multi_arf_allowed) {
     if (cm->frame_type == KEY_FRAME) {
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 0e112f2ff..f0c05430a 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -531,9 +531,8 @@ void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
 
 static INLINE int is_two_pass_svc(const struct VP9_COMP *const cpi) {
   return cpi->use_svc &&
-         (cpi->svc.number_temporal_layers > 1 ||
-          cpi->svc.number_spatial_layers > 1) &&
-         (cpi->oxcf.pass == 1 || cpi->oxcf.pass == 2);
+         ((cpi->svc.number_spatial_layers > 1) ||
+         (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.pass != 0));
 }
 
 static INLINE int is_altref_enabled(const VP9_COMP *const cpi) {
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 69b419384..28598f1aa 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -590,6 +590,13 @@ int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
   return besterr;
 }
 
+const MV search_step_table[12] = {
+    // {row, col} offsets in 1/8-pel units; each row: left, right, up, down
+    {0, -4}, {0, 4}, {-4, 0}, {4, 0},
+    {0, -2}, {0, 2}, {-2, 0}, {2, 0},
+    {0, -1}, {0, 1}, {-1, 0}, {1, 0}
+};
+
 int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
                                  MV *bestmv, const MV *ref_mv,
                                  int allow_hp,
@@ -603,43 +610,134 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
                                  unsigned int *sse1,
                                  const uint8_t *second_pred,
                                  int w, int h) {
-  SETUP_SUBPEL_SEARCH;
-  SETUP_CENTER_ERROR;
-  (void) cost_list;  // to silence compiler warning
+  const uint8_t *const z = x->plane[0].src.buf;
+  const uint8_t *const src_address = z;
+  const int src_stride = x->plane[0].src.stride;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  unsigned int besterr = INT_MAX;
+  unsigned int sse;
+  unsigned int whichdir = 0;
+  int thismse;
+  const int y_stride = xd->plane[0].pre[0].stride;
+  const int offset = bestmv->row * y_stride + bestmv->col;
+  const uint8_t *const y = xd->plane[0].pre[0].buf;
+
+  int rr = ref_mv->row;
+  int rc = ref_mv->col;
+  int br = bestmv->row * 8;
+  int bc = bestmv->col * 8;
+  int hstep = 4;
+  int iter, round = 3 - forced_stop;
+  const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+  const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+  const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+  const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
+  int tr = br;
+  int tc = bc;
+  const MV *search_step = search_step_table;
+  int idx, best_idx = -1;
+  unsigned int cost_array[5];
+
+  if (!(allow_hp && vp9_use_mv_hp(ref_mv)))
+    if (round == 3)
+      round = 2;
+
+  bestmv->row *= 8;
+  bestmv->col *= 8;
 
-  // Each subsequent iteration checks at least one point in
-  // common with the last iteration could be 2 ( if diag selected)
-  // 1/2 pel
-  FIRST_LEVEL_CHECKS;
-  if (halfiters > 1) {
-    SECOND_LEVEL_CHECKS;
+  if (second_pred != NULL) {
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
+    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+    besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse1);
+  } else {
+    besterr = vfp->vf(y + offset, y_stride, src_address, src_stride, sse1);
   }
-  tr = br;
-  tc = bc;
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
 
-  // Each subsequent iteration checks at least one point in common with
-  // the last iteration could be 2 ( if diag selected) 1/4 pel
+  (void) cost_list;  // to silence compiler warning
 
-  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
-  if (forced_stop != 2) {
-    hstep >>= 1;
-    FIRST_LEVEL_CHECKS;
-    if (quarteriters > 1) {
-      SECOND_LEVEL_CHECKS;
+  for (iter = 0; iter < round; ++iter) {
+    // Check vertical and horizontal sub-pixel positions.
+    for (idx = 0; idx < 4; ++idx) {
+      tr = br + search_step[idx].row;
+      tc = bc + search_step[idx].col;
+      if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+        int row_offset = (tr & 0x07) << 1;
+        int col_offset = (tc & 0x07) << 1;
+        MV this_mv;
+        this_mv.row = tr;
+        this_mv.col = tc;
+        if (second_pred == NULL)
+          thismse = vfp->svf(pre_address, y_stride, col_offset, row_offset,
+                             src_address, src_stride, &sse);
+        else
+          thismse = vfp->svaf(pre_address, y_stride, col_offset, row_offset,
+                              src_address, src_stride, &sse, second_pred);
+        cost_array[idx] = thismse +
+            mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+        if (cost_array[idx] < besterr) {
+          best_idx = idx;
+          besterr = cost_array[idx];
+          *distortion = thismse;
+          *sse1 = sse;
+        }
+      } else {
+        cost_array[idx] = INT_MAX;
+      }
     }
-    tr = br;
-    tc = bc;
-  }
 
-  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
-    hstep >>= 1;
-    FIRST_LEVEL_CHECKS;
-    if (eighthiters > 1) {
-      SECOND_LEVEL_CHECKS;
+    // Check diagonal sub-pixel position
+    tc = bc + (cost_array[0] < cost_array[1] ? -hstep : hstep);
+    tr = br + (cost_array[2] < cost_array[3] ? -hstep : hstep);
+    if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+      const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+      int row_offset = (tr & 0x07) << 1;
+      int col_offset = (tc & 0x07) << 1;
+      MV this_mv = {tr, tc};
+      if (second_pred == NULL)
+        thismse = vfp->svf(pre_address, y_stride, col_offset, row_offset,
+                           src_address, src_stride, &sse);
+      else
+        thismse = vfp->svaf(pre_address, y_stride, col_offset, row_offset,
+                            src_address, src_stride, &sse, second_pred);
+      cost_array[4] = thismse +
+          mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+      if (cost_array[4] < besterr) {
+        best_idx = 4;
+        besterr = cost_array[4];
+        *distortion = thismse;
+        *sse1 = sse;
+      }
+    } else {
+      cost_array[4] = INT_MAX;  // diagonal candidate is out of range
     }
+
+    if (best_idx < 4 && best_idx >= 0) {
+      br += search_step[best_idx].row;
+      bc += search_step[best_idx].col;
+    } else if (best_idx == 4) {
+      br = tr;
+      bc = tc;
+    }
+
+    if (iters_per_step > 1)
+      SECOND_LEVEL_CHECKS;
+
     tr = br;
     tc = bc;
+
+    search_step += 4;
+    hstep >>= 1;
+    best_idx = -1;
   }
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration could be 2 ( if diag selected) 1/4 pel
+
   // These lines insure static analysis doesn't warn that
   // tr and tc aren't used after the above point.
   (void) tr;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 28f12916e..1943fdb28 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -453,7 +453,7 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
 }
 
 static const THR_MODES mode_idx[MAX_REF_FRAMES][4] = {
-  {THR_DC, THR_H_PRED, THR_V_PRED},
+  {THR_DC, THR_H_PRED, THR_V_PRED, THR_TM},
   {THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV},
   {THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG},
   {THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA},
@@ -517,8 +517,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   PRED_BUFFER *best_pred = NULL;
   PRED_BUFFER *this_mode_pred = NULL;
   const int pixels_in_block = bh * bw;
+  int reuse_inter_pred = cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready;
 
-  if (cpi->sf.reuse_inter_pred_sby) {
+  if (reuse_inter_pred) {
     int i;
     for (i = 0; i < 3; i++) {
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -605,6 +606,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
       int rate_mv = 0;
       int mode_rd_thresh;
+      int mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
 
       if (const_motion[ref_frame] && this_mode == NEARMV)
         continue;
@@ -612,10 +614,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
         continue;
 
-      mode_rd_thresh =
-          rd_threshes[mode_idx[ref_frame][INTER_OFFSET(this_mode)]];
+      mode_rd_thresh = rd_threshes[mode_index];
       if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
-                              rd_thresh_freq_fact[this_mode]))
+                              rd_thresh_freq_fact[mode_index]))
         continue;
 
       if (this_mode == NEWMV) {
@@ -641,7 +642,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       // Search for the best prediction filter type, when the resulting
       // motion vector is at sub-pixel accuracy level for luma component, i.e.,
       // the last three bits are all zeros.
-      if (cpi->sf.reuse_inter_pred_sby) {
+      if (reuse_inter_pred) {
         if (!this_mode_pred) {
           this_mode_pred = &tmp[3];
         } else {
@@ -679,7 +680,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
             best_cost = cost;
             skip_txfm = x->skip_txfm[0];
 
-            if (cpi->sf.reuse_inter_pred_sby) {
+            if (reuse_inter_pred) {
               if (this_mode_pred != current_pred) {
                 free_pred_buffer(this_mode_pred);
                 this_mode_pred = current_pred;
@@ -694,7 +695,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           }
         }
 
-        if (cpi->sf.reuse_inter_pred_sby && this_mode_pred != current_pred)
+        if (reuse_inter_pred && this_mode_pred != current_pred)
           free_pred_buffer(current_pred);
 
         mbmi->interp_filter = best_filter;
@@ -746,13 +747,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         best_ref_frame = ref_frame;
         skip_txfm = x->skip_txfm[0];
 
-        if (cpi->sf.reuse_inter_pred_sby) {
+        if (reuse_inter_pred) {
           free_pred_buffer(best_pred);
-
           best_pred = this_mode_pred;
         }
       } else {
-        if (cpi->sf.reuse_inter_pred_sby)
+        if (reuse_inter_pred)
           free_pred_buffer(this_mode_pred);
       }
 
@@ -766,7 +766,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   // If best prediction is not in dst buf, then copy the prediction block from
   // temp buf to dst buf.
-  if (best_pred != NULL && cpi->sf.reuse_inter_pred_sby &&
+  if (best_pred != NULL && reuse_inter_pred &&
       best_pred->data != orig_dst.buf) {
     pd->dst = orig_dst;
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -801,7 +801,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         MIN(max_txsize_lookup[bsize],
             tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
 
-    if (cpi->sf.reuse_inter_pred_sby) {
+    if (reuse_inter_pred) {
       pd->dst.buf = tmp[0].data;
       pd->dst.stride = bw;
     }
@@ -833,16 +833,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         x->skip_txfm[0] = skip_txfm;
       }
     }
-    if (cpi->sf.reuse_inter_pred_sby)
+    if (reuse_inter_pred)
       pd->dst = orig_dst;
   }
 
   if (is_inter_block(mbmi))
-    vp9_update_rd_thresh_fact(cpi, tile_data, bsize,
-                              mode_idx[ref_frame][INTER_OFFSET(mbmi->mode)]);
+    vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+                            cpi->sf.adaptive_rd_thresh, bsize,
+                            mode_idx[best_ref_frame][INTER_OFFSET(mbmi->mode)]);
   else
-    vp9_update_rd_thresh_fact(cpi, tile_data, bsize,
-                              mode_idx[ref_frame][mbmi->mode]);
+    vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+                              cpi->sf.adaptive_rd_thresh, bsize,
+                              mode_idx[INTRA_FRAME][mbmi->mode]);
 
   *rd_cost = best_rdc;
 }
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 13e317d6d..2f19d2942 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -605,10 +605,9 @@ void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
   }
 }
 
-// TODO(jingning) Refactor this function. Use targeted smaller struct as inputs.
-void vp9_update_rd_thresh_fact(VP9_COMP *cpi, TileDataEnc *tile_data,
+void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
                                int bsize, int best_mode_index) {
-  if (cpi->sf.adaptive_rd_thresh > 0) {
+  if (rd_thresh > 0) {
     const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
     int mode;
     for (mode = 0; mode < top_mode; ++mode) {
@@ -616,12 +615,12 @@ void vp9_update_rd_thresh_fact(VP9_COMP *cpi, TileDataEnc *tile_data,
       const BLOCK_SIZE max_size = MIN(bsize + 2, BLOCK_64X64);
       BLOCK_SIZE bs;
       for (bs = min_size; bs <= max_size; ++bs) {
-        int *const fact = &tile_data->thresh_freq_fact[bs][mode];
+        int *const fact = &factor_buf[bs][mode];
         if (mode == best_mode_index) {
           *fact -= (*fact >> 4);
         } else {
           *fact = MIN(*fact + RD_THRESH_INC,
-                      cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+                      rd_thresh * RD_THRESH_MAX_FACT);
         }
       }
     }
diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h
index aecca0b43..ebbe821d5 100644
--- a/vp9/encoder/vp9_rd.h
+++ b/vp9/encoder/vp9_rd.h
@@ -162,8 +162,7 @@ void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi);
 
 void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi);
 
-void vp9_update_rd_thresh_fact(struct VP9_COMP *cpi,
-                               struct TileDataEnc *tile_data,
+void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh,
                                int bsize, int best_mode_index);
 
 static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index e80f345e8..882bac105 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -3398,7 +3398,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
          !is_inter_block(&best_mbmode));
 
   if (!cpi->rc.is_src_frame_alt_ref)
-    vp9_update_rd_thresh_fact(cpi, tile_data, bsize, best_mode_index);
+    vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+                              sf->adaptive_rd_thresh, bsize, best_mode_index);
 
   // macroblock modes
   *mbmi = best_mbmode;
@@ -3553,7 +3554,8 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi,
   assert((cm->interp_filter == SWITCHABLE) ||
          (cm->interp_filter == mbmi->interp_filter));
 
-  vp9_update_rd_thresh_fact(cpi, tile_data, bsize, THR_ZEROMV);
+  vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+                            cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
 
   vp9_zero(best_pred_diff);
   vp9_zero(best_filter_diff);
@@ -4128,7 +4130,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
          (cm->interp_filter == best_mbmode.interp_filter) ||
          !is_inter_block(&best_mbmode));
 
-  vp9_update_rd_thresh_fact(cpi, tile_data, bsize, best_ref_index);
+  vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+                            sf->adaptive_rd_thresh, bsize, best_ref_index);
 
   // macroblock modes
   *mbmi = best_mbmode;
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 3315aa6a1..7a1b0cc1f 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -295,16 +295,16 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
   }
 
   if (speed >= 7) {
+    sf->adaptive_rd_thresh = 3;
     sf->mv.search_method = FAST_DIAMOND;
     sf->mv.fullpel_search_step_param = 10;
     sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
     sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
         800 : 300;
-    sf->elevate_newmv_thresh = 2500;
   }
 
   if (speed >= 12) {
-    sf->elevate_newmv_thresh = 4000;
+    sf->adaptive_rd_thresh = 4;
     sf->mv.subpel_force_stop = 2;
   }
 
@@ -386,7 +386,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->search_type_check_frequency = 50;
   sf->encode_breakout_thresh = 0;
   sf->elevate_newmv_thresh = 0;
-  // Recode loop tolerence %.
+  // Recode loop tolerance %.
   sf->recode_tolerance = 25;
   sf->default_interp_filter = SWITCHABLE;
   sf->tx_size_search_breakout = 0;