12 files changed, 368 insertions, 96 deletions
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 8b3b9dbe0..5b8b2a9ec 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -87,13 +87,14 @@ void vp9_free_ref_frame_buffers(VP9_COMMON *cm) {
   int i;
 
   for (i = 0; i < FRAME_BUFFERS; ++i) {
-    vp9_free_frame_buffer(&cm->frame_bufs[i].buf);
-
     if (cm->frame_bufs[i].ref_count > 0 &&
         cm->frame_bufs[i].raw_frame_buffer.data != NULL) {
       cm->release_fb_cb(cm->cb_priv, &cm->frame_bufs[i].raw_frame_buffer);
       cm->frame_bufs[i].ref_count = 0;
     }
+    vpx_free(cm->frame_bufs[i].mvs);
+    cm->frame_bufs[i].mvs = NULL;
+    vp9_free_frame_buffer(&cm->frame_bufs[i].buf);
   }
 
   vp9_free_frame_buffer(&cm->post_proc_buffer);
@@ -166,6 +167,16 @@ int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) {
 #endif
                                VP9_ENC_BORDER_IN_PIXELS) < 0)
       goto fail;
+    if (cm->frame_bufs[i].mvs == NULL) {
+      cm->frame_bufs[i].mvs =
+          (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+                               sizeof(*cm->frame_bufs[i].mvs));
+      if (cm->frame_bufs[i].mvs == NULL)
+        goto fail;
+
+      cm->frame_bufs[i].mi_rows = cm->mi_rows;
+      cm->frame_bufs[i].mi_cols = cm->mi_cols;
+    }
   }
 
   init_frame_bufs(cm);
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index b310eb44d..561201ffe 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -20,13 +20,11 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                              int block, int mi_row, int mi_col) {
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
-  const MODE_INFO *prev_mi = !cm->error_resilient_mode && cm->prev_mi
-        ? cm->prev_mi[mi_row * xd->mi_stride + mi_col].src_mi
-        : NULL;
-  const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->src_mi->mbmi : NULL;
   const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
   int different_ref_found = 0;
   int context_counter = 0;
+  const MV_REF *const  prev_frame_mvs = cm->use_prev_frame_mvs ?
+      cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL;
 
   // Blank the reference vector list
   vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
@@ -71,11 +69,12 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
   }
 
   // Check the last frame's mode and mv info.
-  if (prev_mbmi) {
-    if (prev_mbmi->ref_frame[0] == ref_frame)
-      ADD_MV_REF_LIST(prev_mbmi->mv[0], refmv_count, mv_ref_list, Done);
-    else if (prev_mbmi->ref_frame[1] == ref_frame)
-      ADD_MV_REF_LIST(prev_mbmi->mv[1], refmv_count, mv_ref_list, Done);
+  if (cm->use_prev_frame_mvs) {
+    if (prev_frame_mvs->ref_frame[0] == ref_frame) {
+      ADD_MV_REF_LIST(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done);
+    } else if (prev_frame_mvs->ref_frame[1] == ref_frame) {
+      ADD_MV_REF_LIST(prev_frame_mvs->mv[1], refmv_count, mv_ref_list, Done);
+    }
   }
 
   // Since we couldn't find 2 mvs from the same reference frame
@@ -96,9 +95,30 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
   }
 
   // Since we still don't have a candidate we'll try the last frame.
-  if (prev_mbmi)
-    IF_DIFF_REF_FRAME_ADD_MV(prev_mbmi, ref_frame, ref_sign_bias, refmv_count,
-                             mv_ref_list, Done);
+  if (cm->use_prev_frame_mvs) {
+    if (prev_frame_mvs->ref_frame[0] != ref_frame &&
+        prev_frame_mvs->ref_frame[0] > INTRA_FRAME) {
+      int_mv mv = prev_frame_mvs->mv[0];
+      if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] !=
+          ref_sign_bias[ref_frame]) {
+        mv.as_mv.row *= -1;
+        mv.as_mv.col *= -1;
+      }
+      ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done);
+    }
+
+    if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME &&
+        prev_frame_mvs->ref_frame[1] != ref_frame &&
+        prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int) {
+      int_mv mv = prev_frame_mvs->mv[1];
+      if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] !=
+          ref_sign_bias[ref_frame]) {
+        mv.as_mv.row *= -1;
+        mv.as_mv.col *= -1;
+      }
+      ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done);
+    }
+  }
 
  Done:
 
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index b818ae818..b3a6590b2 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -56,9 +56,16 @@ typedef enum {
   REFERENCE_MODES       = 3,
 } REFERENCE_MODE;
 
+typedef struct {
+  int_mv mv[2];
+  MV_REFERENCE_FRAME ref_frame[2];
+} MV_REF;
 
 typedef struct {
   int ref_count;
+  MV_REF *mvs;
+  int mi_rows;
+  int mi_cols;
   vpx_codec_frame_buffer_t raw_frame_buffer;
   YV12_BUFFER_CONFIG buf;
 } RefCntBuffer;
@@ -91,6 +98,10 @@ typedef struct VP9Common {
   YV12_BUFFER_CONFIG *frame_to_show;
 
   RefCntBuffer frame_bufs[FRAME_BUFFERS];
+  RefCntBuffer *prev_frame;
+
+  // TODO(hkuang): Combine this with cur_buf in macroblockd.
+  RefCntBuffer *cur_frame;
 
   int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */
 
@@ -149,6 +160,10 @@ typedef struct VP9Common {
   MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
   MODE_INFO *prev_mi;  /* 'mi' from last frame (points into prev_mip) */
 
+
+  // Whether to use previous frame's motion vectors for prediction.
+  int use_prev_frame_mvs;
+
   // Persistent mb segment id map used in prediction.
   unsigned char *last_frame_seg_map;
 
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
index 439c028f2..0cb0912ad 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
@@ -9,6 +9,7 @@
  */
 
 #include <immintrin.h>  /* AVX2 */
+#include "vpx_ports/mem.h"
 
 static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
         const unsigned char *_blimit, const unsigned char *_limit,
@@ -392,6 +393,11 @@ static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
     }
 }
 
+DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
+  0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
+  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
+};
+
 static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
         const unsigned char *_blimit, const unsigned char *_limit,
         const unsigned char *_thresh) {
@@ -401,6 +407,9 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
     __m128i p7, p6, p5;
     __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
     __m128i q5, q6, q7;
+    __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
+            q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
+            p256_0, q256_0;
 
     const __m128i thresh = _mm_broadcastb_epi8(
             _mm_cvtsi32_si128((int) _thresh[0]));
@@ -409,16 +418,37 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
     const __m128i blimit = _mm_broadcastb_epi8(
             _mm_cvtsi32_si128((int) _blimit[0]));
 
-    p4 = _mm_loadu_si128((__m128i *) (s - 5 * p));
-    p3 = _mm_loadu_si128((__m128i *) (s - 4 * p));
-    p2 = _mm_loadu_si128((__m128i *) (s - 3 * p));
-    p1 = _mm_loadu_si128((__m128i *) (s - 2 * p));
-    p0 = _mm_loadu_si128((__m128i *) (s - 1 * p));
-    q0 = _mm_loadu_si128((__m128i *) (s - 0 * p));
-    q1 = _mm_loadu_si128((__m128i *) (s + 1 * p));
-    q2 = _mm_loadu_si128((__m128i *) (s + 2 * p));
-    q3 = _mm_loadu_si128((__m128i *) (s + 3 * p));
-    q4 = _mm_loadu_si128((__m128i *) (s + 4 * p));
+    p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 5 * p)));
+    p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 4 * p)));
+    p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 3 * p)));
+    p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 2 * p)));
+    p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 1 * p)));
+    q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 0 * p)));
+    q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 1 * p)));
+    q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 2 * p)));
+    q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 3 * p)));
+    q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 4 * p)));
+
+    p4 = _mm256_castsi256_si128(p256_4);
+    p3 = _mm256_castsi256_si128(p256_3);
+    p2 = _mm256_castsi256_si128(p256_2);
+    p1 = _mm256_castsi256_si128(p256_1);
+    p0 = _mm256_castsi256_si128(p256_0);
+    q0 = _mm256_castsi256_si128(q256_0);
+    q1 = _mm256_castsi256_si128(q256_1);
+    q2 = _mm256_castsi256_si128(q256_2);
+    q3 = _mm256_castsi256_si128(q256_3);
+    q4 = _mm256_castsi256_si128(q256_4);
 
     {
         const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
@@ -534,23 +564,35 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
             flat = _mm_cmpeq_epi8(flat, zero);
             flat = _mm_and_si128(flat, mask);
 
-            p5 = _mm_loadu_si128((__m128i *) (s - 6 * p));
-            q5 = _mm_loadu_si128((__m128i *) (s + 5 * p));
+            p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s - 6 * p)));
+            q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s + 5 * p)));
+            p5 = _mm256_castsi256_si128(p256_5);
+            q5 = _mm256_castsi256_si128(q256_5);
             flat2 = _mm_max_epu8(
                     _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
                     _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
 
             flat2 = _mm_max_epu8(work, flat2);
-            p6 = _mm_loadu_si128((__m128i *) (s - 7 * p));
-            q6 = _mm_loadu_si128((__m128i *) (s + 6 * p));
+            p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s - 7 * p)));
+            q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s + 6 * p)));
+            p6 = _mm256_castsi256_si128(p256_6);
+            q6 = _mm256_castsi256_si128(q256_6);
             work = _mm_max_epu8(
                     _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
                     _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
 
             flat2 = _mm_max_epu8(work, flat2);
 
-            p7 = _mm_loadu_si128((__m128i *) (s - 8 * p));
-            q7 = _mm_loadu_si128((__m128i *) (s + 7 * p));
+            p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s - 8 * p)));
+            q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s + 7 * p)));
+            p7 = _mm256_castsi256_si128(p256_7);
+            q7 = _mm256_castsi256_si128(q256_7);
             work = _mm_max_epu8(
                     _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
                     _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
@@ -566,29 +608,28 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
         {
             const __m256i eight = _mm256_set1_epi16(8);
             const __m256i four = _mm256_set1_epi16(4);
-            __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
-                    q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
-                    p256_0, q256_0;
             __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
                     pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
                     res_q;
 
-            p256_7 = _mm256_cvtepu8_epi16(p7);
-            p256_6 = _mm256_cvtepu8_epi16(p6);
-            p256_5 = _mm256_cvtepu8_epi16(p5);
-            p256_4 = _mm256_cvtepu8_epi16(p4);
-            p256_3 = _mm256_cvtepu8_epi16(p3);
-            p256_2 = _mm256_cvtepu8_epi16(p2);
-            p256_1 = _mm256_cvtepu8_epi16(p1);
-            p256_0 = _mm256_cvtepu8_epi16(p0);
-            q256_0 = _mm256_cvtepu8_epi16(q0);
-            q256_1 = _mm256_cvtepu8_epi16(q1);
-            q256_2 = _mm256_cvtepu8_epi16(q2);
-            q256_3 = _mm256_cvtepu8_epi16(q3);
-            q256_4 = _mm256_cvtepu8_epi16(q4);
-            q256_5 = _mm256_cvtepu8_epi16(q5);
-            q256_6 = _mm256_cvtepu8_epi16(q6);
-            q256_7 = _mm256_cvtepu8_epi16(q7);
+            const __m256i filter = _mm256_load_si256(
+                                  (__m256i const *)filt_loopfilter_avx2);
+            p256_7 = _mm256_shuffle_epi8(p256_7, filter);
+            p256_6 = _mm256_shuffle_epi8(p256_6, filter);
+            p256_5 = _mm256_shuffle_epi8(p256_5, filter);
+            p256_4 = _mm256_shuffle_epi8(p256_4, filter);
+            p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+            p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+            p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+            p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+            q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+            q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+            q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+            q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+            q256_4 = _mm256_shuffle_epi8(q256_4, filter);
+            q256_5 = _mm256_shuffle_epi8(q256_5, filter);
+            q256_6 = _mm256_shuffle_epi8(q256_6, filter);
+            q256_7 = _mm256_shuffle_epi8(q256_7, filter);
 
             pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
                     _mm256_add_epi16(p256_4, p256_3));
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 66da63ac6..a088325df 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -667,6 +667,14 @@ static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
     vp9_read_frame_size(rb, &cm->display_width, &cm->display_height);
 }
 
+static void resize_mv_buffer(VP9_COMMON *cm) {
+  vpx_free(cm->cur_frame->mvs);
+  cm->cur_frame->mi_rows = cm->mi_rows;
+  cm->cur_frame->mi_cols = cm->mi_cols;
+  cm->cur_frame->mvs = (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+                                            sizeof(*cm->cur_frame->mvs));
+}
+
 static void resize_context_buffers(VP9_COMMON *cm, int width, int height) {
 #if CONFIG_SIZE_LIMIT
   if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
@@ -692,6 +700,10 @@ static void resize_context_buffers(VP9_COMMON *cm, int width, int height) {
     cm->width = width;
     cm->height = height;
   }
+  if (cm->cur_frame->mvs == NULL || cm->mi_rows > cm->cur_frame->mi_rows ||
+      cm->mi_cols > cm->cur_frame->mi_cols) {
+    resize_mv_buffer(cm);
+  }
 }
 
 static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
@@ -1537,10 +1549,11 @@ void vp9_decode_frame(VP9Decoder *pbi,
 
   init_macroblockd(cm, &pbi->mb);
 
-  if (!cm->error_resilient_mode)
-    set_prev_mi(cm);
-  else
-    cm->prev_mi = NULL;
+  cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
+                           cm->width == cm->last_width &&
+                           cm->height == cm->last_height &&
+                           !cm->intra_only &&
+                           cm->last_show_frame;
 
   setup_plane_dequants(cm, xd, cm->base_qindex);
   vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index d0e0b76da..7ca812f40 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -425,7 +425,6 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
-
   int_mv nearestmv[2], nearmv[2];
   int inter_mode_ctx, ref, is_compound;
 
@@ -544,8 +543,27 @@ static void read_inter_frame_mode_info(VP9_COMMON *const cm,
 void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
                         const TileInfo *const tile,
                         int mi_row, int mi_col, vp9_reader *r) {
+  MODE_INFO *const mi = xd->mi[0].src_mi;
+  const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+  const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+  MV_REF* frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+  int w, h;
+
   if (frame_is_intra_only(cm))
     read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
   else
     read_inter_frame_mode_info(cm, xd, tile, mi_row, mi_col, r);
+
+  for (h = 0; h < y_mis; ++h) {
+    MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+    for (w = 0; w < x_mis; ++w) {
+      MV_REF *const mv = frame_mv + w;
+      mv->ref_frame[0] = mi->src_mi->mbmi.ref_frame[0];
+      mv->ref_frame[1] = mi->src_mi->mbmi.ref_frame[1];
+      mv->mv[0].as_int = mi->src_mi->mbmi.mv[0].as_int;
+      mv->mv[1].as_int = mi->src_mi->mbmi.mv[1].as_int;
+    }
+  }
 }
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index fa2f01041..196816531 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -252,6 +252,9 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
                       &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer);
   cm->new_fb_idx = get_free_fb(cm);
 
+  // Assign a MV array to the frame buffer.
+  cm->cur_frame = &cm->frame_bufs[cm->new_fb_idx];
+
   if (setjmp(cm->error.jmp)) {
     pbi->need_resync = 1;
     cm->error.setjmp = 0;
@@ -284,14 +287,13 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
   cm->last_width = cm->width;
   cm->last_height = cm->height;
 
-  if (!cm->show_existing_frame)
+  if (!cm->show_existing_frame) {
     cm->last_show_frame = cm->show_frame;
-  if (cm->show_frame) {
-    if (!cm->show_existing_frame)
-      vp9_swap_mi_and_prev_mi(cm);
+    cm->prev_frame = cm->cur_frame;
+  }
 
+  if (cm->show_frame)
     cm->current_video_frame++;
-  }
 
   cm->error.setjmp = 0;
   return retcode;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 917d91b26..b87a28332 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -611,6 +611,13 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
   MODE_INFO *mi_addr = &xd->mi[0];
   const struct segmentation *const seg = &cm->seg;
+  const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+  const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+  MV_REF *const frame_mvs =
+      cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+  int w, h;
 
   const int mis = cm->mi_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
@@ -728,6 +735,17 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
       rd_opt->filter_diff[i] += ctx->best_filter_diff[i];
   }
+
+  for (h = 0; h < y_mis; ++h) {
+    MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+    for (w = 0; w < x_mis; ++w) {
+      MV_REF *const mv = frame_mv + w;
+      mv->ref_frame[0] = mi->src_mi->mbmi.ref_frame[0];
+      mv->ref_frame[1] = mi->src_mi->mbmi.ref_frame[1];
+      mv->mv[0].as_int = mi->src_mi->mbmi.mv[0].as_int;
+      mv->mv[1].as_int = mi->src_mi->mbmi.mv[1].as_int;
+    }
+  }
 }
 
 void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
@@ -1293,8 +1311,16 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0].src_mi;
   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
   const struct segmentation *const seg = &cm->seg;
+  const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+  const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+  MV_REF *const frame_mvs =
+      cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+  int w, h;
 
   *(xd->mi[0].src_mi) = ctx->mic;
   xd->mi[0].src_mi = &xd->mi[0];
@@ -1323,6 +1349,17 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
     }
   }
 
+  for (h = 0; h < y_mis; ++h) {
+    MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+    for (w = 0; w < x_mis; ++w) {
+      MV_REF *const mv = frame_mv + w;
+      mv->ref_frame[0] = mi->src_mi->mbmi.ref_frame[0];
+      mv->ref_frame[1] = mi->src_mi->mbmi.ref_frame[1];
+      mv->mv[0].as_int = mi->src_mi->mbmi.mv[0].as_int;
+      mv->mv[1].as_int = mi->src_mi->mbmi.mv[1].as_int;
+    }
+  }
+
   x->skip = ctx->skip;
   x->skip_txfm[0] = mbmi->segment_id ? 0 : ctx->skip_txfm[0];
 }
@@ -3529,6 +3566,11 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   vp9_initialize_me_consts(cpi, cm->base_qindex);
   init_encode_frame_mb_context(cpi);
   set_prev_mi(cm);
+  cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
+                           cm->width == cm->last_width &&
+                           cm->height == cm->last_height &&
+                           !cm->intra_only &&
+                           cm->last_show_frame;
 
   x->quant_fp = cpi->sf.use_quant_fp;
   vp9_zero(x->skip_txfm);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index dfc636a41..1d9fe5e92 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -584,7 +584,7 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
   if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
       ((cpi->svc.number_temporal_layers > 1 ||
         cpi->svc.number_spatial_layers > 1) &&
-       cpi->oxcf.pass == 2)) {
+       cpi->oxcf.pass != 1)) {
     vp9_init_layer_context(cpi);
   }
 
@@ -1285,7 +1285,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
       cpi->oxcf.rc_mode == VPX_CBR) ||
       ((cpi->svc.number_temporal_layers > 1 ||
         cpi->svc.number_spatial_layers > 1) &&
-       cpi->oxcf.pass == 2)) {
+       cpi->oxcf.pass != 1)) {
     vp9_update_layer_context_change_config(cpi,
                                            (int)cpi->oxcf.target_bandwidth);
   }
@@ -2420,6 +2420,7 @@ void vp9_scale_references(VP9_COMP *cpi) {
 #if CONFIG_VP9_HIGHBITDEPTH
       if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
         const int new_fb = get_free_fb(cm);
+        cm->cur_frame = &cm->frame_bufs[new_fb];
         vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf,
                                  cm->width, cm->height,
                                  cm->subsampling_x, cm->subsampling_y,
@@ -2437,6 +2438,15 @@ void vp9_scale_references(VP9_COMP *cpi) {
         scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
         cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+        if (cm->frame_bufs[new_fb].mvs == NULL ||
+            cm->frame_bufs[new_fb].mi_rows < cm->mi_rows ||
+            cm->frame_bufs[new_fb].mi_cols < cm->mi_cols) {
+          cm->frame_bufs[new_fb].mvs =
+            (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+                                 sizeof(*cm->frame_bufs[new_fb].mvs));
+          cm->frame_bufs[new_fb].mi_rows = cm->mi_rows;
+          cm->frame_bufs[new_fb].mi_cols = cm->mi_cols;
+        }
       } else {
         cpi->scaled_ref_idx[ref_frame - 1] = idx;
         ++cm->frame_bufs[idx].ref_count;
@@ -2973,7 +2983,9 @@ static int get_ref_frame_flags(const VP9_COMP *cpi) {
   if (gold_is_last)
     flags &= ~VP9_GOLD_FLAG;
 
-  if (cpi->rc.frames_till_gf_update_due == INT_MAX && !is_two_pass_svc(cpi))
+  if (cpi->rc.frames_till_gf_update_due == INT_MAX &&
+      (cpi->svc.number_temporal_layers == 1 &&
+       cpi->svc.number_spatial_layers == 1))
     flags &= ~VP9_GOLD_FLAG;
 
   if (alt_is_last)
@@ -3279,13 +3291,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 
   if (cm->show_frame) {
     vp9_swap_mi_and_prev_mi(cm);
-
     // Don't increment frame counters if this was an altref buffer
     // update not a real frame
     ++cm->current_video_frame;
     if (cpi->use_svc)
       vp9_inc_frame_in_layer(cpi);
   }
+  cm->prev_frame = cm->cur_frame;
 
   if (is_two_pass_svc(cpi))
     cpi->svc.layer_context[cpi->svc.spatial_layer_id].last_frame_type =
@@ -3630,6 +3642,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   // held.
   cm->frame_bufs[cm->new_fb_idx].ref_count--;
   cm->new_fb_idx = get_free_fb(cm);
+  cm->cur_frame = &cm->frame_bufs[cm->new_fb_idx];
 
   if (!cpi->use_svc && cpi->multi_arf_allowed) {
     if (cm->frame_type == KEY_FRAME) {
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 0e112f2ff..f0c05430a 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -531,9 +531,8 @@ void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
 
 static INLINE int is_two_pass_svc(const struct VP9_COMP *const cpi) {
   return cpi->use_svc &&
-         (cpi->svc.number_temporal_layers > 1 ||
-          cpi->svc.number_spatial_layers > 1) &&
-         (cpi->oxcf.pass == 1 || cpi->oxcf.pass == 2);
+         ((cpi->svc.number_spatial_layers > 1) ||
+         (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.pass != 0));
 }
 
 static INLINE int is_altref_enabled(const VP9_COMP *const cpi) {
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 69b419384..28598f1aa 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -590,6 +590,13 @@ int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
   return besterr;
 }
 
+const MV search_step_table[12] = {
+    // left, right, up, down
+    {0, -4}, {0, 4}, {-4, 0}, {4, 0},
+    {0, -2}, {0, 2}, {-2, 0}, {2, 0},
+    {0, -1}, {0, 1}, {-1, 0}, {1, 0}
+};
+
 int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
                                  MV *bestmv, const MV *ref_mv,
                                  int allow_hp,
@@ -603,43 +610,134 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
                                  unsigned int *sse1,
                                  const uint8_t *second_pred,
                                  int w, int h) {
-  SETUP_SUBPEL_SEARCH;
-  SETUP_CENTER_ERROR;
-  (void) cost_list;  // to silence compiler warning
+  const uint8_t *const z = x->plane[0].src.buf;
+  const uint8_t *const src_address = z;
+  const int src_stride = x->plane[0].src.stride;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  unsigned int besterr = INT_MAX;
+  unsigned int sse;
+  unsigned int whichdir = 0;
+  int thismse;
+  const int y_stride = xd->plane[0].pre[0].stride;
+  const int offset = bestmv->row * y_stride + bestmv->col;
+  const uint8_t *const y = xd->plane[0].pre[0].buf;
+
+  int rr = ref_mv->row;
+  int rc = ref_mv->col;
+  int br = bestmv->row * 8;
+  int bc = bestmv->col * 8;
+  int hstep = 4;
+  int iter, round = 3 - forced_stop;
+  const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+  const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+  const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+  const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
+  int tr = br;
+  int tc = bc;
+  const MV *search_step = search_step_table;
+  int idx, best_idx = -1;
+  unsigned int cost_array[5];
+
+  if (!(allow_hp && vp9_use_mv_hp(ref_mv)))
+    if (round == 3)
+      round = 2;
+
+  bestmv->row *= 8;
+  bestmv->col *= 8;
 
-  // Each subsequent iteration checks at least one point in
-  // common with the last iteration could be 2 ( if diag selected)
-  // 1/2 pel
-  FIRST_LEVEL_CHECKS;
-  if (halfiters > 1) {
-    SECOND_LEVEL_CHECKS;
+  if (second_pred != NULL) {
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
+    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+    besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse1);
+  } else {
+    besterr = vfp->vf(y + offset, y_stride, src_address, src_stride, sse1);
   }
-  tr = br;
-  tc = bc;
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
 
-  // Each subsequent iteration checks at least one point in common with
-  // the last iteration could be 2 ( if diag selected) 1/4 pel
+  (void) cost_list;  // to silence compiler warning
 
-  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
-  if (forced_stop != 2) {
-    hstep >>= 1;
-    FIRST_LEVEL_CHECKS;
-    if (quarteriters > 1) {
-      SECOND_LEVEL_CHECKS;
+  for (iter = 0; iter < round; ++iter) {
+    // Check vertical and horizontal sub-pixel positions.
+    for (idx = 0; idx < 4; ++idx) {
+      tr = br + search_step[idx].row;
+      tc = bc + search_step[idx].col;
+      if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+        int row_offset = (tr & 0x07) << 1;
+        int col_offset = (tc & 0x07) << 1;
+        MV this_mv;
+        this_mv.row = tr;
+        this_mv.col = tc;
+        if (second_pred == NULL)
+          thismse = vfp->svf(pre_address, y_stride, col_offset, row_offset,
+                             src_address, src_stride, &sse);
+        else
+          thismse = vfp->svaf(pre_address, y_stride, col_offset, row_offset,
+                              src_address, src_stride, &sse, second_pred);
+        cost_array[idx] = thismse +
+            mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+        if (cost_array[idx] < besterr) {
+          best_idx = idx;
+          besterr = cost_array[idx];
+          *distortion = thismse;
+          *sse1 = sse;
+        }
+      } else {
+        cost_array[idx] = INT_MAX;
+      }
     }
-    tr = br;
-    tc = bc;
-  }
 
-  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
-    hstep >>= 1;
-    FIRST_LEVEL_CHECKS;
-    if (eighthiters > 1) {
-      SECOND_LEVEL_CHECKS;
+    // Check diagonal sub-pixel position
+    tc = bc + (cost_array[0] < cost_array[1] ? -hstep : hstep);
+    tr = br + (cost_array[2] < cost_array[3] ? -hstep : hstep);
+    if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+      const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+      int row_offset = (tr & 0x07) << 1;
+      int col_offset = (tc & 0x07) << 1;
+      MV this_mv = {tr, tc};
+      if (second_pred == NULL)
+        thismse = vfp->svf(pre_address, y_stride, col_offset, row_offset,
+                           src_address, src_stride, &sse);
+      else
+        thismse = vfp->svaf(pre_address, y_stride, col_offset, row_offset,
+                            src_address, src_stride, &sse, second_pred);
+      cost_array[4] = thismse +
+          mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+      if (cost_array[4] < besterr) {
+        best_idx = 4;
+        besterr = cost_array[4];
+        *distortion = thismse;
+        *sse1 = sse;
+      }
+    } else {
+      cost_array[idx] = INT_MAX;
     }
+
+    if (best_idx < 4 && best_idx >= 0) {
+      br += search_step[best_idx].row;
+      bc += search_step[best_idx].col;
+    } else if (best_idx == 4) {
+      br = tr;
+      bc = tc;
+    }
+
+    if (iters_per_step > 1)
+      SECOND_LEVEL_CHECKS;
+
     tr = br;
     tc = bc;
+
+    search_step += 4;
+    hstep >>= 1;
+    best_idx = -1;
   }
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration could be 2 ( if diag selected) 1/4 pel
+
   // These lines insure static analysis doesn't warn that
   // tr and tc aren't used after the above point.
   (void) tr;
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 3315aa6a1..7a1b0cc1f 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -295,16 +295,16 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
   }
 
   if (speed >= 7) {
+    sf->adaptive_rd_thresh = 3;
     sf->mv.search_method = FAST_DIAMOND;
     sf->mv.fullpel_search_step_param = 10;
     sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
     sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
         800 : 300;
-    sf->elevate_newmv_thresh = 2500;
   }
 
   if (speed >= 12) {
-    sf->elevate_newmv_thresh = 4000;
+    sf->adaptive_rd_thresh = 4;
     sf->mv.subpel_force_stop = 2;
   }
 
@@ -386,7 +386,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->search_type_check_frequency = 50;
   sf->encode_breakout_thresh = 0;
   sf->elevate_newmv_thresh = 0;
-  // Recode loop tolerence %.
+  // Recode loop tolerance %.
   sf->recode_tolerance = 25;
   sf->default_interp_filter = SWITCHABLE;
   sf->tx_size_search_breakout = 0;