18 files changed, 230 insertions, 208 deletions
diff --git a/examples.mk b/examples.mk
index 1187f147a..a1d4eb68c 100644
--- a/examples.mk
+++ b/examples.mk
@@ -404,3 +404,4 @@ CLEAN-OBJS += examples.doxy samples.dox $(ALL_EXAMPLES:.c=.dox)
 DOCS-yes += examples.doxy samples.dox
 examples.doxy: samples.dox $(ALL_EXAMPLES:.c=.dox)
 	@echo "INPUT += $^" > $@
+	@echo "ENABLED_SECTIONS += samples" >> $@
diff --git a/libs.mk b/libs.mk
index 5dfb78ff1..7ec8c8756 100644
--- a/libs.mk
+++ b/libs.mk
@@ -112,11 +112,6 @@ ifeq ($(CONFIG_DECODERS),yes)
   CODEC_DOC_SECTIONS += decoder
 endif
 
-# Suppress -Wextra warnings in first party code pending investigation.
-# https://bugs.chromium.org/p/webm/issues/detail?id=1246
-$(BUILD_PFX)vp8/encoder/onyx_if.c.o: CFLAGS += -Wno-unknown-warning-option -Wno-clobbered
-$(BUILD_PFX)vp8/decoder/onyxd_if.c.o: CFLAGS += -Wno-unknown-warning-option -Wno-clobbered
-
 ifeq ($(CONFIG_MSVS),yes)
 CODEC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxmt,vpxmd)
 GTEST_LIB=$(if $(CONFIG_STATIC_MSVCRT),gtestmt,gtestmd)
diff --git a/mainpage.dox b/mainpage.dox
index ec202fa4f..4b0dff087 100644
--- a/mainpage.dox
+++ b/mainpage.dox
@@ -25,8 +25,10 @@
     release.
   - The \ref readme contains instructions on recompiling the sample applications.
   - Read the \ref usage "usage" for a narrative on codec usage.
+  \if samples
   - Read the \ref samples "sample code" for examples of how to interact with the
     codec.
+  \endif
   - \ref codec reference
   \if encoder
   - \ref encoder reference
diff --git a/tools/tiny_ssim.c b/tools/tiny_ssim.c
index 1f73c73c1..67a6903b5 100644
--- a/tools/tiny_ssim.c
+++ b/tools/tiny_ssim.c
@@ -50,7 +50,8 @@ static uint64_t calc_plane_error16(uint16_t *orig, int orig_stride,
   }
   return total_sse;
 }
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon,
                                  int recon_stride, unsigned int cols,
                                  unsigned int rows) {
@@ -269,6 +270,7 @@ static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
   return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
 static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
                               int rp, uint32_t bd, uint32_t shift) {
   uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
@@ -277,6 +279,7 @@ static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
   return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
                     sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
 }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 // We are using a 8x8 moving window with starting location of each 8x8 window
 // on the 4x4 pixel grid. Such arrangement allows the windows to overlap
@@ -300,6 +303,7 @@ static double ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1,
   return ssim_total;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
 static double highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
                            int stride_img1, int stride_img2, int width,
                            int height, uint32_t bd, uint32_t shift) {
@@ -321,6 +325,7 @@ static double highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
   ssim_total /= samples;
   return ssim_total;
 }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 // traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
 //
@@ -565,35 +570,6 @@ double get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
   return inconsistency_total;
 }
 
-double highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                        const YV12_BUFFER_CONFIG *dest, double *weight,
-                        uint32_t bd, uint32_t in_bd) {
-  double a, b, c;
-  double ssimv;
-  uint32_t shift = 0;
-
-  assert(bd >= in_bd);
-  shift = bd - in_bd;
-
-  a = highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride,
-                   dest->y_stride, source->y_crop_width, source->y_crop_height,
-                   in_bd, shift);
-
-  b = highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride,
-                   dest->uv_stride, source->uv_crop_width,
-                   source->uv_crop_height, in_bd, shift);
-
-  c = highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride,
-                   dest->uv_stride, source->uv_crop_width,
-                   source->uv_crop_height, in_bd, shift);
-
-  ssimv = a * .8 + .1 * (b + c);
-
-  *weight = 1;
-
-  return ssimv;
-}
-
 int main(int argc, char *argv[]) {
   FILE *framestats = NULL;
   int bit_depth = 8;
@@ -711,7 +687,7 @@ int main(int argc, char *argv[]) {
 #define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \
   ssim = ssim2(buf0, buf1, w, w, w, h);             \
   psnr = calc_plane_error(buf0, w, buf1, w, w, h);
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
     if (n_frames == allocated_frames) {
       allocated_frames = allocated_frames == 0 ? 1024 : allocated_frames * 2;
diff --git a/usage_cx.dox b/usage_cx.dox
index 92b0d34ef..b2220cfdd 100644
--- a/usage_cx.dox
+++ b/usage_cx.dox
@@ -8,6 +8,8 @@
     \ref usage_deadline.
 
 
+    \if samples
     \ref samples
+    \endif
 
 */
diff --git a/usage_dx.dox b/usage_dx.dox
index 883ce2492..85063f705 100644
--- a/usage_dx.dox
+++ b/usage_dx.dox
@@ -11,7 +11,9 @@
     \ref usage_postproc based on the amount of free CPU time. For more
     information on the <code>deadline</code> parameter, see \ref usage_deadline.
 
+    \if samples
     \ref samples
+    \endif
 
 
     \section usage_cb Callback Based Decoding
diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c
index aad908572..1fe7363f1 100644
--- a/vp8/common/mfqe.c
+++ b/vp8/common/mfqe.c
@@ -235,7 +235,7 @@ void vp8_multiframe_quality_enhance(VP8_COMMON *cm) {
 
   FRAME_TYPE frame_type = cm->frame_type;
   /* Point at base of Mb MODE_INFO list has motion vectors etc */
-  const MODE_INFO *mode_info_context = cm->show_frame_mi;
+  const MODE_INFO *mode_info_context = cm->mi;
   int mb_row;
   int mb_col;
   int totmap, map[4];
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 8c292d616..1ff2e5cc5 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -325,7 +325,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest,
   vpx_clear_system_state();
 
   if ((flags & VP8D_MFQE) && oci->postproc_state.last_frame_valid &&
-      oci->current_video_frame >= 2 &&
+      oci->current_video_frame > 10 &&
       oci->postproc_state.last_base_qindex < 60 &&
       oci->base_qindex - oci->postproc_state.last_base_qindex >= 20) {
     vp8_multiframe_quality_enhance(oci);
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index bb34821f3..918c68626 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -321,22 +321,6 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
   pbi->dec_fb_ref[GOLDEN_FRAME] = &cm->yv12_fb[cm->gld_fb_idx];
   pbi->dec_fb_ref[ALTREF_FRAME] = &cm->yv12_fb[cm->alt_fb_idx];
 
-  if (setjmp(pbi->common.error.jmp)) {
-    /* We do not know if the missing frame(s) was supposed to update
-     * any of the reference buffers, but we act conservative and
-     * mark only the last buffer as corrupted.
-     */
-    cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
-
-    if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0) {
-      cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
-    }
-    pbi->common.error.setjmp = 0;
-    goto decode_exit;
-  }
-
-  pbi->common.error.setjmp = 1;
-
   retcode = vp8_decode_frame(pbi);
 
   if (retcode < 0) {
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 8de8ca18e..adc25024c 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -4813,14 +4813,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
 
   cm = &cpi->common;
 
-  if (setjmp(cpi->common.error.jmp)) {
-    cpi->common.error.setjmp = 0;
-    vpx_clear_system_state();
-    return VPX_CODEC_CORRUPT_FRAME;
-  }
-
-  cpi->common.error.setjmp = 1;
-
   vpx_usec_timer_start(&cmptimer);
 
   cpi->source = NULL;
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 71eadeb08..d01d2095f 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -16,6 +16,7 @@
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx_version.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/system_state.h"
 #include "vpx_ports/vpx_once.h"
 #include "vp8/encoder/onyx_int.h"
 #include "vpx/vp8cx.h"
@@ -796,9 +797,11 @@ static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx,
 static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
                                    const vpx_image_t *img, vpx_codec_pts_t pts,
                                    unsigned long duration,
-                                   vpx_enc_frame_flags_t flags,
+                                   vpx_enc_frame_flags_t enc_flags,
                                    unsigned long deadline) {
-  vpx_codec_err_t res = VPX_CODEC_OK;
+  volatile vpx_codec_err_t res = VPX_CODEC_OK;
+  // Make a copy as volatile to avoid -Wclobbered with longjmp.
+  volatile vpx_enc_frame_flags_t flags = enc_flags;
 
   if (!ctx->cfg.rc_target_bitrate) {
 #if CONFIG_MULTI_RES_ENCODING
@@ -840,6 +843,12 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
     }
   }
 
+  if (setjmp(ctx->cpi->common.error.jmp)) {
+    ctx->cpi->common.error.setjmp = 0;
+    vpx_clear_system_state();
+    return VPX_CODEC_CORRUPT_FRAME;
+  }
+
   /* Initialize the encoder instance on the first frame*/
   if (!res && ctx->cpi) {
     unsigned int lib_flags;
@@ -886,6 +895,8 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
     cx_data_end = ctx->cx_data + cx_data_sz;
     lib_flags = 0;
 
+    ctx->cpi->common.error.setjmp = 1;
+
     while (cx_data_sz >= ctx->cx_data_sz / 2) {
       comp_data_state = vp8_get_compressed_data(
           ctx->cpi, &lib_flags, &size, cx_data, cx_data_end, &dst_time_stamp,
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index a2008b903..6d1c5f595 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -335,8 +335,8 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
 
   if (!res) {
     VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
+    VP8_COMMON *const pc = &pbi->common;
     if (resolution_change) {
-      VP8_COMMON *const pc = &pbi->common;
       MACROBLOCKD *const xd = &pbi->mb;
 #if CONFIG_MULTITHREAD
       int i;
@@ -428,6 +428,23 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
       pbi->common.fb_idx_ref_cnt[0] = 0;
     }
 
+    if (setjmp(pbi->common.error.jmp)) {
+      /* We do not know if the missing frame(s) was supposed to update
+       * any of the reference buffers, but we act conservative and
+       * mark only the last buffer as corrupted.
+       */
+      pc->yv12_fb[pc->lst_fb_idx].corrupted = 1;
+
+      if (pc->fb_idx_ref_cnt[pc->new_fb_idx] > 0) {
+        pc->fb_idx_ref_cnt[pc->new_fb_idx]--;
+      }
+      pc->error.setjmp = 0;
+      res = update_error_state(ctx, &pbi->common.error);
+      return res;
+    }
+
+    pbi->common.error.setjmp = 1;
+
     /* update the pbi fragment data */
     pbi->fragments = ctx->fragments;
 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 98343f0d2..9c82f2ebe 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1382,7 +1382,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
       x->sb_mvrow_part = mi->mv[0].as_mv.row;
       if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
           cpi->svc.spatial_layer_id == 0 &&
-          cpi->rc.high_num_blocks_with_motion && !x->zero_temp_sad_source &&
+          cpi->svc.high_num_blocks_with_motion && !x->zero_temp_sad_source &&
           cm->width > 640 && cm->height > 480) {
         // Disable split below 16x16 block size when scroll motion is detected.
         // TODO(marpan/jianj): Improve this condition: issue is that search
@@ -5734,7 +5734,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
     int64_t mc_dep_cost_base = 0;
     int row, col;
 
-    for (row = 0; row < cm->mi_rows; ++row) {
+    for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) {
       for (col = 0; col < cm->mi_cols; ++col) {
         TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
         intra_cost_base += this_stats->intra_cost;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 0b1d74a59..b10b91c98 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2110,7 +2110,7 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
 
 VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
                                 BufferPool *const pool) {
-  unsigned int i, frame;
+  unsigned int i;
   VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(VP9_COMP));
   VP9_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
 
@@ -2361,51 +2361,10 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
   vp9_set_speed_features_framesize_independent(cpi);
   vp9_set_speed_features_framesize_dependent(cpi);
 
-  if (cpi->sf.enable_tpl_model) {
-    const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
-    const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows);
 #if CONFIG_NON_GREEDY_MV
-    CHECK_MEM_ERROR(
-        cm, cpi->feature_score_loc_arr,
-        vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->feature_score_loc_arr)));
-    CHECK_MEM_ERROR(
-        cm, cpi->feature_score_loc_sort,
-        vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->feature_score_loc_sort)));
-    CHECK_MEM_ERROR(
-        cm, cpi->feature_score_loc_heap,
-        vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->feature_score_loc_heap)));
-#endif
-    // TODO(jingning): Reduce the actual memory use for tpl model build up.
-    for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
-#if CONFIG_NON_GREEDY_MV
-      int sqr_bsize;
-      int rf_idx;
-      for (rf_idx = 0; rf_idx < 3; ++rf_idx) {
-        for (sqr_bsize = 0; sqr_bsize < SQUARE_BLOCK_SIZES; ++sqr_bsize) {
-          CHECK_MEM_ERROR(
-              cm, cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize],
-              vpx_calloc(mi_rows * mi_cols,
-                         sizeof(*cpi->tpl_stats[frame]
-                                     .pyramid_mv_arr[rf_idx][sqr_bsize])));
-        }
-      }
-#endif
-      CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr,
-                      vpx_calloc(mi_rows * mi_cols,
-                                 sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr)));
-      cpi->tpl_stats[frame].is_valid = 0;
-      cpi->tpl_stats[frame].width = mi_cols;
-      cpi->tpl_stats[frame].height = mi_rows;
-      cpi->tpl_stats[frame].stride = mi_cols;
-      cpi->tpl_stats[frame].mi_rows = cm->mi_rows;
-      cpi->tpl_stats[frame].mi_cols = cm->mi_cols;
-    }
-
-    for (frame = 0; frame < REF_FRAMES; ++frame) {
-      cpi->enc_frame_buf[frame].mem_valid = 0;
-      cpi->enc_frame_buf[frame].released = 1;
-    }
-  }
+  cpi->feature_score_loc_alloc = 0;
+#endif  // CONFIG_NON_GREEDY_MV
+  for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) cpi->tpl_stats[i].tpl_stats_ptr = NULL;
 
   // Allocate memory to store variances for a frame.
   CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff)));
@@ -3848,12 +3807,15 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
 
   if (svc->spatial_layer_id == svc->first_spatial_layer_to_encode) {
     svc->high_source_sad_superframe = cpi->rc.high_source_sad;
+    svc->high_num_blocks_with_motion = cpi->rc.high_num_blocks_with_motion;
     // On scene change reset temporal layer pattern to TL0.
     // Note that if the base/lower spatial layers are skipped: instead of
     // inserting base layer here, we force max-q for the next superframe
     // with lower spatial layers: this is done in vp9_encodedframe_overshoot()
     // when max-q is decided for the current layer.
-    if (svc->high_source_sad_superframe && svc->temporal_layer_id > 0) {
+    // Only do this reset for bypass/flexible mode.
+    if (svc->high_source_sad_superframe && svc->temporal_layer_id > 0 &&
+        svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
       // rc->high_source_sad will get reset so copy it to restore it.
       int tmp_high_source_sad = cpi->rc.high_source_sad;
       vp9_svc_reset_temporal_layers(cpi, cm->frame_type == KEY_FRAME);
@@ -6185,6 +6147,79 @@ static void add_nb_blocks_to_heap(VP9_COMP *cpi, const TplDepFrame *tpl_frame,
 }
 #endif  // USE_PQSORT
 #endif  // CHANGE_MV_SEARCH_ORDER
+
+// Scores each bsize block with get_feature_score(), sorts the blocks and runs
+// do_motion_search() in the order chosen by CHANGE_MV_SEARCH_ORDER/USE_PQSORT.
+static void build_motion_field(VP9_COMP *cpi, MACROBLOCKD *xd, int frame_idx,
+                               YV12_BUFFER_CONFIG *ref_frame[3],
+                               BLOCK_SIZE bsize) {
+  VP9_COMMON *cm = &cpi->common;
+  ThreadData *td = &cpi->td;
+  TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  int fs_loc_sort_size = 0;
+  int mi_row, mi_col;
+
+  tpl_frame->lambda = 250;
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+      const int mb_y_offset =
+          mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+      const int bw = 4 << b_width_log2_lookup[bsize];
+      const int bh = 4 << b_height_log2_lookup[bsize];
+      TplDepStats *tpl_stats =
+          &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
+      FEATURE_SCORE_LOC *fs_loc =
+          &cpi->feature_score_loc_arr[mi_row * tpl_frame->stride + mi_col];
+      tpl_stats->feature_score = get_feature_score(
+          xd->cur_buf->y_buffer + mb_y_offset, xd->cur_buf->y_stride, bw, bh);
+      fs_loc->visited = 0;
+      fs_loc->feature_score = tpl_stats->feature_score;
+      fs_loc->mi_row = mi_row;
+      fs_loc->mi_col = mi_col;
+      cpi->feature_score_loc_sort[fs_loc_sort_size] = fs_loc;
+      ++fs_loc_sort_size;
+    }
+  }
+
+  qsort(cpi->feature_score_loc_sort, fs_loc_sort_size,
+        sizeof(*cpi->feature_score_loc_sort), compare_feature_score);
+
+#if CHANGE_MV_SEARCH_ORDER
+#if !USE_PQSORT
+  {
+    int i;
+    for (i = 0; i < fs_loc_sort_size; ++i) {
+      FEATURE_SCORE_LOC *fs_loc = cpi->feature_score_loc_sort[i];
+      do_motion_search(cpi, td, frame_idx, ref_frame, bsize, fs_loc->mi_row,
+                       fs_loc->mi_col);
+    }
+  }
+#else   // !USE_PQSORT
+  {
+    int fs_loc_heap_size = 0;
+    max_heap_push(cpi->feature_score_loc_heap, &fs_loc_heap_size,
+                  cpi->feature_score_loc_sort[0]);
+    while (fs_loc_heap_size > 0) {
+      FEATURE_SCORE_LOC *fs_loc;
+      max_heap_pop(cpi->feature_score_loc_heap, &fs_loc_heap_size, &fs_loc);
+      fs_loc->visited = 1;
+      do_motion_search(cpi, td, frame_idx, ref_frame, bsize, fs_loc->mi_row,
+                       fs_loc->mi_col);
+      add_nb_blocks_to_heap(cpi, tpl_frame, bsize, fs_loc->mi_row,
+                            fs_loc->mi_col, &fs_loc_heap_size);
+    }
+  }
+#endif  // !USE_PQSORT
+#else   // CHANGE_MV_SEARCH_ORDER
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+      do_motion_search(cpi, td, frame_idx, ref_frame, bsize, mi_row, mi_col);
+    }
+  }
+#endif  // CHANGE_MV_SEARCH_ORDER
+}
 #endif  // CONFIG_NON_GREEDY_MV
 
 void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx,
@@ -6217,17 +6252,6 @@ void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx,
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   int64_t recon_error, sse;
-#if CONFIG_NON_GREEDY_MV
-  int rf_idx;
-  int fs_loc_sort_size;
-#if CHANGE_MV_SEARCH_ORDER
-#if USE_PQSORT
-  int fs_loc_heap_size;
-#else
-  int i;
-#endif  // USE_PQSORT
-#endif  // CHANGE_MV_SEARCH_ORDER
-#endif  // CONFIG_NON_GREEDY_MV
 
   // Setup scaling factor
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -6268,66 +6292,9 @@ void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx,
   vp9_frame_init_quantizer(cpi);
 
 #if CONFIG_NON_GREEDY_MV
-  tpl_frame->lambda = 250;
-  fs_loc_sort_size = 0;
-
-  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
-      const int mb_y_offset =
-          mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
-      const int bw = 4 << b_width_log2_lookup[bsize];
-      const int bh = 4 << b_height_log2_lookup[bsize];
-      TplDepStats *tpl_stats =
-          &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
-      FEATURE_SCORE_LOC *fs_loc =
-          &cpi->feature_score_loc_arr[mi_row * tpl_frame->stride + mi_col];
-      tpl_stats->feature_score = get_feature_score(
-          xd->cur_buf->y_buffer + mb_y_offset, xd->cur_buf->y_stride, bw, bh);
-      fs_loc->visited = 0;
-      fs_loc->feature_score = tpl_stats->feature_score;
-      fs_loc->mi_row = mi_row;
-      fs_loc->mi_col = mi_col;
-      cpi->feature_score_loc_sort[fs_loc_sort_size] = fs_loc;
-      ++fs_loc_sort_size;
-    }
-  }
-
-  qsort(cpi->feature_score_loc_sort, fs_loc_sort_size,
-        sizeof(*cpi->feature_score_loc_sort), compare_feature_score);
-
-#if CHANGE_MV_SEARCH_ORDER
-#if !USE_PQSORT
-  for (i = 0; i < fs_loc_sort_size; ++i) {
-    FEATURE_SCORE_LOC *fs_loc = cpi->feature_score_loc_sort[i];
-    do_motion_search(cpi, td, frame_idx, ref_frame, bsize, fs_loc->mi_row,
-                     fs_loc->mi_col);
-  }
-#else   // !USE_PQSORT
-  fs_loc_heap_size = 0;
-  max_heap_push(cpi->feature_score_loc_heap, &fs_loc_heap_size,
-                cpi->feature_score_loc_sort[0]);
-
-  while (fs_loc_heap_size > 0) {
-    FEATURE_SCORE_LOC *fs_loc;
-    max_heap_pop(cpi->feature_score_loc_heap, &fs_loc_heap_size, &fs_loc);
-
-    fs_loc->visited = 1;
-
-    do_motion_search(cpi, td, frame_idx, ref_frame, bsize, fs_loc->mi_row,
-                     fs_loc->mi_col);
+  build_motion_field(cpi, xd, frame_idx, ref_frame, bsize);
+#endif
 
-    add_nb_blocks_to_heap(cpi, tpl_frame, bsize, fs_loc->mi_row, fs_loc->mi_col,
-                          &fs_loc_heap_size);
-  }
-#endif  // !USE_PQSORT
-#else   // CHANGE_MV_SEARCH_ORDER
-  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
-      do_motion_search(cpi, td, frame_idx, ref_frame, bsize, mi_row, mi_col);
-    }
-  }
-#endif  // CHANGE_MV_SEARCH_ORDER
-#endif  // CONFIG_NON_GREEDY_MV
   for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
     for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
       mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame,
@@ -6341,6 +6308,7 @@ void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx,
                        bsize);
 #if CONFIG_NON_GREEDY_MV
       {
+        int rf_idx;
         TplDepStats *this_tpl_stats =
             &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
         for (rf_idx = 0; rf_idx < 3; ++rf_idx) {
@@ -6436,6 +6404,71 @@ static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames,
 #endif  // DUMP_TPL_STATS
 #endif  // CONFIG_NON_GREEDY_MV
 
+// Allocates any TPL model buffers missing or too small for the current frame
+// size and marks every enc_frame_buf entry as released with no valid memory.
+static void init_tpl_buffer(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  int frame;
+  const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+#if CONFIG_NON_GREEDY_MV
+  int sqr_bsize;
+  int rf_idx;
+
+  // TODO(angiebird): This probably needs further modifications to support
+  // frame scaling later on.
+  if (cpi->feature_score_loc_alloc == 0) {
+    CHECK_MEM_ERROR(
+        cm, cpi->feature_score_loc_arr,
+        vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->feature_score_loc_arr)));
+    CHECK_MEM_ERROR(
+        cm, cpi->feature_score_loc_sort,
+        vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->feature_score_loc_sort)));
+    CHECK_MEM_ERROR(
+        cm, cpi->feature_score_loc_heap,
+        vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->feature_score_loc_heap)));
+    cpi->feature_score_loc_alloc = 1;
+  }
+#endif
+
+  // TODO(jingning): Reduce the actual memory use for tpl model build up.
+  for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
+    if (cpi->tpl_stats[frame].width >= mi_cols &&
+        cpi->tpl_stats[frame].height >= mi_rows &&
+        cpi->tpl_stats[frame].tpl_stats_ptr)
+      continue;
+#if CONFIG_NON_GREEDY_MV
+    for (rf_idx = 0; rf_idx < 3; ++rf_idx) {
+      for (sqr_bsize = 0; sqr_bsize < SQUARE_BLOCK_SIZES; ++sqr_bsize) {
+        // Release the old per-element buffer before it is replaced below.
+        vpx_free(cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize]);
+        CHECK_MEM_ERROR(
+            cm, cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize],
+            vpx_calloc(
+                mi_rows * mi_cols,
+                sizeof(
+                    *cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize])));
+      }
+    }
+#endif
+    vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
+    CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr,
+                    vpx_calloc(mi_rows * mi_cols,
+                               sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr)));
+    cpi->tpl_stats[frame].is_valid = 0;
+    cpi->tpl_stats[frame].width = mi_cols;
+    cpi->tpl_stats[frame].height = mi_rows;
+    cpi->tpl_stats[frame].stride = mi_cols;
+    cpi->tpl_stats[frame].mi_rows = cm->mi_rows;
+    cpi->tpl_stats[frame].mi_cols = cm->mi_cols;
+  }
+
+  for (frame = 0; frame < REF_FRAMES; ++frame) {
+    cpi->enc_frame_buf[frame].mem_valid = 0;
+    cpi->enc_frame_buf[frame].released = 1;
+  }
+}
+
 static void setup_tpl_stats(VP9_COMP *cpi) {
   GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE];
   const GF_GROUP *gf_group = &cpi->twopass.gf_group;
@@ -6673,6 +6706,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   if (gf_group_index == 1 &&
       cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE &&
       cpi->sf.enable_tpl_model) {
+    init_tpl_buffer(cpi);
     vp9_estimate_qp_gop(cpi);
     setup_tpl_stats(cpi);
   }
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index c7c45e3a4..e4175f6ed 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -570,6 +570,7 @@ typedef struct VP9_COMP {
   YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES];
   EncFrameBuf enc_frame_buf[REF_FRAMES];
 #if CONFIG_NON_GREEDY_MV
+  int feature_score_loc_alloc;
   FEATURE_SCORE_LOC *feature_score_loc_arr;
   FEATURE_SCORE_LOC **feature_score_loc_sort;
   FEATURE_SCORE_LOC **feature_score_loc_heap;
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index e4a5f3e18..602147421 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -424,6 +424,7 @@ static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi,
 static void set_rt_speed_feature_framesize_independent(
     VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, vp9e_tune_content content) {
   VP9_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
   const int is_keyframe = cm->frame_type == KEY_FRAME;
   const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
   sf->static_segmentation = 0;
@@ -479,7 +480,7 @@ static void set_rt_speed_feature_framesize_independent(
     // Reference masking only enabled for 1 spatial layer, and if none of the
     // references have been scaled. The latter condition needs to be checked
     // for external or internal dynamic resize.
-    sf->reference_masking = (cpi->svc.number_spatial_layers == 1);
+    sf->reference_masking = (svc->number_spatial_layers == 1);
     if (sf->reference_masking == 1 &&
         (cpi->external_resize == 1 ||
          cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) {
@@ -615,7 +616,7 @@ static void set_rt_speed_feature_framesize_independent(
     }
     // Keep nonrd_keyframe = 1 for non-base spatial layers to prevent
     // increase in encoding time.
-    if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) sf->nonrd_keyframe = 1;
+    if (cpi->use_svc && svc->spatial_layer_id > 0) sf->nonrd_keyframe = 1;
     if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG &&
         cpi->oxcf.rc_mode == VPX_CBR)
       sf->overshoot_detection_cbr_rt = FAST_DETECTION_MAXQ;
@@ -653,7 +654,7 @@ static void set_rt_speed_feature_framesize_independent(
           (cm->width * cm->height <= 640 * 360) ? 40000 : 60000;
       if (cpi->content_state_sb_fd == NULL &&
           (!cpi->use_svc ||
-           cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
+           svc->spatial_layer_id == svc->number_spatial_layers - 1)) {
         cpi->content_state_sb_fd = (uint8_t *)vpx_calloc(
             (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t));
       }
@@ -662,7 +663,7 @@ static void set_rt_speed_feature_framesize_independent(
       // Enable short circuit for low temporal variance.
       sf->short_circuit_low_temp_var = 1;
     }
-    if (cpi->svc.temporal_layer_id > 0) {
+    if (svc->temporal_layer_id > 0) {
       sf->adaptive_rd_thresh = 4;
       sf->limit_newmv_early_exit = 0;
       sf->base_mv_aggressive = 1;
@@ -676,16 +677,15 @@ static void set_rt_speed_feature_framesize_independent(
     sf->mv.fullpel_search_step_param = 10;
     // For SVC: use better mv search on base temporal layer, and only
     // on base spatial layer if highest resolution is above 640x360.
-    if (cpi->svc.number_temporal_layers > 2 &&
-        cpi->svc.temporal_layer_id == 0 &&
-        (cpi->svc.spatial_layer_id == 0 ||
+    if (svc->number_temporal_layers > 2 && svc->temporal_layer_id == 0 &&
+        (svc->spatial_layer_id == 0 ||
          cpi->oxcf.width * cpi->oxcf.height <= 640 * 360)) {
       sf->mv.search_method = NSTEP;
       sf->mv.fullpel_search_step_param = 6;
     }
-    if (cpi->svc.temporal_layer_id > 0 || cpi->svc.spatial_layer_id > 1) {
+    if (svc->temporal_layer_id > 0 || svc->spatial_layer_id > 1) {
       sf->use_simple_block_yrd = 1;
-      if (cpi->svc.non_reference_frame)
+      if (svc->non_reference_frame)
         sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE;
     }
     if (cpi->use_svc && cpi->row_mt && cpi->oxcf.max_threads > 1)
@@ -696,28 +696,28 @@ static void set_rt_speed_feature_framesize_independent(
     if (!cpi->last_frame_dropped && cpi->resize_state == ORIG &&
         !cpi->external_resize &&
         (!cpi->use_svc ||
-         (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 &&
-          !cpi->svc.last_layer_dropped[cpi->svc.number_spatial_layers - 1]))) {
+         (svc->spatial_layer_id == svc->number_spatial_layers - 1 &&
+          !svc->last_layer_dropped[svc->number_spatial_layers - 1]))) {
       sf->copy_partition_flag = 1;
       cpi->max_copied_frame = 2;
       // The top temporal enhancement layer (for number of temporal layers > 1)
       // are non-reference frames, so use large/max value for max_copied_frame.
-      if (cpi->svc.number_temporal_layers > 1 &&
-          cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1)
+      if (svc->number_temporal_layers > 1 &&
+          svc->temporal_layer_id == svc->number_temporal_layers - 1)
         cpi->max_copied_frame = 255;
     }
     // For SVC: enable use of lower resolution partition for higher resolution,
     // only for 3 spatial layers and when config/top resolution is above VGA.
     // Enable only for non-base temporal layer frames.
-    if (cpi->use_svc && cpi->svc.use_partition_reuse &&
-        cpi->svc.number_spatial_layers == 3 && cpi->svc.temporal_layer_id > 0 &&
+    if (cpi->use_svc && svc->use_partition_reuse &&
+        svc->number_spatial_layers == 3 && svc->temporal_layer_id > 0 &&
         cpi->oxcf.width * cpi->oxcf.height > 640 * 480)
       sf->svc_use_lowres_part = 1;
     // For SVC when golden is used as second temporal reference: to avoid
     // encode time increase only use this feature on base temporal layer.
     // (i.e remove golden flag from frame_flags for temporal_layer_id > 0).
-    if (cpi->use_svc && cpi->svc.use_gf_temporal_ref_current_layer &&
-        cpi->svc.temporal_layer_id > 0)
+    if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer &&
+        svc->temporal_layer_id > 0)
       cpi->ref_frame_flags &= (~VP9_GOLD_FLAG);
   }
 
@@ -789,17 +789,18 @@ static void set_rt_speed_feature_framesize_independent(
           (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
                                 sizeof(*cpi->count_lastgolden_frame_usage));
   }
-  if (cpi->svc.previous_frame_is_intra_only) {
+  if (svc->previous_frame_is_intra_only) {
     sf->partition_search_type = FIXED_PARTITION;
     sf->always_this_block_size = BLOCK_64X64;
   }
   // Special case for screen content: increase motion search on base spatial
   // layer when high motion is detected or previous SL0 frame was dropped.
   if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && cpi->oxcf.speed >= 5 &&
-      cpi->svc.spatial_layer_id == 0 &&
-      (cpi->rc.high_num_blocks_with_motion || cpi->svc.last_layer_dropped[0])) {
+      (svc->high_num_blocks_with_motion || svc->last_layer_dropped[0])) {
     sf->mv.search_method = NSTEP;
-    sf->mv.fullpel_search_step_param = 2;
+    // TODO(marpan/jianj): Tune this setting for screensharing. For now use
+    // larger step_param for non-base layer, to avoid increase in encode time.
+    sf->mv.fullpel_search_step_param = (svc->spatial_layer_id == 0) ? 2 : 4;
   }
 }
 
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 510087580..df5af6dc9 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -884,7 +884,10 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
       svc->non_reference_frame = 0;
   }
 
-  if (svc->spatial_layer_id == 0) svc->high_source_sad_superframe = 0;
+  if (svc->spatial_layer_id == 0) {
+    svc->high_source_sad_superframe = 0;
+    svc->high_num_blocks_with_motion = 0;
+  }
 
   if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
       svc->last_layer_dropped[svc->spatial_layer_id] &&
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index f1f2457b2..c25644617 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -143,10 +143,11 @@ typedef struct SVC {
 
   INTER_LAYER_PRED disable_inter_layer_pred;
 
-  // Flag to indicate scene change at current superframe, scene detection is
-  // currently checked for each superframe prior to encoding, on the full
-  // resolution source.
+  // Flags to indicate scene change and high num of motion blocks at current
+  // superframe. Scene detection is currently checked for each superframe prior
+  // to encoding, on the full resolution source.
   int high_source_sad_superframe;
+  int high_num_blocks_with_motion;
 
   // Flags used to get SVC pattern info.
   int update_buffer_slot[VPX_SS_MAX_LAYERS];