11 files changed, 299 insertions, 205 deletions
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 8464882ea..88b67cf5e 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1445,40 +1445,6 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
   vp9_extend_frame_borders(dst);
 }
 
-#define WRITE_RECON_BUFFER 0
-#if WRITE_RECON_BUFFER
-void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-  FILE *yframe;
-  int i;
-  char filename[255];
-
-  snprintf(filename, sizeof(filename), "cx\\y%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->y_height; i++)
-    fwrite(frame->y_buffer + i * frame->y_stride,
-           frame->y_width, 1, yframe);
-
-  fclose(yframe);
-  snprintf(filename, sizeof(filename), "cx\\u%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->u_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-  snprintf(filename, sizeof(filename), "cx\\v%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->v_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-}
-#endif
-
 // Function to test for conditions that indicate we should loop
 // back and recode a frame.
 static int recode_loop_test(const VP9_COMP *cpi,
@@ -2013,18 +1979,16 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
   }
 }
 
-static void configure_skippable_frame(VP9_COMP *cpi) {
+static int is_skippable_frame(const VP9_COMP *cpi) {
   // If the current frame does not have non-zero motion vector detected in the
   // first  pass, and so do its previous and forward frames, then this frame
   // can be skipped for partition check, and the partition size is assigned
   // according to the variance
+  const SVC *const svc = &cpi->svc;
+  const TWO_PASS *const twopass = is_spatial_svc(cpi) ?
+      &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
 
-  SVC *const svc = &cpi->svc;
-  TWO_PASS *const twopass = is_spatial_svc(cpi) ?
-                            &svc->layer_context[svc->spatial_layer_id].twopass
-                            : &cpi->twopass;
-
-  cpi->skippable_frame = (!frame_is_intra_only(&cpi->common) &&
+  return (!frame_is_intra_only(&cpi->common) &&
     twopass->stats_in - 2 > twopass->stats_in_start &&
     twopass->stats_in < twopass->stats_in_end &&
     (twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion
@@ -2198,7 +2162,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   // second pass according to the first pass stats
   if (oxcf->pass == 2 &&
       (!cpi->use_svc || is_spatial_svc(cpi))) {
-    configure_skippable_frame(cpi);
+    cpi->skippable_frame = is_skippable_frame(cpi);
   }
 
   // For 1 pass CBR, check if we are dropping this frame.
@@ -2280,27 +2244,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 
   cm->frame_to_show = get_frame_new_buffer(cm);
 
-#if WRITE_RECON_BUFFER
-  if (cm->show_frame)
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame);
-  else
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 1000);
-#endif
-
   // Pick the loop filter level for the frame.
   loopfilter_frame(cpi, cm);
 
-#if WRITE_RECON_BUFFER
-  if (cm->show_frame)
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 2000);
-  else
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 3000);
-#endif
-
   // build the bitstream
   cpi->dummy_packing = 0;
   vp9_pack_bitstream(cpi, dest, size);
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 2dba67c54..ace673457 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -523,6 +523,10 @@ static INLINE int get_chessboard_index(const int frame_index) {
   return frame_index & 0x1;
 }
 
+static INLINE int *cond_sad_list(const struct VP9_COMP *cpi, int *sad_list) {
+  return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? sad_list : NULL;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 6e04e2a9c..b8e716492 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -34,6 +34,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
   const int tmp_row_min = x->mv_row_min;
   const int tmp_row_max = x->mv_row_max;
   MV ref_full;
+  int sad_list[5];
 
   // Further step/diamond searches as necessary
   int step_param = mv_sf->reduce_first_step_size;
@@ -45,8 +46,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
   ref_full.row = ref_mv->row >> 3;
 
   /*cpi->sf.search_method == HEX*/
-  vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0, &v_fn_ptr, 0,
-                 ref_mv, dst_mv);
+  vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
+                 cond_sad_list(cpi, sad_list),
+                 &v_fn_ptr, 0, ref_mv, dst_mv);
 
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
@@ -55,8 +57,10 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
     unsigned int sse;
     cpi->find_fractional_mv_step(
         x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
-        &v_fn_ptr, 0, mv_sf->subpel_iters_per_step, NULL, NULL, &distortion,
-        &sse, NULL, 0, 0);
+        &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
+        cond_sad_list(cpi, sad_list),
+        NULL, NULL,
+        &distortion, &sse, NULL, 0, 0);
   }
 
   xd->mi[0]->mbmi.mode = NEWMV;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index ae924d596..d6f6b2563 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -256,6 +256,137 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
     }                                                   \
   }
 
+#define SETUP_SUBPEL_SEARCH                                                \
+  const uint8_t *const z = x->plane[0].src.buf;                            \
+  const int src_stride = x->plane[0].src.stride;                           \
+  const MACROBLOCKD *xd = &x->e_mbd;                                       \
+  unsigned int besterr = INT_MAX;                                          \
+  unsigned int sse;                                                        \
+  unsigned int whichdir;                                                   \
+  int thismse;                                                             \
+  const unsigned int halfiters = iters_per_step;                           \
+  const unsigned int quarteriters = iters_per_step;                        \
+  const unsigned int eighthiters = iters_per_step;                         \
+  const int y_stride = xd->plane[0].pre[0].stride;                         \
+  const int offset = bestmv->row * y_stride + bestmv->col;                 \
+  const uint8_t *const y = xd->plane[0].pre[0].buf;                        \
+                                                                           \
+  int rr = ref_mv->row;                                                    \
+  int rc = ref_mv->col;                                                    \
+  int br = bestmv->row * 8;                                                \
+  int bc = bestmv->col * 8;                                                \
+  int hstep = 4;                                                           \
+  const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);           \
+  const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);           \
+  const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);           \
+  const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);           \
+  int tr = br;                                                             \
+  int tc = bc;                                                             \
+                                                                           \
+  bestmv->row *= 8;                                                        \
+  bestmv->col *= 8;                                                        \
+  if (second_pred != NULL) {                                               \
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);                \
+    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \
+    besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);                  \
+  } else {                                                                 \
+    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);          \
+  }                                                                        \
+  *distortion = besterr;                                                   \
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
+                                        MV *bestmv, const MV *ref_mv,
+                                        int allow_hp,
+                                        int error_per_bit,
+                                        const vp9_variance_fn_ptr_t *vfp,
+                                        int forced_stop,
+                                        int iters_per_step,
+                                        int *sad_list,
+                                        int *mvjcost, int *mvcost[2],
+                                        int *distortion,
+                                        unsigned int *sse1,
+                                        const uint8_t *second_pred,
+                                        int w, int h) {
+  SETUP_SUBPEL_SEARCH;
+
+  if (sad_list &&
+      sad_list[0] != INT_MAX && sad_list[1] != INT_MAX &&
+      sad_list[2] != INT_MAX && sad_list[3] != INT_MAX &&
+      sad_list[4] != INT_MAX) {
+    unsigned int left, right, up, down, diag;
+    whichdir = (sad_list[1] < sad_list[3] ? 0 : 1) +
+               (sad_list[2] < sad_list[4] ? 0 : 2);
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(left, tr, tc - hstep);
+        CHECK_BETTER(up, tr - hstep, tc);
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(right, tr, tc + hstep);
+        CHECK_BETTER(up, tr - hstep, tc);
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(left, tr, tc - hstep);
+        CHECK_BETTER(down, tr + hstep, tc);
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(right, tr, tc + hstep);
+        CHECK_BETTER(down, tr + hstep, tc);
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+  } else {
+    FIRST_LEVEL_CHECKS;
+    if (halfiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  tr = br;
+  tc = bc;
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration could be 2 ( if diag selected) 1/4 pel
+
+  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+  if (forced_stop != 2) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (quarteriters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+
+  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (eighthiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+  // These lines insure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
 int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
                                  MV *bestmv, const MV *ref_mv,
                                  int allow_hp,
@@ -263,55 +394,14 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
                                  const vp9_variance_fn_ptr_t *vfp,
                                  int forced_stop,
                                  int iters_per_step,
+                                 int *sad_list,
                                  int *mvjcost, int *mvcost[2],
                                  int *distortion,
                                  unsigned int *sse1,
                                  const uint8_t *second_pred,
                                  int w, int h) {
-  const uint8_t *const z = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  const MACROBLOCKD *xd = &x->e_mbd;
-  unsigned int besterr = INT_MAX;
-  unsigned int sse;
-  unsigned int whichdir;
-  int thismse;
-  const unsigned int halfiters = iters_per_step;
-  const unsigned int quarteriters = iters_per_step;
-  const unsigned int eighthiters = iters_per_step;
-
-  const int y_stride = xd->plane[0].pre[0].stride;
-  const int offset = bestmv->row * y_stride + bestmv->col;
-  const uint8_t *const y = xd->plane[0].pre[0].buf;
-
-  int rr = ref_mv->row;
-  int rc = ref_mv->col;
-  int br = bestmv->row * 8;
-  int bc = bestmv->col * 8;
-  int hstep = 4;
-  const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
-  const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
-  const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
-  const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
-
-  int tr = br;
-  int tc = bc;
-
-  // central mv
-  bestmv->row *= 8;
-  bestmv->col *= 8;
-
-  // calculate central point error
-  // TODO(yunqingwang): central pointer error was already calculated in full-
-  // pixel search, and can be passed in this function.
-  if (second_pred != NULL) {
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
-    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
-    besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
-  } else {
-    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);
-  }
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  SETUP_SUBPEL_SEARCH;
+  (void) sad_list;  // to silence compiler warning
 
   // Each subsequent iteration checks at least one point in
   // common with the last iteration could be 2 ( if diag selected)
@@ -398,14 +488,17 @@ static INLINE int is_mv_in(const MACROBLOCK *x, const MV *mv) {
 // Each scale can have a different number of candidates and shape of
 // candidates as indicated in the num_candidates and candidates arrays
 // passed into this function
+//
 static int vp9_pattern_search(const MACROBLOCK *x,
                               MV *ref_mv,
                               int search_param,
                               int sad_per_bit,
-                              int do_init_search, int do_refine,
+                              int do_init_search,
+                              int *sad_list,
                               const vp9_variance_fn_ptr_t *vfp,
                               int use_mvcost,
-                              const MV *center_mv, MV *best_mv,
+                              const MV *center_mv,
+                              MV *best_mv,
                               const int num_candidates[MAX_PATTERN_SCALES],
                               const MV candidates[MAX_PATTERN_SCALES]
                                                  [MAX_PATTERN_CANDIDATES]) {
@@ -413,7 +506,7 @@ static int vp9_pattern_search(const MACROBLOCK *x,
   static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
     10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
   };
-  int i, j, s, t;
+  int i, s, t;
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   int br, bc;
@@ -552,47 +645,38 @@ static int vp9_pattern_search(const MACROBLOCK *x,
     } while (s--);
   }
 
-  // Check 4 1-away neighbors if do_refine is true.
-  // For most well-designed schemes do_refine will not be necessary.
-  if (do_refine) {
-    static const MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
-
-    for (j = 0; j < 16; j++) {
-      int best_site = -1;
-      if (check_bounds(x, br, bc, 1)) {
-        for (i = 0; i < 4; i++) {
-          const MV this_mv = {br + neighbors[i].row,
-                              bc + neighbors[i].col};
-          thissad = vfp->sdf(what->buf, what->stride,
-                             get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride);
-          CHECK_BETTER
-        }
-      } else {
-        for (i = 0; i < 4; i++) {
-          const MV this_mv = {br + neighbors[i].row,
-                              bc + neighbors[i].col};
-          if (!is_mv_in(x, &this_mv))
-            continue;
-          thissad = vfp->sdf(what->buf, what->stride,
-                             get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride);
-          CHECK_BETTER
-        }
+  // Returns the one-away integer pel sad values around the best as follows:
+  // sad_list[0]: sad at the best integer pel
+  // sad_list[1]: sad at delta {0, -1} (left)   from the best integer pel
+  // sad_list[2]: sad at delta {-1, 0} (top)    from the best integer pel
+  // sad_list[3]: sad at delta { 0, 1} (right)  from the best integer pel
+  // sad_list[4]: sad at delta { 1, 0} (bottom) from the best integer pel
+  if (sad_list) {
+    static const MV neighbors[4] = {{0, -1}, {-1, 0}, {0, 1}, {1, 0}};
+    sad_list[0] = bestsad;
+    if (check_bounds(x, br, bc, 1)) {
+      for (i = 0; i < 4; i++) {
+        const MV this_mv = {br + neighbors[i].row,
+                            bc + neighbors[i].col};
+        sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
+                                   get_buf_from_mv(in_what, &this_mv),
+                                   in_what->stride);
       }
-
-      if (best_site == -1) {
-        break;
-      } else {
-        br += neighbors[best_site].row;
-        bc += neighbors[best_site].col;
+    } else {
+      for (i = 0; i < 4; i++) {
+        const MV this_mv = {br + neighbors[i].row,
+                            bc + neighbors[i].col};
+        if (!is_mv_in(x, &this_mv))
+          sad_list[i + 1] = INT_MAX;
+        else
+          sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
+                                     get_buf_from_mv(in_what, &this_mv),
+                                     in_what->stride);
       }
     }
   }
-
   best_mv->row = br;
   best_mv->col = bc;
-
   return bestsad;
 }
 
@@ -634,6 +718,7 @@ int vp9_hex_search(const MACROBLOCK *x,
                    int search_param,
                    int sad_per_bit,
                    int do_init_search,
+                   int *sad_list,
                    const vp9_variance_fn_ptr_t *vfp,
                    int use_mvcost,
                    const MV *center_mv, MV *best_mv) {
@@ -658,7 +743,7 @@ int vp9_hex_search(const MACROBLOCK *x,
       { -1024, 0}},
   };
   return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
-                            do_init_search, 0, vfp, use_mvcost,
+                            do_init_search, sad_list, vfp, use_mvcost,
                             center_mv, best_mv,
                             hex_num_candidates, hex_candidates);
 }
@@ -668,6 +753,7 @@ int vp9_bigdia_search(const MACROBLOCK *x,
                       int search_param,
                       int sad_per_bit,
                       int do_init_search,
+                      int *sad_list,
                       const vp9_variance_fn_ptr_t *vfp,
                       int use_mvcost,
                       const MV *center_mv,
@@ -699,7 +785,7 @@ int vp9_bigdia_search(const MACROBLOCK *x,
       {-512, 512}, {-1024, 0}},
   };
   return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
-                            do_init_search, 0, vfp, use_mvcost,
+                            do_init_search, sad_list, vfp, use_mvcost,
                             center_mv, best_mv,
                             bigdia_num_candidates, bigdia_candidates);
 }
@@ -709,6 +795,7 @@ int vp9_square_search(const MACROBLOCK *x,
                       int search_param,
                       int sad_per_bit,
                       int do_init_search,
+                      int *sad_list,
                       const vp9_variance_fn_ptr_t *vfp,
                       int use_mvcost,
                       const MV *center_mv,
@@ -740,7 +827,7 @@ int vp9_square_search(const MACROBLOCK *x,
       {0, 1024}, {-1024, 1024}, {-1024, 0}},
   };
   return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
-                            do_init_search, 0, vfp, use_mvcost,
+                            do_init_search, sad_list, vfp, use_mvcost,
                             center_mv, best_mv,
                             square_num_candidates, square_candidates);
 }
@@ -750,12 +837,13 @@ int vp9_fast_hex_search(const MACROBLOCK *x,
                         int search_param,
                         int sad_per_bit,
                         int do_init_search,  // must be zero for fast_hex
+                        int *sad_list,
                         const vp9_variance_fn_ptr_t *vfp,
                         int use_mvcost,
                         const MV *center_mv,
                         MV *best_mv) {
   return vp9_hex_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
-                        sad_per_bit, do_init_search, vfp, use_mvcost,
+                        sad_per_bit, do_init_search, sad_list, vfp, use_mvcost,
                         center_mv, best_mv);
 }
 
@@ -764,13 +852,14 @@ int vp9_fast_dia_search(const MACROBLOCK *x,
                         int search_param,
                         int sad_per_bit,
                         int do_init_search,
+                        int *sad_list,
                         const vp9_variance_fn_ptr_t *vfp,
                         int use_mvcost,
                         const MV *center_mv,
                         MV *best_mv) {
   return vp9_bigdia_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
-                           sad_per_bit, do_init_search, vfp, use_mvcost,
-                           center_mv, best_mv);
+                           sad_per_bit, do_init_search, sad_list, vfp,
+                           use_mvcost, center_mv, best_mv);
 }
 
 #undef CHECK_BETTER
@@ -1368,33 +1457,41 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x,
 int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
                           BLOCK_SIZE bsize, MV *mvp_full,
                           int step_param, int error_per_bit,
+                          int *sad_list,
                           const MV *ref_mv, MV *tmp_mv,
                           int var_max, int rd) {
   const SPEED_FEATURES *const sf = &cpi->sf;
   const SEARCH_METHODS method = sf->mv.search_method;
   vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
   int var = 0;
+  if (sad_list) {
+    sad_list[0] = INT_MAX;
+    sad_list[1] = INT_MAX;
+    sad_list[2] = INT_MAX;
+    sad_list[3] = INT_MAX;
+    sad_list[4] = INT_MAX;
+  }
 
   switch (method) {
     case FAST_DIAMOND:
       var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
-                                fn_ptr, 1, ref_mv, tmp_mv);
+                                sad_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case FAST_HEX:
       var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
-                                fn_ptr, 1, ref_mv, tmp_mv);
+                                sad_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case HEX:
       var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1,
-                           fn_ptr, 1, ref_mv, tmp_mv);
+                           sad_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case SQUARE:
       var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1,
-                              fn_ptr, 1, ref_mv, tmp_mv);
+                              sad_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case BIGDIA:
       var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
-                              fn_ptr, 1, ref_mv, tmp_mv);
+                              sad_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case NSTEP:
       var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 298fbb6c9..9b4734a6f 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -79,6 +79,7 @@ typedef int (integer_mv_pattern_search_fn) (
     int search_param,
     int error_per_bit,
     int do_init_search,
+    int *sad_list,
     const vp9_variance_fn_ptr_t *vf,
     int use_mvcost,
     const MV *center_mv,
@@ -98,12 +99,14 @@ typedef int (fractional_mv_step_fp) (
     const vp9_variance_fn_ptr_t *vfp,
     int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
     int iters_per_step,
+    int *sad_list,
     int *mvjcost, int *mvcost[2],
     int *distortion, unsigned int *sse1,
     const uint8_t *second_pred,
     int w, int h);
 
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned;
 
 typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
                                     const MV *ref_mv, int sad_per_bit,
@@ -136,8 +139,10 @@ struct VP9_COMP;
 int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x,
                           BLOCK_SIZE bsize, MV *mvp_full,
                           int step_param, int error_per_bit,
+                          int *sad_list,
                           const MV *ref_mv, MV *tmp_mv,
                           int var_max, int rd);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 7a7bb2824..28d8302fa 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -126,6 +126,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   const int tmp_row_min = x->mv_row_min;
   const int tmp_row_max = x->mv_row_max;
   int rv = 0;
+  int sad_list[5];
   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
                                                                         ref);
   if (cpi->common.show_frame &&
@@ -152,8 +153,9 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   mvp_full.col >>= 3;
   mvp_full.row >>= 3;
 
-  vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, &ref_mv,
-                        &tmp_mv->as_mv, INT_MAX, 0);
+  vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+                        cond_sad_list(cpi, sad_list),
+                        &ref_mv, &tmp_mv->as_mv, INT_MAX, 0);
 
   x->mv_col_min = tmp_col_min;
   x->mv_col_max = tmp_col_max;
@@ -179,6 +181,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                  &cpi->fn_ptr[bsize],
                                  cpi->sf.mv.subpel_force_stop,
                                  cpi->sf.mv.subpel_iters_per_step,
+                                 cond_sad_list(cpi, sad_list),
                                  x->nmvjointcost, x->mvcost,
                                  &dis, &x->pred_sse[ref], NULL, 0, 0);
     x->pred_mv[ref] = tmp_mv->as_mv;
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 2841efabe..b9e44088d 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -364,18 +364,15 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
                  int ref_frame, BLOCK_SIZE block_size) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  int_mv this_mv;
   int i;
   int zero_seen = 0;
   int best_index = 0;
   int best_sad = INT_MAX;
   int this_sad = INT_MAX;
   int max_mv = 0;
-
   uint8_t *src_y_ptr = x->plane[0].src.buf;
   uint8_t *ref_y_ptr;
-  int row_offset, col_offset;
-  int num_mv_refs = MAX_MV_REF_CANDIDATES +
+  const int num_mv_refs = MAX_MV_REF_CANDIDATES +
                     (cpi->sf.adaptive_motion_search &&
                      cpi->common.show_frame &&
                      block_size < cpi->sf.max_partition_size);
@@ -387,19 +384,16 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
 
   // Get the sad for each candidate reference mv.
   for (i = 0; i < num_mv_refs; ++i) {
-    this_mv.as_mv = pred_mv[i];
+    const MV *this_mv = &pred_mv[i];
 
-    max_mv = MAX(max_mv,
-                 MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
-    // Only need to check zero mv once.
-    if (!this_mv.as_int && zero_seen)
+    max_mv = MAX(max_mv, MAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
+    if (is_zero_mv(this_mv) && zero_seen)
       continue;
 
-    zero_seen = zero_seen || !this_mv.as_int;
+    zero_seen |= is_zero_mv(this_mv);
 
-    row_offset = this_mv.as_mv.row >> 3;
-    col_offset = this_mv.as_mv.col >> 3;
-    ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset;
+    ref_y_ptr =
+        &ref_y_buffer[ref_y_stride * (this_mv->row >> 3) + (this_mv->col >> 3)];
 
     // Find sad for current vector.
     this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 5f1b0a515..2efa3db52 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -171,19 +171,43 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
   int64_t dist_sum = 0;
   const int ref = xd->mi[0]->mbmi.ref_frame[0];
   unsigned int sse;
+  unsigned int var = 0;
   const int shift = 8;
+  int rate;
+  int64_t dist;
+
+  x->pred_sse[ref] = 0;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     struct macroblock_plane *const p = &x->plane[i];
     struct macroblockd_plane *const pd = &xd->plane[i];
     const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
-
-    const unsigned int var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
-                                                pd->dst.buf, pd->dst.stride,
-                                                &sse);
+    const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+    const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size];
+    int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
+    int bh = 1 << (b_height_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
+    int idx, idy;
+    int lw = b_width_log2_lookup[unit_size] + 2;
+    int lh = b_height_log2_lookup[unit_size] + 2;
+
+    x->bsse[i] = 0;
+
+    for (idy = 0; idy < bh; ++idy) {
+      for (idx = 0; idx < bw; ++idx) {
+        uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
+        uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lh);
+
+        var += cpi->fn_ptr[unit_size].vf(src , p->src.stride,
+                                         dst, pd->dst.stride, &sse);
+
+        x->bsse[i] += sse;
+        if (i == 0)
+          x->pred_sse[ref] += sse;
+      }
+    }
 
     if (!x->select_tx_size) {
-      if (sse < p->quant_thred[0] >> shift)
+      if (x->bsse[i] < p->quant_thred[0] >> shift)
         x->skip_txfm[i] = 1;
       else if (var < p->quant_thred[1] >> shift)
         x->skip_txfm[i] = 2;
@@ -191,10 +215,6 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
         x->skip_txfm[i] = 0;
     }
 
-    x->bsse[i] = sse;
-    if (i == 0)
-      x->pred_sse[ref] = sse;
-
     // Fast approximate the modelling function.
     if (cpi->oxcf.speed > 4) {
       int64_t rate;
@@ -210,9 +230,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
       rate_sum += rate;
       dist_sum += dist;
     } else {
-      int rate;
-      int64_t dist;
-      vp9_model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
+      vp9_model_rd_from_var_lapndz(x->bsse[i], 1 << num_pels_log2_lookup[bs],
                                    pd->dequant[1] >> 3, &rate, &dist);
       rate_sum += rate;
       dist_sum += dist;
@@ -1360,6 +1378,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
           int sadpb = x->sadperbit4;
           MV mvp_full;
           int max_mv;
+          int sad_list[5];
 
           /* Is the best so far sufficiently good that we cant justify doing
            * and new motion search. */
@@ -1403,9 +1422,11 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
           vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
 
-          bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
-                                          sadpb, &bsi->ref_mv[0]->as_mv, new_mv,
-                                          INT_MAX, 1);
+          bestsme = vp9_full_pixel_search(
+              cpi, x, bsize, &mvp_full, step_param, sadpb,
+              cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? sad_list : NULL,
+              &bsi->ref_mv[0]->as_mv, new_mv,
+              INT_MAX, 1);
 
           // Should we do a full search (best quality only)
           if (cpi->oxcf.mode == BEST) {
@@ -1417,6 +1438,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                            sadpb, 16, &cpi->fn_ptr[bsize],
                                            &bsi->ref_mv[0]->as_mv,
                                            &best_mv->as_mv);
+            sad_list[1] = sad_list[2] = sad_list[3] = sad_list[4] = INT_MAX;
             if (thissme < bestsme) {
               bestsme = thissme;
               *new_mv = best_mv->as_mv;
@@ -1429,17 +1451,19 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
           if (bestsme < INT_MAX) {
             int distortion;
-            cpi->find_fractional_mv_step(x,
-                                         new_mv,
-                                         &bsi->ref_mv[0]->as_mv,
-                                         cm->allow_high_precision_mv,
-                                         x->errorperbit, &cpi->fn_ptr[bsize],
-                                         cpi->sf.mv.subpel_force_stop,
-                                         cpi->sf.mv.subpel_iters_per_step,
-                                         x->nmvjointcost, x->mvcost,
-                                         &distortion,
-                                         &x->pred_sse[mbmi->ref_frame[0]],
-                                         NULL, 0, 0);
+            cpi->find_fractional_mv_step(
+                x,
+                new_mv,
+                &bsi->ref_mv[0]->as_mv,
+                cm->allow_high_precision_mv,
+                x->errorperbit, &cpi->fn_ptr[bsize],
+                cpi->sf.mv.subpel_force_stop,
+                cpi->sf.mv.subpel_iters_per_step,
+                cond_sad_list(cpi, sad_list),
+                x->nmvjointcost, x->mvcost,
+                &distortion,
+                &x->pred_sse[mbmi->ref_frame[0]],
+                NULL, 0, 0);
 
             // save motion search result for use in compound prediction
             seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
@@ -1767,6 +1791,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   int tmp_col_max = x->mv_col_max;
   int tmp_row_min = x->mv_row_min;
   int tmp_row_max = x->mv_row_max;
+  int sad_list[5];
 
   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
                                                                         ref);
@@ -1839,6 +1864,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   mvp_full.row >>= 3;
 
   bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+                                  cond_sad_list(cpi, sad_list),
                                   &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
 
   x->mv_col_min = tmp_col_min;
@@ -1854,6 +1880,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                  &cpi->fn_ptr[bsize],
                                  cpi->sf.mv.subpel_force_stop,
                                  cpi->sf.mv.subpel_iters_per_step,
+                                 cond_sad_list(cpi, sad_list),
                                  x->nmvjointcost, x->mvcost,
                                  &dis, &x->pred_sse[ref], NULL, 0, 0);
   }
@@ -1978,6 +2005,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
           x->errorperbit,
           &cpi->fn_ptr[bsize],
           0, cpi->sf.mv.subpel_iters_per_step,
+          NULL,
           x->nmvjointcost, x->mvcost,
           &dis, &sse, second_pred,
           pw, ph);
@@ -2130,7 +2158,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
   int pred_exists = 0;
   int intpel_mv;
-  int64_t rd, best_rd = INT64_MAX;
+  int64_t rd, tmp_rd, best_rd = INT64_MAX;
   int best_needs_copy = 0;
   uint8_t *orig_dst[MAX_MB_PLANE];
   int orig_dst_stride[MAX_MB_PLANE];
@@ -2220,6 +2248,10 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
    * if the first is known */
   *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
 
+  if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
+      mbmi->mode != NEARESTMV)
+    return INT64_MAX;
+
   pred_exists = 0;
   // Are all MVs integer pel for Y and UV
   intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
@@ -2315,6 +2347,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
             (cm->interp_filter != SWITCHABLE &&
              cm->interp_filter == mbmi->interp_filter)) {
           pred_exists = 1;
+          tmp_rd = best_rd;
         }
       }
       restore_dst_buf(xd, orig_dst, orig_dst_stride);
@@ -2333,17 +2366,19 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         xd->plane[i].dst.stride = 64;
       }
     }
+    rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0);
   } else {
+    int tmp_rate;
+    int64_t tmp_dist;
     // Handles the special case when a filter that is not in the
-    // switchable list (ex. bilinear, 6-tap) is indicated at the frame level
+    // switchable list (ex. bilinear) is indicated at the frame level, or
+    // skip condition holds.
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+    model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
+    rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
   }
 
   if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
-    int tmp_rate;
-    int64_t tmp_dist;
-    model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
-    rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
     // if current pred_error modeled rd is substantially more than the best
     // so far, do not bother doing full rd
     if (rd / 2 > ref_best_rd) {
@@ -2353,7 +2388,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   }
 
   if (cm->interp_filter == SWITCHABLE)
-    *rate2 += vp9_get_switchable_rate(cpi);
+    *rate2 += rs;
 
   if (!is_comp_pred) {
     if (cpi->allow_encode_breakout)
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 879c83c08..0afcde535 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -436,6 +436,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
 
   if (sf->mv.subpel_search_method == SUBPEL_TREE) {
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
+  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned;
   }
 
   cpi->mb.optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index e2e5c1e99..46eedc147 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -40,6 +40,7 @@ typedef enum {
 
 typedef enum {
   SUBPEL_TREE = 0,
+  SUBPEL_TREE_PRUNED = 1,
   // Other methods to come
 } SUBPEL_SEARCH_METHODS;
 
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 076d77683..045e3590a 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -145,6 +145,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
   int bestsme = INT_MAX;
   int distortion;
   unsigned int sse;
+  int sad_list[5];
 
   MV best_ref_mv1 = {0, 0};
   MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
@@ -168,6 +169,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
 
   // Ignore mv costing by sending NULL pointer instead of cost arrays
   vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
+                 cond_sad_list(cpi, sad_list),
                  &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv);
 
   // Ignore mv costing by sending NULL pointer instead of cost array
@@ -177,6 +179,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
                                          x->errorperbit,
                                          &cpi->fn_ptr[BLOCK_16X16],
                                          0, mv_sf->subpel_iters_per_step,
+                                         cond_sad_list(cpi, sad_list),
                                          NULL, NULL,
                                          &distortion, &sse, NULL, 0, 0);