12 files changed, 291 insertions, 559 deletions
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 147743e8d..bbdfbb823 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -145,6 +145,11 @@ struct macroblock {
 
   uint8_t sb_is_skin;
 
+  // Used to save the status of whether a block has a low variance in
+  // choose_partitioning. 0 for 64x64, 1 2 for 64x32, 3 4 for 32x64, 5~8 for
+  // 32x32.
+  uint8_t variance_low[9];
+
   void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
   void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
 #if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 67069e7c1..e3570504e 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -747,6 +747,8 @@ static int choose_partitioning(VP9_COMP *cpi,
   const uint8_t *d;
   int sp;
   int dp;
+  // Ref frame used in partitioning.
+  MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
   int pixels_wide = 64, pixels_high = 64;
   int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
       cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]};
@@ -771,6 +773,10 @@ static int choose_partitioning(VP9_COMP *cpi,
     }
   }
 
+  for (i = 0; i < 9; i++) {
+    x->variance_low[i] = 0;
+  }
+
   if (xd->mb_to_right_edge < 0)
     pixels_wide += (xd->mb_to_right_edge >> 3);
   if (xd->mb_to_bottom_edge < 0)
@@ -831,8 +837,10 @@ static int choose_partitioning(VP9_COMP *cpi,
       mi->ref_frame[0] = GOLDEN_FRAME;
       mi->mv[0].as_int = 0;
       y_sad = y_sad_g;
+      ref_frame_partition = GOLDEN_FRAME;
     } else {
       x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
+      ref_frame_partition = LAST_FRAME;
     }
 
     set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
@@ -1073,6 +1081,34 @@ static int choose_partitioning(VP9_COMP *cpi,
       }
     }
   }
+
+  if (cpi->sf.short_circuit_low_temp_var) {
+    // Set low variance flag, only for blocks >= 32x32 and if LAST_FRAME was
+    // selected.
+    if (ref_frame_partition == LAST_FRAME) {
+      if (xd->mi[0]->sb_type == BLOCK_64X64 &&
+          vt.part_variances.none.variance < (thresholds[0] >> 1)) {
+        x->variance_low[0] = 1;
+      } else if (xd->mi[0]->sb_type == BLOCK_64X32) {
+        if (vt.part_variances.horz[0].variance < (thresholds[0] >> 2))
+          x->variance_low[1] = 1;
+        if (vt.part_variances.horz[1].variance < (thresholds[0] >> 2))
+          x->variance_low[2] = 1;
+      } else if (xd->mi[0]->sb_type == BLOCK_32X64) {
+        if (vt.part_variances.vert[0].variance < (thresholds[0] >> 2))
+          x->variance_low[3] = 1;
+        if (vt.part_variances.vert[1].variance < (thresholds[0] >> 2))
+          x->variance_low[4] = 1;
+      } else {
+        // 32x32
+        for (i = 0; i < 4; i++) {
+          if (!force_split[i + 1] &&
+              vt.split[i].part_variances.none.variance < (thresholds[1] >> 1))
+            x->variance_low[i + 5] = 1;
+        }
+      }
+    }
+  }
   return 0;
 }
 
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index f456f37a1..a70eaea3e 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -236,13 +236,6 @@ static void subtract_stats(FIRSTPASS_STATS *section,
   section->duration   -= frame->duration;
 }
 
-// Calculate the linear size relative to a baseline of 1080P
-#define BASE_SIZE 2073600.0  // 1920x1080
-static double get_linear_size_factor(const VP9_COMP *cpi) {
-  const double this_area = cpi->initial_width * cpi->initial_height;
-  return pow(this_area / BASE_SIZE, 0.5);
-}
-
 // Calculate an active area of the image that discounts formatting
 // bars and partially discounts other 0 energy areas.
 #define MIN_ACTIVE_AREA 0.5
@@ -1247,14 +1240,15 @@ static double calc_correction_factor(double err_per_mb,
   return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
 
-#define ERR_DIVISOR         100.0
-static int get_twopass_worst_quality(const VP9_COMP *cpi,
+#define ERR_DIVISOR         115.0
+static int get_twopass_worst_quality(VP9_COMP *cpi,
                                      const double section_err,
                                      double inactive_zone,
-                                     int section_target_bandwidth,
-                                     double group_weight_factor) {
+                                     int section_target_bandwidth) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+
   // Clamp the target rate to VBR min / max limts.
   const int target_rate =
       vp9_rc_clamp_pframe_target_size(cpi, section_target_bandwidth);
@@ -1269,7 +1263,7 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
     const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
     const double av_err_per_mb = section_err / active_mbs;
     const double speed_term = 1.0 + 0.04 * oxcf->speed;
-    double ediv_size_correction;
+    double last_group_rate_err;
     const int target_norm_bits_per_mb = ((uint64_t)target_rate <<
                                          BPER_MB_NORMBITS) / active_mbs;
     int q;
@@ -1278,29 +1272,27 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
     if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0)
       is_svc_upper_layer = 1;
 
-    // Larger image formats are expected to be a little harder to code
-    // relatively given the same prediction error score. This in part at
-    // least relates to the increased size and hence coding overheads of
-    // motion vectors. Some account of this is made through adjustment of
-    // the error divisor.
-    ediv_size_correction =
-        VPXMAX(0.2, VPXMIN(5.0, get_linear_size_factor(cpi)));
-    if (ediv_size_correction < 1.0)
-      ediv_size_correction = -(1.0 / ediv_size_correction);
-    ediv_size_correction *= 4.0;
+    // based on recent history adjust expectations of bits per macroblock.
+    last_group_rate_err = (double)twopass->rolling_arf_group_actual_bits /
+        DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits);
+    last_group_rate_err =
+        VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err));
+    twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0;
+    twopass->bpm_factor =
+        VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor));
 
     // Try and pick a max Q that will be high enough to encode the
     // content at the given rate.
     for (q = rc->best_quality; q < rc->worst_quality; ++q) {
       const double factor =
           calc_correction_factor(av_err_per_mb,
-                                 ERR_DIVISOR - ediv_size_correction,
+                                 ERR_DIVISOR,
                                  is_svc_upper_layer ? SVC_FACTOR_PT_LOW :
                                  FACTOR_PT_LOW, FACTOR_PT_HIGH, q,
                                  cpi->common.bit_depth);
       const int bits_per_mb =
         vp9_rc_bits_per_mb(INTER_FRAME, q,
-                           factor * speed_term * group_weight_factor,
+                           factor * speed_term * cpi->twopass.bpm_factor,
                            cpi->common.bit_depth);
       if (bits_per_mb <= target_norm_bits_per_mb)
         break;
@@ -2115,8 +2107,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     old_boost_score = boost_score;
   }
 
-  twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
-
   // Was the group length constrained by the requirement for a new KF?
   rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
 
@@ -2184,24 +2174,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     const double group_av_inactive_zone =
       ((gf_group_inactive_zone_rows * 2) /
        (rc->baseline_gf_interval * (double)cm->mb_rows));
-
-    int tmp_q;
-    // rc factor is a weight factor that corrects for local rate control drift.
-    double rc_factor = 1.0;
-    if (rc->rate_error_estimate > 0) {
-      rc_factor = VPXMAX(RC_FACTOR_MIN,
-                         (double)(100 - rc->rate_error_estimate) / 100.0);
-    } else {
-      rc_factor = VPXMIN(RC_FACTOR_MAX,
-                         (double)(100 - rc->rate_error_estimate) / 100.0);
-    }
-    tmp_q =
-      get_twopass_worst_quality(cpi, group_av_err,
-                                (group_av_skip_pct + group_av_inactive_zone),
-                                vbr_group_bits_per_frame,
-                                twopass->kfgroup_inter_fraction * rc_factor);
+    int tmp_q =
+        get_twopass_worst_quality(cpi, group_av_err,
+                                  (group_av_skip_pct + group_av_inactive_zone),
+                                  vbr_group_bits_per_frame);
     twopass->active_worst_quality =
-        VPXMAX(tmp_q, twopass->active_worst_quality >> 1);
+        (tmp_q + (twopass->active_worst_quality * 3)) >> 2;
   }
 #endif
 
@@ -2243,6 +2221,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     // Default to starting GF groups at normal frame size.
     cpi->rc.next_frame_size_selector = UNSCALED;
   }
+
+  // Reset rolling actual and target bits counters for ARF groups.
+  twopass->rolling_arf_group_target_bits = 0;
+  twopass->rolling_arf_group_actual_bits = 0;
 }
 
 // Threshold for use of the lagging second reference frame. High second ref
@@ -2580,16 +2562,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   kf_bits = calculate_boost_bits((rc->frames_to_key - 1),
                                   rc->kf_boost, twopass->kf_group_bits);
 
-  // Work out the fraction of the kf group bits reserved for the inter frames
-  // within the group after discounting the bits for the kf itself.
-  if (twopass->kf_group_bits) {
-    twopass->kfgroup_inter_fraction =
-      (double)(twopass->kf_group_bits - kf_bits) /
-      (double)twopass->kf_group_bits;
-  } else {
-    twopass->kfgroup_inter_fraction = 1.0;
-  }
-
   twopass->kf_group_bits -= kf_bits;
 
   // Save the bits to spend on the key frame.
@@ -2683,21 +2655,12 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
   GF_GROUP *const gf_group = &twopass->gf_group;
-  int frames_left;
   FIRSTPASS_STATS this_frame;
 
   int target_rate;
   LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
         &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : 0;
 
-  if (lc != NULL) {
-    frames_left = (int)(twopass->total_stats.count -
-                  lc->current_video_frame_in_layer);
-  } else {
-    frames_left = (int)(twopass->total_stats.count -
-                  cm->current_video_frame);
-  }
-
   if (!twopass->stats_in)
     return;
 
@@ -2739,6 +2702,9 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
     twopass->active_worst_quality = cpi->oxcf.cq_level;
   } else if (cm->current_video_frame == 0 ||
              (lc != NULL && lc->current_video_frame_in_layer == 0)) {
+    const int frames_left = (int)(twopass->total_stats.count -
+        ((lc != NULL) ? lc->current_video_frame_in_layer
+                      : cm->current_video_frame));
     // Special case code for first frame.
     const int section_target_bandwidth = (int)(twopass->bits_left /
                                                frames_left);
@@ -2750,10 +2716,17 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
     const double section_inactive_zone =
       (twopass->total_left_stats.inactive_zone_rows * 2) /
       ((double)cm->mb_rows * section_length);
-    const int tmp_q =
-      get_twopass_worst_quality(cpi, section_error,
-                                section_intra_skip + section_inactive_zone,
-                                section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+    int tmp_q;
+
+    // Initialize bits per macro_block estimate correction factor.
+    twopass->bpm_factor = 1.0;
+    // Initiallize actual and target bits counters for ARF groups so that
+    // at the start we have a neutral bpm adjustment.
+    twopass->rolling_arf_group_target_bits = 1;
+    twopass->rolling_arf_group_actual_bits = 1;
+
+    tmp_q = get_twopass_worst_quality(cpi, section_error,
+        section_intra_skip + section_inactive_zone, section_target_bandwidth);
 
     twopass->active_worst_quality = tmp_q;
     twopass->baseline_active_worst_quality = tmp_q;
@@ -2871,6 +2844,10 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
   rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
   twopass->bits_left = VPXMAX(twopass->bits_left - bits_used, 0);
 
+  // Target vs actual bits for this arf group.
+  twopass->rolling_arf_group_target_bits += rc->this_frame_target;
+  twopass->rolling_arf_group_actual_bits += rc->projected_frame_size;
+
   // Calculate the pct rc error.
   if (rc->total_actual_bits) {
     rc->rate_error_estimate =
@@ -2892,7 +2869,6 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
 
   // If the rate control is drifting consider adjustment to min or maxq.
   if ((cpi->oxcf.rc_mode != VPX_Q) &&
-      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
       !cpi->rc.is_src_frame_alt_ref) {
     const int maxq_adj_limit =
       rc->worst_quality - twopass->active_worst_quality;
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 7eb44fa13..76072884d 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -39,8 +39,6 @@ typedef struct {
 } FIRSTPASS_MB_STATS;
 #endif
 
-#define VLOW_MOTION_THRESHOLD 950
-
 typedef struct {
   double frame;
   double weight;
@@ -124,14 +122,13 @@ typedef struct {
   // Error score of frames still to be coded in kf group
   int64_t kf_group_error_left;
 
-  // The fraction for a kf groups total bits allocated to the inter frames
-  double kfgroup_inter_fraction;
+  double bpm_factor;
+  int rolling_arf_group_target_bits;
+  int rolling_arf_group_actual_bits;
 
   int sr_update_lag;
-
   int kf_zeromotion_pct;
   int last_kfgroup_zeromotion_pct;
-  int gf_zeromotion_pct;
   int active_worst_quality;
   int baseline_active_worst_quality;
   int extend_minq;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 918b3b10b..554409b74 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1126,34 +1126,38 @@ static INLINE void find_predictors(VP9_COMP *cpi, MACROBLOCK *x,
                                  TileDataEnc *tile_data,
                                  int mi_row, int mi_col,
                                  struct buf_2d yv12_mb[4][MAX_MB_PLANE],
-                                 BLOCK_SIZE bsize) {
+                                 BLOCK_SIZE bsize,
+                                 int force_skip_low_temp_var) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
   TileInfo *const tile_info = &tile_data->tile_info;
-// TODO(jingning) placeholder for inter-frame non-RD mode decision.
+  // TODO(jingning) placeholder for inter-frame non-RD mode decision.
   x->pred_mv_sad[ref_frame] = INT_MAX;
   frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
   frame_mv[ZEROMV][ref_frame].as_int = 0;
-// this needs various further optimizations. to be continued..
+  // this needs various further optimizations. to be continued..
   if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
     int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
     const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
     vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
                          sf, sf);
-    if (cm->use_prev_frame_mvs)
+    if (cm->use_prev_frame_mvs) {
       vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame,
                        candidates, mi_row, mi_col,
                        x->mbmi_ext->mode_context);
-    else
-    const_motion[ref_frame] =
-        mv_refs_rt(cpi, cm, x, xd, tile_info, xd->mi[0], ref_frame,
-            candidates, &frame_mv[NEWMV][ref_frame], mi_row, mi_col,
-            (int)(cpi->svc.use_base_mv && cpi->svc.spatial_layer_id));
+    } else {
+      const_motion[ref_frame] =
+          mv_refs_rt(cpi, cm, x, xd, tile_info, xd->mi[0], ref_frame,
+                     candidates, &frame_mv[NEWMV][ref_frame], mi_row, mi_col,
+                     (int)(cpi->svc.use_base_mv && cpi->svc.spatial_layer_id));
+    }
     vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
                           &frame_mv[NEARESTMV][ref_frame],
                           &frame_mv[NEARMV][ref_frame]);
-    if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8) {
+    // Early exit for golden frame if force_skip_low_temp_var is set.
+    if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8 &&
+        !(force_skip_low_temp_var && ref_frame == GOLDEN_FRAME)) {
       vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
                   ref_frame, bsize);
     }
@@ -1266,6 +1270,39 @@ static void recheck_zeromv_after_denoising(
 }
 #endif  // CONFIG_VP9_TEMPORAL_DENOISING
 
+static INLINE int set_force_skip_low_temp_var(uint8_t *variance_low,
+                                              int mi_row, int mi_col,
+                                              BLOCK_SIZE bsize) {
+  int force_skip_low_temp_var = 0;
+  // Set force_skip_low_temp_var based on the block size and block offset.
+  if (bsize == BLOCK_64X64) {
+    force_skip_low_temp_var = variance_low[0];
+  } else if (bsize == BLOCK_64X32) {
+    if (!(mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[1];
+    } else if (!(mi_col & 0x7) && (mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[2];
+    }
+  } else if (bsize == BLOCK_32X64) {
+    if (!(mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[3];
+    } else if ((mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[4];
+    }
+  } else if (bsize == BLOCK_32X32) {
+    if (!(mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[5];
+    } else if ((mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[6];
+    } else if (!(mi_col & 0x7) && (mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[7];
+    } else if ((mi_col & 0x7) && (mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[8];
+    }
+  }
+  return force_skip_low_temp_var;
+}
+
 void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                          TileDataEnc *tile_data,
                          int mi_row, int mi_col, RD_COST *rd_cost,
@@ -1323,6 +1360,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int ref_frame_cost[MAX_REF_FRAMES];
   int svc_force_zero_mode[3] = {0};
   int perform_intra_pred = 1;
+  int use_golden_nonzeromv = 1;
+  int force_skip_low_temp_var = 0;
 #if CONFIG_VP9_TEMPORAL_DENOISING
   VP9_PICKMODE_CTX_DEN ctx_den;
   int64_t zero_last_cost_orig = INT64_MAX;
@@ -1409,10 +1448,19 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
+  if (cpi->sf.short_circuit_low_temp_var) {
+    force_skip_low_temp_var =
+        set_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize);
+  }
+
+  if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) &&
+      !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var))
+    use_golden_nonzeromv = 0;
+
   for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
     find_predictors(cpi, x, ref_frame, frame_mv, const_motion,
                     &ref_frame_skip_mask, flag_list, tile_data, mi_row, mi_col,
-                    yv12_mb, bsize);
+                    yv12_mb, bsize, force_skip_low_temp_var);
   }
 
   for (idx = 0; idx < RT_INTER_MODES; ++idx) {
@@ -1424,6 +1472,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     int is_skippable;
     int this_early_term = 0;
     PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
+
     if (cpi->use_svc)
       this_mode = ref_mode_set_svc[idx].pred_mode;
 
@@ -1442,17 +1491,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
     if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
       continue;
+
     if (const_motion[ref_frame] && this_mode == NEARMV)
       continue;
 
+    // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var
+    // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
+    // later.
+    if (force_skip_low_temp_var && ref_frame == GOLDEN_FRAME &&
+        frame_mv[this_mode][ref_frame].as_int != 0) {
+      continue;
+    }
+
     if (cpi->use_svc) {
       if (svc_force_zero_mode[ref_frame - 1] &&
           frame_mv[this_mode][ref_frame].as_int != 0)
         continue;
     }
 
-    if (!(frame_mv[this_mode][ref_frame].as_int == 0 &&
-        ref_frame == LAST_FRAME)) {
+    if (!force_skip_low_temp_var &&
+        !(frame_mv[this_mode][ref_frame].as_int == 0 &&
+          ref_frame == LAST_FRAME)) {
       i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
       if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking)
         if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
@@ -1543,7 +1602,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       }
     }
 
-    if (this_mode == NEWMV && ref_frame == LAST_FRAME &&
+    // If use_golden_nonzeromv is false, NEWMV mode is skipped for golden, no
+    // need to compute best_pred_sad which is only used to skip golden NEWMV.
+    if (use_golden_nonzeromv && this_mode == NEWMV &&
+        ref_frame == LAST_FRAME &&
         frame_mv[NEWMV][LAST_FRAME].as_int != INVALID_MV) {
       const int pre_stride = xd->plane[0].pre[0].stride;
       const uint8_t * const pre_buf = xd->plane[0].pre[0].buf +
@@ -1555,21 +1617,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       x->pred_mv_sad[LAST_FRAME] = best_pred_sad;
     }
 
-    if (cpi->use_svc) {
-      if (this_mode == NEWMV && ref_frame == GOLDEN_FRAME &&
-          frame_mv[NEWMV][GOLDEN_FRAME].as_int != INVALID_MV) {
-        const int pre_stride = xd->plane[0].pre[0].stride;
-        const uint8_t * const pre_buf = xd->plane[0].pre[0].buf +
-            (frame_mv[NEWMV][GOLDEN_FRAME].as_mv.row >> 3) * pre_stride +
-            (frame_mv[NEWMV][GOLDEN_FRAME].as_mv.col >> 3);
-        best_pred_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf,
-                                               x->plane[0].src.stride,
-                                               pre_buf, pre_stride);
-        x->pred_mv_sad[GOLDEN_FRAME] = best_pred_sad;
-      }
-    }
-
-
     if (this_mode != NEARESTMV &&
         frame_mv[this_mode][ref_frame].as_int ==
             frame_mv[NEARESTMV][ref_frame].as_int)
@@ -1795,11 +1842,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh;
   }
   // Perform intra prediction search, if the best SAD is above a certain
-  // threshold.
-  if (perform_intra_pred &&
-      ((best_rdc.rdcost == INT64_MAX ||
-      (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
-       bsize <= cpi->sf.max_intra_bsize)))) {
+  // threshold. Skip intra prediction if force_skip_low_temp_var is set.
+  if (!force_skip_low_temp_var && perform_intra_pred &&
+      (best_rdc.rdcost == INT64_MAX ||
+       (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
+        bsize <= cpi->sf.max_intra_bsize))) {
     struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0, 0 };
     int i;
     TX_SIZE best_intra_tx_size = TX_SIZES;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index b8a5e6e7d..6c3f91951 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1160,8 +1160,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
 
   // Extension to max or min Q if undershoot or overshoot is outside
   // the permitted range.
-  if ((cpi->oxcf.rc_mode != VPX_Q) &&
-      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
+  if (cpi->oxcf.rc_mode != VPX_Q) {
     if (frame_is_intra_only(cm) ||
         (!rc->is_src_frame_alt_ref &&
          (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
@@ -1559,12 +1558,13 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
     if (cm->current_video_frame > 30 &&
         rc->avg_frame_qindex[INTER_FRAME] > (7 * rc->worst_quality) >> 3 &&
         rc->avg_size_inter > (5 * rc->avg_frame_bandwidth) >> 1) {
-        rc->baseline_gf_interval = (3 * rc->baseline_gf_interval) >> 1;
+        rc->baseline_gf_interval =
+            VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1);
     } else if (cm->current_video_frame > 30 &&
                rc->avg_frame_low_motion < 20) {
       // Decrease boost and gf interval for high motion case.
       rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
-      rc->baseline_gf_interval = VPXMIN(6, rc->baseline_gf_interval >> 1);
+      rc->baseline_gf_interval = VPXMAX(5, rc->baseline_gf_interval >> 1);
     }
     adjust_gf_key_frame(cpi);
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
@@ -1890,27 +1890,28 @@ static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) {
   RATE_CONTROL *const rc = &cpi->rc;
   int64_t vbr_bits_off_target = rc->vbr_bits_off_target;
   int max_delta;
-  double position_factor = 1.0;
-
-  // How far through the clip are we.
-  // This number is used to damp the per frame rate correction.
-  // Range 0 - 1.0
-  if (cpi->twopass.total_stats.count) {
-    position_factor = sqrt((double)cpi->common.current_video_frame /
-                           cpi->twopass.total_stats.count);
-  }
-  max_delta = (int)(position_factor *
-                    ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
-
-  // vbr_bits_off_target > 0 means we have extra bits to spend
-  if (vbr_bits_off_target > 0) {
-    *this_frame_target +=
-      (vbr_bits_off_target > max_delta) ? max_delta
-                                        : (int)vbr_bits_off_target;
-  } else {
-    *this_frame_target -=
-      (vbr_bits_off_target < -max_delta) ? max_delta
-                                         : (int)-vbr_bits_off_target;
+  int frame_window = VPXMIN(16,
+      ((int)cpi->twopass.total_stats.count - cpi->common.current_video_frame));
+
+  // Calcluate the adjustment to rate for this frame.
+  if (frame_window > 0) {
+    max_delta = (vbr_bits_off_target > 0)
+        ? (int)(vbr_bits_off_target / frame_window)
+        : (int)(-vbr_bits_off_target / frame_window);
+
+    max_delta = VPXMIN(max_delta,
+        ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
+
+    // vbr_bits_off_target > 0 means we have extra bits to spend
+    if (vbr_bits_off_target > 0) {
+      *this_frame_target +=
+        (vbr_bits_off_target > max_delta) ? max_delta
+                                          : (int)vbr_bits_off_target;
+    } else {
+      *this_frame_target -=
+        (vbr_bits_off_target < -max_delta) ? max_delta
+                                           : (int)-vbr_bits_off_target;
+    }
   }
 
   // Fast redistribution of bits arising from massive local undershoot.
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 02be3c3f9..0090b4f40 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -429,6 +429,11 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
     sf->mv.search_method = NSTEP;
     sf->mv.reduce_first_step_size = 1;
     sf->skip_encode_sb = 0;
+    if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.pass == 0 &&
+        content != VP9E_CONTENT_SCREEN) {
+      // Enable short circuit for low temporal variance.
+      sf->short_circuit_low_temp_var = 1;
+    }
   }
 
   if (speed >= 7) {
@@ -554,6 +559,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
   sf->default_interp_filter = SWITCHABLE;
   sf->simple_model_rd_from_var = 0;
   sf->short_circuit_flat_blocks = 0;
+  sf->short_circuit_low_temp_var = 0;
 
   // Some speed-up features even for best quality as minimal impact on quality.
   sf->adaptive_rd_thresh = 1;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 90b32164b..71ff0ac10 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -446,6 +446,10 @@ typedef struct SPEED_FEATURES {
   // Skip a number of expensive mode evaluations for blocks with zero source
   // variance.
   int short_circuit_flat_blocks;
+
+  // Skip a number of expensive mode evaluations for blocks with very low
+  // temporal variance.
+  int short_circuit_low_temp_var;
 } SPEED_FEATURES;
 
 struct VP9_COMP;
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
index fa37b6fed..fa37b6fed 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
diff --git a/vp9/encoder/x86/vp9_dct_mmx.asm b/vp9/encoder/x86/vp9_dct_mmx.asm
deleted file mode 100644
index 7a7a6b655..000000000
--- a/vp9/encoder/x86/vp9_dct_mmx.asm
+++ /dev/null
@@ -1,104 +0,0 @@
-;
-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro TRANSFORM_COLS 0
-  paddw           m0,        m1
-  movq            m4,        m0
-  psubw           m3,        m2
-  psubw           m4,        m3
-  psraw           m4,        1
-  movq            m5,        m4
-  psubw           m5,        m1 ;b1
-  psubw           m4,        m2 ;c1
-  psubw           m0,        m4
-  paddw           m3,        m5
-                                ; m0 a0
-  SWAP            1,         4  ; m1 c1
-  SWAP            2,         3  ; m2 d1
-  SWAP            3,         5  ; m3 b1
-%endmacro
-
-%macro TRANSPOSE_4X4 0
-  movq            m4,        m0
-  movq            m5,        m2
-  punpcklwd       m4,        m1
-  punpckhwd       m0,        m1
-  punpcklwd       m5,        m3
-  punpckhwd       m2,        m3
-  movq            m1,        m4
-  movq            m3,        m0
-  punpckldq       m1,        m5
-  punpckhdq       m4,        m5
-  punpckldq       m3,        m2
-  punpckhdq       m0,        m2
-  SWAP            2, 3, 0, 1, 4
-%endmacro
-
-INIT_MMX mmx
-cglobal fwht4x4, 3, 4, 8, input, output, stride
-  lea             r3q,       [inputq + strideq*4]
-  movq            m0,        [inputq] ;a1
-  movq            m1,        [inputq + strideq*2] ;b1
-  movq            m2,        [r3q] ;c1
-  movq            m3,        [r3q + strideq*2] ;d1
-
-  TRANSFORM_COLS
-  TRANSPOSE_4X4
-  TRANSFORM_COLS
-  TRANSPOSE_4X4
-
-  psllw           m0,        2
-  psllw           m1,        2
-  psllw           m2,        2
-  psllw           m3,        2
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  pxor            m4,             m4
-  pxor            m5,             m5
-  pcmpgtw         m4,             m0
-  pcmpgtw         m5,             m1
-  movq            m6,             m0
-  movq            m7,             m1
-  punpcklwd       m0,             m4
-  punpcklwd       m1,             m5
-  punpckhwd       m6,             m4
-  punpckhwd       m7,             m5
-  movq            [outputq],      m0
-  movq            [outputq + 8],  m6
-  movq            [outputq + 16], m1
-  movq            [outputq + 24], m7
-  pxor            m4,             m4
-  pxor            m5,             m5
-  pcmpgtw         m4,             m2
-  pcmpgtw         m5,             m3
-  movq            m6,             m2
-  movq            m7,             m3
-  punpcklwd       m2,             m4
-  punpcklwd       m3,             m5
-  punpckhwd       m6,             m4
-  punpckhwd       m7,             m5
-  movq            [outputq + 32], m2
-  movq            [outputq + 40], m6
-  movq            [outputq + 48], m3
-  movq            [outputq + 56], m7
-%else
-  movq            [outputq],      m0
-  movq            [outputq + 8],  m1
-  movq            [outputq + 16], m2
-  movq            [outputq + 24], m3
-%endif
-
-  RET
diff --git a/vp9/encoder/x86/vp9_dct_sse2.asm b/vp9/encoder/x86/vp9_dct_sse2.asm
new file mode 100644
index 000000000..d3b2a271b
--- /dev/null
+++ b/vp9/encoder/x86/vp9_dct_sse2.asm
@@ -0,0 +1,87 @@
+;
+;  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+  paddw           m0,        m1
+  movq            m4,        m0
+  psubw           m3,        m2
+  psubw           m4,        m3
+  psraw           m4,        1
+  movq            m5,        m4
+  psubw           m5,        m1 ;b1
+  psubw           m4,        m2 ;c1
+  psubw           m0,        m4
+  paddw           m3,        m5
+                                ; m0 a0
+  SWAP            1,         4  ; m1 c1
+  SWAP            2,         3  ; m2 d1
+  SWAP            3,         5  ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+                                ; 00 01 02 03
+                                ; 10 11 12 13
+                                ; 20 21 22 23
+                                ; 30 31 32 33
+  punpcklwd       m0,        m1 ; 00 10 01 11  02 12 03 13
+  punpcklwd       m2,        m3 ; 20 30 21 31  22 32 23 33
+  mova            m1,        m0
+  punpckldq       m0,        m2 ; 00 10 20 30  01 11 21 31
+  punpckhdq       m1,        m2 ; 02 12 22 32  03 13 23 33
+%endmacro
+
+INIT_XMM sse2
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+; TODO(linfeng): The duplication with vp10 should be resolved.
+  lea             r3q,       [inputq + strideq*4]
+  movq            m0,        [inputq] ;a1
+  movq            m1,        [inputq + strideq*2] ;b1
+  movq            m2,        [r3q] ;c1
+  movq            m3,        [r3q + strideq*2] ;d1
+
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  SWAP            1,         2
+  psrldq          m1,        m0, 8
+  psrldq          m3,        m2, 8
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+
+  psllw           m0,        2
+  psllw           m1,        2
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; sign extension
+  mova            m2,             m0
+  mova            m3,             m1
+  punpcklwd       m0,             m0
+  punpcklwd       m1,             m1
+  punpckhwd       m2,             m2
+  punpckhwd       m3,             m3
+  psrad           m0,             16
+  psrad           m1,             16
+  psrad           m2,             16
+  psrad           m3,             16
+  mova            [outputq],      m0
+  mova            [outputq + 16], m2
+  mova            [outputq + 32], m1
+  mova            [outputq + 48], m3
+%else
+  mova            [outputq],      m0
+  mova            [outputq + 16], m1
+%endif
+
+  RET
diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
deleted file mode 100644
index 0bc417fc1..000000000
--- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#if defined(_MSC_VER)
-# include <intrin.h>
-#endif
-#include <emmintrin.h>
-#include <smmintrin.h>
-
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vp9/encoder/vp9_encoder.h"
-#include "vpx_ports/mem.h"
-
-#ifdef __GNUC__
-# define LIKELY(v)    __builtin_expect(v, 1)
-# define UNLIKELY(v)  __builtin_expect(v, 0)
-#else
-# define LIKELY(v)    (v)
-# define UNLIKELY(v)  (v)
-#endif
-
-static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
-  int_mv result;
-  result.as_mv.row = row;
-  result.as_mv.col = col;
-  return result;
-}
-
-static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) {
-  // This is simplified from the C implementation to utilise that
-  //  x->nmvjointsadcost[1] == x->nmvjointsadcost[2]  and
-  //  x->nmvjointsadcost[1] == x->nmvjointsadcost[3]
-  return mv.as_int == 0 ? 0 : 1;
-}
-
-static INLINE int mv_cost(const int_mv mv,
-                          const int *joint_cost, int *const comp_cost[2]) {
-  return joint_cost[get_mv_joint(mv)] +
-         comp_cost[0][mv.as_mv.row] + comp_cost[1][mv.as_mv.col];
-}
-
-static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
-                          int sad_per_bit) {
-  const int_mv diff = pack_int_mv(mv.as_mv.row - ref->row,
-                                  mv.as_mv.col - ref->col);
-  return ROUND_POWER_OF_TWO((unsigned)mv_cost(diff, x->nmvjointsadcost,
-                                              x->nmvsadcost) *
-                                              sad_per_bit, VP9_PROB_COST_SHIFT);
-}
-
-/*****************************************************************************
- * This function utilises 3 properties of the cost function lookup tables,   *
- * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in       *
- * vp9_encoder.c.                                                            *
- * For the joint cost:                                                       *
- *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
- * For the component costs:                                                  *
- *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
- *         (Equal costs for both components)                                 *
- *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
- *         (Cost function is even)                                           *
- * If these do not hold, then this function cannot be used without           *
- * modification, in which case you can revert to using the C implementation, *
- * which does not rely on these properties.                                  *
- *****************************************************************************/
-int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
-                               const search_site_config *cfg,
-                               MV *ref_mv, MV *best_mv, int search_param,
-                               int sad_per_bit, int *num00,
-                               const vp9_variance_fn_ptr_t *fn_ptr,
-                               const MV *center_mv) {
-  const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
-  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
-  const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min);
-  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);
-
-  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);
-
-  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
-  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);
-
-  // search_param determines the length of the initial step and hence the number
-  // of iterations.
-  // 0 = initial step (MAX_FIRST_STEP) pel
-  // 1 = (MAX_FIRST_STEP/2) pel,
-  // 2 = (MAX_FIRST_STEP/4) pel...
-  const       MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
-  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
-  const int tot_steps = cfg->total_steps - search_param;
-
-  const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
-                                        center_mv->col >> 3);
-  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);
-
-  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
-  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);
-
-  int_mv bmv = pack_int_mv(ref_row, ref_col);
-  int_mv new_bmv = bmv;
-  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);
-
-  const int what_stride = x->plane[0].src.stride;
-  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
-  const uint8_t *const what = x->plane[0].src.buf;
-  const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
-                                 ref_row * in_what_stride + ref_col;
-
-  // Work out the start point for the search
-  const uint8_t *best_address = in_what;
-  const uint8_t *new_best_address = best_address;
-#if ARCH_X86_64
-  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
-#else
-  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
-#endif
-
-  unsigned int best_sad;
-
-  int i;
-  int j;
-  int step;
-
-  // Check the prerequisite cost function properties that are easy to check
-  // in an assert. See the function-level documentation for details on all
-  // prerequisites.
-  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
-  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
-
-  // Check the starting position
-  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
-  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);
-
-  *num00 = 0;
-
-  for (i = 0, step = 0; step < tot_steps; step++) {
-    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
-      __m128i v_sad_d;
-      __m128i v_cost_d;
-      __m128i v_outside_d;
-      __m128i v_inside_d;
-      __m128i v_diff_mv_w;
-#if ARCH_X86_64
-      __m128i v_blocka[2];
-#else
-      __m128i v_blocka[1];
-#endif
-
-      // Compute the candidate motion vectors
-      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
-      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
-      // Clamp them to the search bounds
-      __m128i v_these_mv_clamp_w = v_these_mv_w;
-      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
-      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
-      // The ones that did not change are inside the search area
-      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);
-
-      // If none of them are inside, then move on
-      if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
-        continue;
-      }
-
-      // The inverse mask indicates which of the MVs are outside
-      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
-      // Shift right to keep the sign bit clear, we will use this later
-      // to set the cost to the maximum value.
-      v_outside_d = _mm_srli_epi32(v_outside_d, 1);
-
-      // Compute the difference MV
-      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
-      // We utilise the fact that the cost function is even, and use the
-      // absolute difference. This allows us to use unsigned indexes later
-      // and reduces cache pressure somewhat as only a half of the table
-      // is ever referenced.
-      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);
-
-      // Compute the SIMD pointer offsets.
-      {
-#if ARCH_X86_64  //  sizeof(intptr_t) == 8
-        // Load the offsets
-        __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]);
-        __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]);
-        // Set the ones falling outside to zero
-        v_bo10_q = _mm_and_si128(v_bo10_q,
-                                 _mm_cvtepi32_epi64(v_inside_d));
-        v_bo32_q = _mm_and_si128(v_bo32_q,
-                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
-        // Compute the candidate addresses
-        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
-        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
-#else  // ARCH_X86 //  sizeof(intptr_t) == 4
-        __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
-        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
-        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
-#endif
-      }
-
-      fn_ptr->sdx4df(what, what_stride,
-                     (const uint8_t **)&v_blocka[0], in_what_stride,
-                     (uint32_t*)&v_sad_d);
-
-      // Look up the component cost of the residual motion vector
-      {
-        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
-        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
-        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
-        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
-        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
-        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
-        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
-        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);
-
-        // Note: This is a use case for vpgather in AVX2
-        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
-        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
-        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
-        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];
-
-        __m128i v_cost_10_d, v_cost_32_d;
-
-        v_cost_10_d = _mm_cvtsi32_si128(cost0);
-        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);
-
-        v_cost_32_d = _mm_cvtsi32_si128(cost2);
-        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);
-
-        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
-      }
-
-      // Now add in the joint cost
-      {
-        const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
-                                                _mm_setzero_si128());
-        const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
-                                                       v_joint_cost_0_d,
-                                                       v_sel_d);
-        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
-      }
-
-      // Multiply by sad_per_bit
-      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
-      // ROUND_POWER_OF_TWO(v_cost_d, 8)
-      v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
-      v_cost_d = _mm_srai_epi32(v_cost_d, 8);
-      // Add the cost to the sad
-      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);
-
-      // Make the motion vectors outside the search area have max cost
-      // by or'ing in the comparison mask, this way the minimum search won't
-      // pick them.
-      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);
-
-      // Find the minimum value and index horizontally in v_sad_d
-      {
-        // Try speculatively on 16 bits, so we can use the minpos intrinsic
-        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
-        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);
-
-        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
-        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);
-
-        // If the local best value is not saturated, just use it, otherwise
-        // find the horizontal minimum again the hard way on 32 bits.
-        // This is executed rarely.
-        if (UNLIKELY(local_best_sad == 0xffff)) {
-          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;
-
-          v_loval_d = v_sad_d;
-          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
-          v_hival_d = _mm_srli_si128(v_loval_d, 8);
-          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);
-
-          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
-
-          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
-          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
-          v_hival_d = _mm_srli_si128(v_loval_d, 4);
-          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);
-
-          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
-
-          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
-          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
-
-          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
-          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
-        }
-
-        // Update the global minimum if the local minimum is smaller
-        if (LIKELY(local_best_sad < best_sad)) {
-          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
-          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
-
-          best_sad = local_best_sad;
-        }
-      }
-    }
-
-    bmv = new_bmv;
-    best_address = new_best_address;
-
-    v_bmv_w = _mm_set1_epi32(bmv.as_int);
-#if ARCH_X86_64
-    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
-#else
-    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
-#endif
-
-    if (UNLIKELY(best_address == in_what)) {
-      (*num00)++;
-    }
-  }
-
-  *best_mv = bmv.as_mv;
-  return best_sad;
-}