5 files changed, 302 insertions, 275 deletions
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 046c64170..3f28e0ff3 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -60,6 +60,14 @@
 #define RC_FACTOR_MIN       0.75
 #define RC_FACTOR_MAX       1.75
 
+
+#define INTRA_WEIGHT_EXPERIMENT 0
+#if INTRA_WEIGHT_EXPERIMENT
+#define NCOUNT_INTRA_THRESH 8192
+#define NCOUNT_INTRA_FACTOR 3
+#define NCOUNT_FRAME_II_THRESH 5.0
+#endif
+
 #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
 
 #if ARF_STATS_OUTPUT
@@ -470,7 +478,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   int intercount = 0;
   int second_ref_count = 0;
   const int intrapenalty = INTRA_MODE_PENALTY;
-  int neutral_count = 0;
+  double neutral_count;
   int new_mv_count = 0;
   int sum_in_vectors = 0;
   MV lastmv = {0, 0};
@@ -503,6 +511,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
 
   intra_factor = 0.0;
   brightness_factor = 0.0;
+  neutral_count = 0.0;
 
   set_first_pass_params(cpi);
   vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth));
@@ -818,12 +827,28 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
 #endif
 
         if (motion_error <= this_error) {
+          vp9_clear_system_state();
+
           // Keep a count of cases where the inter and intra were very close
           // and very low. This helps with scene cut detection for example in
           // cropped clips with black bars at the sides or top and bottom.
+#if INTRA_WEIGHT_EXPERIMENT
+          if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
+              (this_error < (2 * intrapenalty))) {
+            neutral_count += 1.0;
+          // Also track cases where the intra error is not much worse than the
+          // inter error, and use this in limiting the GF/ARF group length.
+          } else if ((this_error > NCOUNT_INTRA_THRESH) &&
+                     (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+            neutral_count += (double)motion_error /
+                             DOUBLE_DIVIDE_CHECK((double)this_error);
+          }
+#else
           if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
-              this_error < 2 * intrapenalty)
-            ++neutral_count;
+              (this_error < (2 * intrapenalty))) {
+            neutral_count += 1.0;
+          }
+#endif
 
           mv.row *= 8;
           mv.col *= 8;
@@ -1260,17 +1285,27 @@ static double get_sr_decay_rate(const VP9_COMP *cpi,
   double sr_diff =
       (frame->sr_coded_error - frame->coded_error) / num_mbs;
   double sr_decay = 1.0;
+  double modified_pct_inter;
+  double modified_pcnt_intra;
   const double motion_amplitude_factor =
     frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
-  const double pcnt_intra = 100 * (1.0 - frame->pcnt_inter);
+
+  modified_pct_inter = frame->pcnt_inter;
+#if INTRA_WEIGHT_EXPERIMENT
+  if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+      (double)NCOUNT_FRAME_II_THRESH)
+    modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
+#endif
+  modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
 
   if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
     sr_diff = MIN(sr_diff, SR_DIFF_MAX);
     sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
                (MOTION_AMP_PART * motion_amplitude_factor) -
-               (INTRA_PART * pcnt_intra);
+               (INTRA_PART * modified_pcnt_intra);
   }
-  return MAX(sr_decay, MIN(DEFAULT_DECAY_LIMIT, frame->pcnt_inter));
+  return MAX(sr_decay, MIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
 }
 
 // This function gives an estimate of how badly we believe the prediction
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 12882e432..eb01bb279 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1783,9 +1783,8 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
   return (center - (bw >> 1));
 }
 
-static const MV search_pos[9] = {
-  {-1, -1}, {-1, 0}, {-1, 1}, {0, -1}, {0, 0}, {0, 1},
-  {1, -1}, {1, 0}, {1, 1},
+static const MV search_pos[5] = {
+    {-1, 0}, {0, -1}, {0, 0}, {0, 1}, {1, 0},
 };
 
 unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
@@ -1804,7 +1803,7 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
   const int ref_stride = xd->plane[0].pre[0].stride;
   uint8_t const *ref_buf, *src_buf;
   MV *tmp_mv = &xd->mi[0].src_mi->mbmi.mv[0].as_mv;
-  int best_sad;
+  int best_sad, tmp_sad, this_sad[5];
   MV this_mv;
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -1845,21 +1844,40 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
 
   best_sad = INT_MAX;
   this_mv = *tmp_mv;
-  for (idx = 0; idx < 9; ++idx) {
-    int this_sad;
-    src_buf = x->plane[0].src.buf;
+  src_buf = x->plane[0].src.buf;
+  for (idx = 0; idx < 5; ++idx) {
     ref_buf = xd->plane[0].pre[0].buf +
         (search_pos[idx].row + this_mv.row) * ref_stride +
         (search_pos[idx].col + this_mv.col);
 
-    this_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride,
-                                      ref_buf, ref_stride);
-    if (this_sad < best_sad) {
-      best_sad = this_sad;
+    this_sad[idx] = cpi->fn_ptr[bsize].sdf(src_buf, src_stride,
+                                           ref_buf, ref_stride);
+    if (this_sad[idx] < best_sad) {
+      best_sad = this_sad[idx];
       tmp_mv->row = search_pos[idx].row + this_mv.row;
       tmp_mv->col = search_pos[idx].col + this_mv.col;
     }
   }
+
+  if (this_sad[0] < this_sad[4])
+    this_mv.row -= 1;
+  else
+    this_mv.row += 1;
+
+  if (this_sad[1] < this_sad[3])
+    this_mv.col -= 1;
+  else
+    this_mv.col += 1;
+
+  ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+
+  tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride,
+                                   ref_buf, ref_stride);
+  if (best_sad > tmp_sad) {
+    *tmp_mv = this_mv;
+    best_sad = tmp_sad;
+  }
+
   tmp_mv->row *= 8;
   tmp_mv->col *= 8;
 
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index c820651fb..88003ec17 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -217,6 +217,8 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
   const uint32_t ac_quant = pd->dequant[1];
   unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
                                            pd->dst.buf, pd->dst.stride, &sse);
+  int skip_dc = 0;
+
   *var_y = var;
   *sse_y = sse;
 
@@ -258,6 +260,9 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
       // Check if dc coefficient can be quantized to zero.
       if (sse_tx - var_tx < dc_thr || sse == var)
         x->skip_txfm[0] = 1;
+    } else {
+      if (sse_tx - var_tx < dc_thr || sse == var)
+        skip_dc = 1;
     }
   }
 
@@ -267,21 +272,28 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
     return;
   }
 
+  if (!skip_dc) {
 #if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
-                                 dc_quant >> (xd->bd - 5), &rate, &dist);
-  } else {
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+                                   dc_quant >> (xd->bd - 5), &rate, &dist);
+    } else {
+      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+                                   dc_quant >> 3, &rate, &dist);
+    }
+#else
     vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
                                  dc_quant >> 3, &rate, &dist);
-  }
-#else
-  vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
-                               dc_quant >> 3, &rate, &dist);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
 
-  *out_rate_sum = rate >> 1;
-  *out_dist_sum = dist << 3;
+  if (!skip_dc) {
+    *out_rate_sum = rate >> 1;
+    *out_dist_sum = dist << 3;
+  } else {
+    *out_rate_sum = 0;
+    *out_dist_sum = (sse - var) << 4;
+  }
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -590,13 +602,27 @@ void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
   *rd_cost = best_rdc;
 }
 
-static const PREDICTION_MODE inter_mode_set[INTER_MODES] = {
-    ZEROMV, NEARESTMV, NEARMV, NEWMV,
-};
-
 static const int ref_frame_cost[MAX_REF_FRAMES] = {
     1235, 229, 530, 615,
 };
+
+typedef struct {
+  MV_REFERENCE_FRAME ref_frame;
+  PREDICTION_MODE pred_mode;
+} REF_MODE;
+
+#define RT_INTER_MODES 8
+static const REF_MODE ref_mode_set[RT_INTER_MODES] = {
+    {LAST_FRAME, ZEROMV},
+    {LAST_FRAME, NEARESTMV},
+    {LAST_FRAME, NEARMV},
+    {LAST_FRAME, NEWMV},
+    {GOLDEN_FRAME, ZEROMV},
+    {GOLDEN_FRAME, NEARESTMV},
+    {GOLDEN_FRAME, NEARMV},
+    {GOLDEN_FRAME, NEWMV}
+};
+
 // TODO(jingning) placeholder for inter-frame non-RD mode decision.
 // this needs various further optimizations. to be continued..
 void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
@@ -653,6 +679,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   const int pixels_in_block = bh * bw;
   int reuse_inter_pred = cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready;
   int ref_frame_skip_mask = 0;
+  int idx;
 
   if (reuse_inter_pred) {
     int i;
@@ -736,9 +763,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   if (cpi->rc.frames_since_golden == 0)
     ref_frame_skip_mask |= (1 << GOLDEN_FRAME);
 
-  for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
-    PREDICTION_MODE this_mode;
-    int i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
+  for (idx = 0; idx < RT_INTER_MODES; ++idx) {
+    int rate_mv = 0;
+    int mode_rd_thresh;
+    int mode_index;
+    int i;
+    PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
+
+    ref_frame = ref_mode_set[idx].ref_frame;
+    mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
+
+    i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
 
     if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
       continue;
@@ -760,212 +795,194 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     mbmi->ref_frame[0] = ref_frame;
     set_ref_ptrs(cm, xd, ref_frame, NONE);
 
-    for (i = 0; i < INTER_MODES; ++i) {
-      int rate_mv = 0;
-      int mode_rd_thresh;
-      int mode_index;
-      this_mode = inter_mode_set[i];
-      mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
+    if (const_motion[ref_frame] && this_mode == NEARMV)
+      continue;
 
-      if (const_motion[ref_frame] && this_mode == NEARMV)
-        continue;
+    if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
+      continue;
 
-      if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
-        continue;
+    mode_rd_thresh = best_mode_skip_txfm ?
+            rd_threshes[mode_index] << 1 : rd_threshes[mode_index];
+    if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
+                            rd_thresh_freq_fact[mode_index]))
+      continue;
 
-      mode_rd_thresh = best_mode_skip_txfm ? rd_threshes[mode_index] << 1 :
-                                             rd_threshes[mode_index];
-      if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
-                              rd_thresh_freq_fact[mode_index]))
+    if (this_mode == NEWMV) {
+      if (cpi->sf.partition_search_type != VAR_BASED_PARTITION
+          && best_rdc.rdcost < (int64_t) (1 << num_pels_log2_lookup[bsize]))
         continue;
+      if (ref_frame > LAST_FRAME) {
+        int tmp_sad;
+        int dis, cost_list[5];
 
-      if (this_mode == NEWMV) {
-        if (cpi->sf.partition_search_type != VAR_BASED_PARTITION &&
-            best_rdc.rdcost < (int64_t)(1 << num_pels_log2_lookup[bsize]))
+        if (bsize < BLOCK_16X16)
           continue;
 
-        if (ref_frame > LAST_FRAME) {
-          int tmp_sad;
-          int dis, cost_list[5];
-
-          if (bsize < BLOCK_16X16)
-            continue;
-
-          tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
-          if (tmp_sad > x->pred_mv_sad[LAST_FRAME])
-            continue;
-
-          frame_mv[NEWMV][ref_frame].as_int = mbmi->mv[0].as_int;
-          rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv,
-                                    &mbmi->ref_mvs[ref_frame][0].as_mv,
-                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-          frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
-          frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
-
-          cpi->find_fractional_mv_step(x, &frame_mv[NEWMV][ref_frame].as_mv,
-                                       &mbmi->ref_mvs[ref_frame][0].as_mv,
-                                       cpi->common.allow_high_precision_mv,
-                                       x->errorperbit,
-                                       &cpi->fn_ptr[bsize],
-                                       cpi->sf.mv.subpel_force_stop,
-                                       cpi->sf.mv.subpel_iters_per_step,
-                                       cond_cost_list(cpi, cost_list),
-                                       x->nmvjointcost, x->mvcost, &dis,
-                                       &x->pred_sse[ref_frame], NULL, 0, 0);
-        } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                           &frame_mv[NEWMV][ref_frame],
-                                           &rate_mv, best_rdc.rdcost)) {
+        tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
+        if (tmp_sad > x->pred_mv_sad[LAST_FRAME])
           continue;
-        }
-      }
 
-      if (this_mode != NEARESTMV &&
-          frame_mv[this_mode][ref_frame].as_int ==
-              frame_mv[NEARESTMV][ref_frame].as_int)
+        frame_mv[NEWMV][ref_frame].as_int = mbmi->mv[0].as_int;
+        rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv,
+          &mbmi->ref_mvs[ref_frame][0].as_mv,
+          x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+        frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
+        frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
+
+        cpi->find_fractional_mv_step(x, &frame_mv[NEWMV][ref_frame].as_mv,
+          &mbmi->ref_mvs[ref_frame][0].as_mv,
+          cpi->common.allow_high_precision_mv,
+          x->errorperbit,
+          &cpi->fn_ptr[bsize],
+          cpi->sf.mv.subpel_force_stop,
+          cpi->sf.mv.subpel_iters_per_step,
+          cond_cost_list(cpi, cost_list),
+          x->nmvjointcost, x->mvcost, &dis,
+          &x->pred_sse[ref_frame], NULL, 0, 0);
+      } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+        &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost)) {
         continue;
+      }
+    }
 
-      mbmi->mode = this_mode;
-      mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+    if (this_mode != NEARESTMV && frame_mv[this_mode][ref_frame].as_int ==
+        frame_mv[NEARESTMV][ref_frame].as_int)
+      continue;
 
-      // Search for the best prediction filter type, when the resulting
-      // motion vector is at sub-pixel accuracy level for luma component, i.e.,
-      // the last three bits are all zeros.
-      if (reuse_inter_pred) {
-        if (!this_mode_pred) {
-          this_mode_pred = &tmp[3];
-        } else {
-          this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
-          pd->dst.buf = this_mode_pred->data;
-          pd->dst.stride = bw;
-        }
+    mbmi->mode = this_mode;
+    mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+
+    // Search for the best prediction filter type, when the resulting
+    // motion vector is at sub-pixel accuracy level for luma component, i.e.,
+    // the last three bits are not all zeros.
+    if (reuse_inter_pred) {
+      if (!this_mode_pred) {
+        this_mode_pred = &tmp[3];
+      } else {
+        this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+        pd->dst.buf = this_mode_pred->data;
+        pd->dst.stride = bw;
       }
+    }
 
-      if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
-          pred_filter_search && (ref_frame == LAST_FRAME) &&
-          ((mbmi->mv[0].as_mv.row & 0x07) != 0 ||
-           (mbmi->mv[0].as_mv.col & 0x07) != 0)) {
-        int pf_rate[3];
-        int64_t pf_dist[3];
-        unsigned int pf_var[3];
-        unsigned int pf_sse[3];
-        TX_SIZE pf_tx_size[3];
-        int64_t best_cost = INT64_MAX;
-        INTERP_FILTER best_filter = SWITCHABLE, filter;
-        PRED_BUFFER *current_pred = this_mode_pred;
-
-        for (filter = EIGHTTAP; filter <= EIGHTTAP_SHARP; ++filter) {
-          int64_t cost;
-          mbmi->interp_filter = filter;
-          vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
-          model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter],
-                            &pf_dist[filter], &pf_var[filter], &pf_sse[filter]);
-          pf_rate[filter] += vp9_get_switchable_rate(cpi, xd);
-          cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]);
-          pf_tx_size[filter] = mbmi->tx_size;
-          if (cost < best_cost) {
-            best_filter = filter;
-            best_cost = cost;
-            skip_txfm = x->skip_txfm[0];
-
-            if (reuse_inter_pred) {
-              if (this_mode_pred != current_pred) {
-                free_pred_buffer(this_mode_pred);
-                this_mode_pred = current_pred;
-              }
-
-              if (filter < EIGHTTAP_SHARP) {
-                current_pred = &tmp[get_pred_buffer(tmp, 3)];
-                pd->dst.buf = current_pred->data;
-                pd->dst.stride = bw;
-              }
+    if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search
+        && (ref_frame == LAST_FRAME)
+        && (((mbmi->mv[0].as_mv.row | mbmi->mv[0].as_mv.col) & 0x07) != 0)) {
+      int pf_rate[3];
+      int64_t pf_dist[3];
+      unsigned int pf_var[3];
+      unsigned int pf_sse[3];
+      TX_SIZE pf_tx_size[3];
+      int64_t best_cost = INT64_MAX;
+      INTERP_FILTER best_filter = SWITCHABLE, filter;
+      PRED_BUFFER *current_pred = this_mode_pred;
+
+      for (filter = EIGHTTAP; filter <= EIGHTTAP_SHARP; ++filter) {
+        int64_t cost;
+        mbmi->interp_filter = filter;
+        vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter],
+                          &pf_var[filter], &pf_sse[filter]);
+        pf_rate[filter] += vp9_get_switchable_rate(cpi, xd);
+        cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]);
+        pf_tx_size[filter] = mbmi->tx_size;
+        if (cost < best_cost) {
+          best_filter = filter;
+          best_cost = cost;
+          skip_txfm = x->skip_txfm[0];
+
+          if (reuse_inter_pred) {
+            if (this_mode_pred != current_pred) {
+              free_pred_buffer(this_mode_pred);
+              this_mode_pred = current_pred;
+            }
+
+            if (filter < EIGHTTAP_SHARP) {
+              current_pred = &tmp[get_pred_buffer(tmp, 3)];
+              pd->dst.buf = current_pred->data;
+              pd->dst.stride = bw;
             }
           }
         }
-
-        if (reuse_inter_pred && this_mode_pred != current_pred)
-          free_pred_buffer(current_pred);
-
-        mbmi->interp_filter = best_filter;
-        mbmi->tx_size = pf_tx_size[mbmi->interp_filter];
-        this_rdc.rate = pf_rate[mbmi->interp_filter];
-        this_rdc.dist = pf_dist[mbmi->interp_filter];
-        var_y = pf_var[mbmi->interp_filter];
-        sse_y = pf_sse[mbmi->interp_filter];
-        x->skip_txfm[0] = skip_txfm;
-      } else {
-        mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP: filter_ref;
-        vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
-        model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
-                          &var_y, &sse_y);
-        this_rdc.rate += cm->interp_filter == SWITCHABLE ?
-            vp9_get_switchable_rate(cpi, xd) : 0;
       }
 
-      // chroma component rate-distortion cost modeling
-      if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
-        int uv_rate = 0;
-        int64_t uv_dist = 0;
-        if (x->color_sensitivity[0])
-          vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
-        if (x->color_sensitivity[1])
-          vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
-        model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist,
-                           &var_y, &sse_y);
-        this_rdc.rate += uv_rate;
-        this_rdc.dist += uv_dist;
-      }
+      if (reuse_inter_pred && this_mode_pred != current_pred)
+        free_pred_buffer(current_pred);
 
-      this_rdc.rate += rate_mv;
-      this_rdc.rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
-                                  [INTER_OFFSET(this_mode)];
-      this_rdc.rate += ref_frame_cost[ref_frame];
-      this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
-                               this_rdc.rate, this_rdc.dist);
+      mbmi->interp_filter = best_filter;
+      mbmi->tx_size = pf_tx_size[mbmi->interp_filter];
+      this_rdc.rate = pf_rate[mbmi->interp_filter];
+      this_rdc.dist = pf_dist[mbmi->interp_filter];
+      var_y = pf_var[mbmi->interp_filter];
+      sse_y = pf_sse[mbmi->interp_filter];
+      x->skip_txfm[0] = skip_txfm;
+    } else {
+      mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref;
+      vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+      model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+                        &var_y, &sse_y);
+      this_rdc.rate +=
+          cm->interp_filter == SWITCHABLE ?
+              vp9_get_switchable_rate(cpi, xd) : 0;
+    }
 
-      // Skipping checking: test to see if this block can be reconstructed by
-      // prediction only.
-      if (cpi->allow_encode_breakout) {
-        encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame,
-                             this_mode, var_y, sse_y, yv12_mb,
-                             &this_rdc.rate, &this_rdc.dist);
-        if (x->skip) {
-          this_rdc.rate += rate_mv;
-          this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
-                                   this_rdc.rate, this_rdc.dist);
-        }
+    // chroma component rate-distortion cost modeling
+    if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
+      int uv_rate = 0;
+      int64_t uv_dist = 0;
+      if (x->color_sensitivity[0])
+        vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
+      if (x->color_sensitivity[1])
+        vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
+      model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist, &var_y, &sse_y);
+      this_rdc.rate += uv_rate;
+      this_rdc.dist += uv_dist;
+    }
+
+    this_rdc.rate += rate_mv;
+    this_rdc.rate +=
+        cpi->inter_mode_cost[mbmi->mode_context[ref_frame]][INTER_OFFSET(
+            this_mode)];
+    this_rdc.rate += ref_frame_cost[ref_frame];
+    this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
+
+    // Skip check: test to see if this block can be reconstructed by
+    // prediction only.
+    if (cpi->allow_encode_breakout) {
+      encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode,
+                           var_y, sse_y, yv12_mb, &this_rdc.rate,
+                           &this_rdc.dist);
+      if (x->skip) {
+        this_rdc.rate += rate_mv;
+        this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate,
+                                 this_rdc.dist);
       }
+    }
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
-      if (cpi->oxcf.noise_sensitivity > 0)
-        vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx);
+    if (cpi->oxcf.noise_sensitivity > 0)
+      vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx);
 #else
-      (void)ctx;
+    (void)ctx;
 #endif
 
-      if (this_rdc.rdcost < best_rdc.rdcost || x->skip) {
-        best_rdc = this_rdc;
-        best_mode = this_mode;
-        best_pred_filter = mbmi->interp_filter;
-        best_tx_size = mbmi->tx_size;
-        best_ref_frame = ref_frame;
-        best_mode_skip_txfm = x->skip_txfm[0];
-
-        if (reuse_inter_pred) {
-          free_pred_buffer(best_pred);
-          best_pred = this_mode_pred;
-        }
-      } else {
-        if (reuse_inter_pred)
-          free_pred_buffer(this_mode_pred);
-      }
+    if (this_rdc.rdcost < best_rdc.rdcost || x->skip) {
+      best_rdc = this_rdc;
+      best_mode = this_mode;
+      best_pred_filter = mbmi->interp_filter;
+      best_tx_size = mbmi->tx_size;
+      best_ref_frame = ref_frame;
+      best_mode_skip_txfm = x->skip_txfm[0];
 
-      if (x->skip)
-        break;
+      if (reuse_inter_pred) {
+        free_pred_buffer(best_pred);
+        best_pred = this_mode_pred;
+      }
+    } else {
+      if (reuse_inter_pred)
+        free_pred_buffer(this_mode_pred);
     }
 
-    // Check that a prediction mode has been selected.
-    assert(best_rdc.rdcost < INT64_MAX);
-
     if (x->skip)
       break;
   }
diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c
index 5c0ad7892..a1a2bda80 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -102,99 +102,56 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
       const __m128i r2 = _mm_sub_epi16(q1, q2);
       const __m128i r3 = _mm_sub_epi16(q0, q3);
       // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i t0 = _mm_add_epi16(r0, r1);
-      const __m128i t1 = _mm_sub_epi16(r0, r1);
+      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
       const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
       const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
 
-      const __m128i u0 = _mm_mulhrs_epi16(t0, k__dual_p16_p16);
-      const __m128i u1 = _mm_mulhrs_epi16(t1, k__dual_p16_p16);
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+
       const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
       const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
       const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
       const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
       // dct_const_round_shift
-      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-      // Combine
-      res0 = u0;
-      res4 = u1;
-      res2 = _mm_packs_epi32(w4, w5);
-      res6 = _mm_packs_epi32(w6, w7);
-    }
-    // Work on next four results
-    if (pass == 1) {
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
-      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
-      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
-      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
-      // dct_const_round_shift
-      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
-      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
-      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
-      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
-      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
-      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
-      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
-      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
-      // Combine
-      const __m128i r0 = _mm_packs_epi32(s0, s1);
-      const __m128i r1 = _mm_packs_epi32(s2, s3);
-      // Add/subtract
-      const __m128i x0 = _mm_add_epi16(q4, r0);
-      const __m128i x1 = _mm_sub_epi16(q4, r0);
-      const __m128i x2 = _mm_sub_epi16(q7, r1);
-      const __m128i x3 = _mm_add_epi16(q7, r1);
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
-      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
-      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
-      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
-      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
-      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
-      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
-      // dct_const_round_shift
+
       const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
       const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
       const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
       const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+
       const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
       const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
       const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
       const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
       const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
       const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
       const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
       const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+
       const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
       const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
       const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
       const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
       // Combine
-      res1 = _mm_packs_epi32(w0, w1);
-      res7 = _mm_packs_epi32(w2, w3);
-      res5 = _mm_packs_epi32(w4, w5);
-      res3 = _mm_packs_epi32(w6, w7);
-    } else {
+
+      res0 = _mm_packs_epi32(w0, w1);
+      res4 = _mm_packs_epi32(w2, w3);
+      res2 = _mm_packs_epi32(w4, w5);
+      res6 = _mm_packs_epi32(w6, w7);
+    }
+    // Work on next four results
+    {
       // Interleave to do the multiply by constants which gets us into 32bits
       const __m128i d0 = _mm_sub_epi16(q6, q5);
       const __m128i d1 = _mm_add_epi16(q6, q5);
       const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16);
       const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16);
+
       // Add/subtract
       const __m128i x0 = _mm_add_epi16(q4, r0);
       const __m128i x1 = _mm_sub_epi16(q4, r0);
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index d1dd66b2c..fb6aac721 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -694,7 +694,7 @@ static vpx_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(
     vpx_codec_alg_priv_t *ctx, va_list args) {
   struct vp9_extracfg extra_cfg = ctx->extra_cfg;
   extra_cfg.gf_cbr_boost_pct =
-      CAST(VP8E_SET_GF_CBR_BOOST_PCT, args);
+      CAST(VP9E_SET_GF_CBR_BOOST_PCT, args);
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -1387,7 +1387,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   {VP8E_SET_CQ_LEVEL,                 ctrl_set_cq_level},
   {VP8E_SET_MAX_INTRA_BITRATE_PCT,    ctrl_set_rc_max_intra_bitrate_pct},
   {VP8E_SET_MAX_INTER_BITRATE_PCT,    ctrl_set_rc_max_inter_bitrate_pct},
-  {VP8E_SET_GF_CBR_BOOST_PCT,         ctrl_set_rc_gf_cbr_boost_pct},
+  {VP9E_SET_GF_CBR_BOOST_PCT,         ctrl_set_rc_gf_cbr_boost_pct},
   {VP9E_SET_LOSSLESS,                 ctrl_set_lossless},
   {VP9E_SET_FRAME_PARALLEL_DECODING,  ctrl_set_frame_parallel_decoding_mode},
   {VP9E_SET_AQ_MODE,                  ctrl_set_aq_mode},