9 files changed, 561 insertions, 446 deletions
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index d17952487..6894f553f 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1923,9 +1923,6 @@ static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
 
     vp9_zero(cpi->mb.pred_mv);
 
-    if (cpi->sf.reference_masking)
-      rd_pick_reference_frame(cpi, tile, mi_row, mi_col);
-
     if (cpi->sf.use_lastframe_partitioning ||
         cpi->sf.use_one_partition_size_always ) {
       const int idx_str = cm->mode_info_stride * mi_row + mi_col;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index c51ce9f54..500255748 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -106,6 +106,7 @@ static int lookup_next_frame_stats(const struct twopass_rc *p,
   return 1;
 }
 
+
 // Read frame stats at an offset from the current position
 static int read_frame_stats(const struct twopass_rc *p,
                             FIRSTPASS_STATS *frame_stats, int offset) {
@@ -149,7 +150,7 @@ static void output_stats(const VP9_COMP            *cpi,
     FILE *fpfile;
     fpfile = fopen("firstpass.stt", "a");
 
-    fprintf(stdout, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
+    fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
             "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
             "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
             stats->frame,
@@ -349,17 +350,14 @@ static double simple_weight(YV12_BUFFER_CONFIG *source) {
 }
 
 
-// This function returns the current per frame maximum bitrate target.
+// This function returns the maximum target rate per frame.
 static int frame_max_bits(VP9_COMP *cpi) {
-  // Max allocation for a single frame based on the max section guidelines
-  // passed in and how many bits are left.
-  // For VBR base this on the bits and frames left plus the
-  // two_pass_vbrmax_section rate passed in by the user.
-  const double max_bits = (1.0 * cpi->twopass.bits_left /
-      (cpi->twopass.total_stats.count - cpi->common.current_video_frame)) *
-      (cpi->oxcf.two_pass_vbrmax_section / 100.0);
+  int64_t max_bits =
+     ((int64_t)cpi->rc.av_per_frame_bandwidth *
+      (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
+
   if (max_bits < 0)
-      return 0;
+    return 0;
   if (max_bits >= INT_MAX)
     return INT_MAX;
   return (int)max_bits;
@@ -716,7 +714,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
           mv.as_mv.row *= 8;
           mv.as_mv.col *= 8;
           this_error = motion_error;
-          vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
+          vp9_set_mbmode_and_mvs(xd, NEWMV, &mv.as_mv);
           xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
           xd->mi_8x8[0]->mbmi.ref_frame[0] = LAST_FRAME;
           xd->mi_8x8[0]->mbmi.ref_frame[1] = NONE;
@@ -1662,7 +1660,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
   // Don't allow a gf too near the next kf
   if ((cpi->rc.frames_to_key - i) < MIN_GF_INTERVAL) {
-    while (i < cpi->rc.frames_to_key) {
+    while (i < (cpi->rc.frames_to_key + !cpi->rc.next_key_frame_forced)) {
       i++;
 
       if (EOF == input_stats(&cpi->twopass, this_frame))
@@ -1697,6 +1695,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   if (allow_alt_ref &&
       (i < cpi->oxcf.lag_in_frames) &&
       (i >= MIN_GF_INTERVAL) &&
+      // For real scene cuts (not forced kfs) don't allow arf very near kf.
+      (cpi->rc.next_key_frame_forced ||
+        (i <= (cpi->rc.frames_to_key - MIN_GF_INTERVAL))) &&
       ((next_frame.pcnt_inter > 0.75) ||
        (next_frame.pcnt_second_ref > 0.5)) &&
       ((mv_in_out_accumulator / (double)i > -0.2) ||
@@ -1765,18 +1766,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 #endif
 #endif
 
-  // Now decide how many bits should be allocated to the GF group as  a
-  // proportion of those remaining in the kf group.
-  // The final key frame group in the clip is treated as a special case
-  // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
-  // This is also important for short clips where there may only be one
-  // key frame.
-  if (cpi->rc.frames_to_key >= (int)(cpi->twopass.total_stats.count -
-                                          cpi->common.current_video_frame)) {
-    cpi->twopass.kf_group_bits =
-      (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
-  }
-
   // Calculate the bits to be allocated to the group as a whole
   if ((cpi->twopass.kf_group_bits > 0) &&
       (cpi->twopass.kf_group_error_left > 0)) {
@@ -1836,7 +1825,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     // If the frame that is to be boosted is simpler than the average for
     // the gf/arf group then use an alternative calculation
     // based on the error score of the frame itself
-    if (mod_frame_err < gf_group_err / (double)cpi->rc.baseline_gf_interval) {
+    if (cpi->rc.baseline_gf_interval < 1 ||
+        mod_frame_err < gf_group_err / (double)cpi->rc.baseline_gf_interval) {
       double alt_gf_grp_bits =
         (double)cpi->twopass.kf_group_bits  *
         (mod_frame_err * (double)cpi->rc.baseline_gf_interval) /
@@ -1863,9 +1853,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     if (gf_bits < 0)
       gf_bits = 0;
 
-    // Add in minimum for a frame
-    gf_bits += cpi->rc.min_frame_bandwidth;
-
     if (i == 0) {
       cpi->twopass.gf_bits = gf_bits;
     }
@@ -1899,8 +1886,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       cpi->twopass.gf_group_error_left = (int64_t)gf_group_err;
     }
 
-    cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits
-        - cpi->rc.min_frame_bandwidth;
+    cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits;
 
     if (cpi->twopass.gf_group_bits < 0)
       cpi->twopass.gf_group_bits = 0;
@@ -1985,9 +1971,6 @@ static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   if (cpi->twopass.gf_group_bits < 0)
     cpi->twopass.gf_group_bits = 0;
 
-  // Add in the minimum number of bits that is set aside for every frame.
-  target_frame_size += cpi->rc.min_frame_bandwidth;
-
   // Per frame bit target for this frame.
   cpi->rc.per_frame_bandwidth = target_frame_size;
 }
@@ -2029,6 +2012,22 @@ void vp9_get_one_pass_params(VP9_COMP *cpi) {
   }
 }
 
+void vp9_get_one_pass_cbr_params(VP9_COMP *cpi) {  // Sets frame type + GF state.
+  VP9_COMMON *const cm = &cpi->common;
+  if ((cm->current_video_frame == 0 ||  // Key frame when: frame counter is 0,
+      cm->frame_flags & FRAMEFLAGS_KEY ||  // the FRAMEFLAGS_KEY bit is set,
+      cpi->rc.frames_to_key == 0 ||  // frames_to_key is zero,
+      (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) {  // or auto_key test.
+    cm->frame_type = KEY_FRAME;
+    cpi->rc.frames_to_key = cpi->key_frame_frequency;  // Reset to the frequency.
+  } else {
+    cm->frame_type = INTER_FRAME;
+  }
+  // Don't use gf_update by default in CBR mode (applies to every frame type).
+  cpi->rc.frames_till_gf_update_due = INT_MAX;
+  cpi->rc.baseline_gf_interval = INT_MAX;
+}
+
 void vp9_get_first_pass_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   if (!cpi->refresh_alt_ref_frame &&
@@ -2265,8 +2264,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   vp9_zero(next_frame);
 
   vp9_clear_system_state();  // __asm emms;
-  start_position = cpi->twopass.stats_in;
 
+  start_position = cpi->twopass.stats_in;
   cpi->common.frame_type = KEY_FRAME;
 
   // is this a forced key frame by interval
@@ -2348,7 +2347,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // interval is between 1x and 2x
   if (cpi->oxcf.auto_key
       && cpi->rc.frames_to_key > (int)cpi->key_frame_frequency) {
-    FIRSTPASS_STATS *current_pos = cpi->twopass.stats_in;
     FIRSTPASS_STATS tmp_frame;
 
     cpi->rc.frames_to_key /= 2;
@@ -2373,15 +2371,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       // Load a the next frame's stats
       input_stats(&cpi->twopass, &tmp_frame);
     }
-
-    // Reset to the start of the group
-    reset_fpf_position(&cpi->twopass, current_pos);
-
+    cpi->rc.next_key_frame_forced = 1;
+  } else if (cpi->twopass.stats_in == cpi->twopass.stats_in_end) {
     cpi->rc.next_key_frame_forced = 1;
   } else {
     cpi->rc.next_key_frame_forced = 0;
   }
-  // Special case for the last frame of the file
+
+  // Special case for the last key frame of the file
   if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) {
     // Accumulate kf group error
     kf_group_err += calculate_modified_err(cpi, this_frame);
@@ -2566,8 +2563,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     }
 
     cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
-    // Add in the minimum frame allowance
-    cpi->twopass.kf_bits += cpi->rc.min_frame_bandwidth;
 
     // Peer frame bit target for this frame
     cpi->rc.per_frame_bandwidth = cpi->twopass.kf_bits;
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 43703c2c5..f89e4cb1c 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -22,6 +22,7 @@ void vp9_end_second_pass(VP9_COMP *cpi);
 
 void vp9_get_first_pass_params(VP9_COMP *cpi);
 void vp9_get_one_pass_params(VP9_COMP *cpi);
+void vp9_get_one_pass_cbr_params(VP9_COMP *cpi);
 void vp9_get_svc_params(VP9_COMP *cpi);
 
 #endif  // VP9_ENCODER_VP9_FIRSTPASS_H_
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index f3ddd39b6..c50098678 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -23,7 +23,7 @@
 
 static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
                                               const MV *ref_mv,
-                                              int_mv *dst_mv,
+                                              MV *dst_mv,
                                               int mb_row,
                                               int mb_col) {
   MACROBLOCK   *const x  = &cpi->mb;
@@ -35,7 +35,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
   const int tmp_col_max = x->mv_col_max;
   const int tmp_row_min = x->mv_row_min;
   const int tmp_row_max = x->mv_row_max;
-  int_mv ref_full;
+  MV ref_full;
 
   // Further step/diamond searches as necessary
   int step_param = cpi->sf.reduce_first_step_size +
@@ -44,12 +44,12 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
 
   vp9_set_mv_search_range(x, ref_mv);
 
-  ref_full.as_mv.col = ref_mv->col >> 3;
-  ref_full.as_mv.row = ref_mv->row >> 3;
+  ref_full.col = ref_mv->col >> 3;
+  ref_full.row = ref_mv->row >> 3;
 
   /*cpi->sf.search_method == HEX*/
-  best_err = vp9_hex_search(x, &ref_full.as_mv, step_param, x->errorperbit,
-                            0, &v_fn_ptr, 0, ref_mv, &dst_mv->as_mv);
+  best_err = vp9_hex_search(x, &ref_full, step_param, x->errorperbit,
+                            0, &v_fn_ptr, 0, ref_mv, dst_mv);
 
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
@@ -57,15 +57,14 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
     int distortion;
     unsigned int sse;
     best_err = cpi->find_fractional_mv_step(
-        x,
-        &dst_mv->as_mv, ref_mv,
+        x, dst_mv, ref_mv,
         cpi->common.allow_high_precision_mv,
         x->errorperbit, &v_fn_ptr,
         0, cpi->sf.subpel_iters_per_step, NULL, NULL,
         & distortion, &sse);
   }
 
-  vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv);
+  vp9_set_mbmode_and_mvs(xd, NEWMV, dst_mv);
   vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16);
   best_err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
                           xd->plane[0].dst.buf, xd->plane[0].dst.stride,
@@ -96,7 +95,7 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const int_mv *ref_mv,
 
   // Test last reference frame using the previous best mv as the
   // starting point (best reference) for the search
-  tmp_err = do_16x16_motion_iteration(cpi, &ref_mv->as_mv, &tmp_mv,
+  tmp_err = do_16x16_motion_iteration(cpi, &ref_mv->as_mv, &tmp_mv.as_mv,
                                       mb_row, mb_col);
   if (tmp_err < err) {
     err = tmp_err;
@@ -110,7 +109,7 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const int_mv *ref_mv,
     int_mv zero_ref_mv, tmp_mv;
 
     zero_ref_mv.as_int = 0;
-    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv.as_mv, &tmp_mv,
+    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv.as_mv, &tmp_mv.as_mv,
                                         mb_row, mb_col);
     if (tmp_err < err) {
       dst_mv->as_int = tmp_mv.as_int;
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 7a1f5c1a9..291a55ecb 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -581,6 +581,177 @@ static void set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi, int mode) {
     sf->thresh_mult_sub8x8[THR_COMP_GA] = INT_MAX;
 }
 
+static void set_good_speed_feature(VP9_COMMON *cm,
+                                   SPEED_FEATURES *sf,
+                                   int speed) {  // Fills *sf for this speed.
+  int i;  // Only used by the speed 5 loop below.
+  sf->adaptive_rd_thresh = 1;  // Baseline; each of speeds 1-5 overwrites it.
+  sf->recode_loop = (speed < 1);  // 1 only when speed < 1; speeds 1-2 set 2.
+  if (speed == 1) {
+    sf->use_square_partition_only = !frame_is_intra_only(cm);
+    sf->less_rectangular_check  = 1;
+    sf->tx_size_search_method = frame_is_intra_only(cm)
+      ? USE_FULL_RD : USE_LARGESTALL;
+
+    if (MIN(cm->width, cm->height) >= 720)  // Smaller dimension is >= 720.
+      sf->disable_split_mask = cm->show_frame ?
+        DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+    else
+      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+
+    sf->use_rd_breakout = 1;
+    sf->adaptive_motion_search = 1;
+    sf->adaptive_pred_filter_type = 1;
+    sf->auto_mv_step_size = 1;
+    sf->adaptive_rd_thresh = 2;
+    sf->recode_loop = 2;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+  }
+  if (speed == 2) {
+    sf->use_square_partition_only = !frame_is_intra_only(cm);
+    sf->less_rectangular_check  = 1;
+    sf->tx_size_search_method = frame_is_intra_only(cm)
+      ? USE_FULL_RD : USE_LARGESTALL;
+
+    if (MIN(cm->width, cm->height) >= 720)  // Smaller dimension is >= 720.
+      sf->disable_split_mask = cm->show_frame ?
+        DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+    else
+      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+                                 FLAG_SKIP_INTRA_BESTINTER |
+                                 FLAG_SKIP_COMP_BESTINTRA |
+                                 FLAG_SKIP_INTRA_LOWVAR;
+    sf->use_rd_breakout = 1;
+    sf->adaptive_motion_search = 1;
+    sf->adaptive_pred_filter_type = 2;
+    sf->reference_masking = 1;
+    sf->auto_mv_step_size = 1;
+
+    sf->disable_filter_search_var_thresh = 50;
+    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+
+    sf->auto_min_max_partition_size = 1;
+    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
+    sf->adjust_partitioning_from_last_frame = 1;
+    sf->last_partitioning_redo_frequency = 3;
+
+    sf->adaptive_rd_thresh = 2;
+    sf->recode_loop = 2;
+    sf->use_lp32x32fdct = 1;
+    sf->mode_skip_start = 11;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+  }
+  if (speed == 3) {
+    sf->use_square_partition_only = 1;
+    sf->tx_size_search_method = USE_LARGESTALL;
+
+    if (MIN(cm->width, cm->height) >= 720)  // Smaller dimension is >= 720.
+      sf->disable_split_mask = DISABLE_ALL_SPLIT;
+    else
+      sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+      FLAG_SKIP_INTRA_BESTINTER |
+      FLAG_SKIP_COMP_BESTINTRA |
+      FLAG_SKIP_INTRA_LOWVAR;
+
+    sf->use_rd_breakout = 1;
+    sf->adaptive_motion_search = 1;
+    sf->adaptive_pred_filter_type = 2;
+    sf->reference_masking = 1;
+    sf->auto_mv_step_size = 1;
+
+    sf->disable_filter_search_var_thresh = 100;
+    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+
+    sf->auto_min_max_partition_size = 1;
+    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
+    sf->adjust_partitioning_from_last_frame = 1;
+    sf->last_partitioning_redo_frequency = 3;
+
+    sf->use_uv_intra_rd_estimate = 1;
+    sf->skip_encode_sb = 1;
+    sf->use_lp32x32fdct = 1;
+    sf->subpel_iters_per_step = 1;
+    sf->use_fast_coef_updates = 2;
+
+    sf->adaptive_rd_thresh = 4;
+    sf->mode_skip_start = 6;
+  }
+  if (speed == 4) {
+    sf->use_square_partition_only = 1;
+    sf->tx_size_search_method = USE_LARGESTALL;
+    sf->disable_split_mask = DISABLE_ALL_SPLIT;  // No frame-size check here.
+
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+      FLAG_SKIP_INTRA_BESTINTER |
+      FLAG_SKIP_COMP_BESTINTRA |
+      FLAG_SKIP_COMP_REFMISMATCH |
+      FLAG_SKIP_INTRA_LOWVAR |
+      FLAG_EARLY_TERMINATE;
+
+    sf->use_rd_breakout = 1;
+    sf->adaptive_motion_search = 1;
+    sf->adaptive_pred_filter_type = 2;
+    sf->reference_masking = 1;
+    sf->auto_mv_step_size = 1;
+
+    sf->disable_filter_search_var_thresh = 200;
+    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+
+    sf->auto_min_max_partition_size = 1;
+    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
+    sf->adjust_partitioning_from_last_frame = 1;
+    sf->last_partitioning_redo_frequency = 3;
+
+    sf->use_uv_intra_rd_estimate = 1;
+    sf->skip_encode_sb = 1;
+    sf->use_lp32x32fdct = 1;
+    sf->subpel_iters_per_step = 1;
+    sf->use_fast_coef_updates = 2;
+
+    sf->adaptive_rd_thresh = 4;
+    sf->mode_skip_start = 6;
+  }
+  if (speed == 5) {
+    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+    sf->use_one_partition_size_always = 1;
+    sf->always_this_block_size = BLOCK_16X16;
+    sf->tx_size_search_method = frame_is_intra_only(cm) ?
+      USE_FULL_RD : USE_LARGESTALL;
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+                                 FLAG_SKIP_INTRA_BESTINTER |
+                                 FLAG_SKIP_COMP_BESTINTRA |
+                                 FLAG_SKIP_COMP_REFMISMATCH |
+                                 FLAG_SKIP_INTRA_LOWVAR |
+                                 FLAG_EARLY_TERMINATE;
+    sf->use_rd_breakout = 1;
+    sf->use_lp32x32fdct = 1;
+    sf->optimize_coefficients = 0;
+    sf->auto_mv_step_size = 1;
+    sf->reference_masking = 1;
+
+    sf->disable_split_mask = DISABLE_ALL_SPLIT;
+    sf->search_method = HEX;
+    sf->subpel_iters_per_step = 1;
+    sf->disable_split_var_thresh = 64;
+    sf->disable_filter_search_var_thresh = 500;
+    for (i = 0; i < TX_SIZES; i++) {  // Same masks for every index below TX_SIZES.
+      sf->intra_y_mode_mask[i] = INTRA_DC_ONLY;
+      sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
+    }
+    sf->use_fast_coef_updates = 2;
+    sf->adaptive_rd_thresh = 4;
+    sf->mode_skip_start = 6;
+  }
+}
 static void set_rt_speed_feature(VP9_COMMON *cm,
                                  SPEED_FEATURES *sf,
                                  int speed) {
@@ -629,6 +800,7 @@ static void set_rt_speed_feature(VP9_COMMON *cm,
     sf->adaptive_motion_search = 1;
     sf->adaptive_pred_filter_type = 2;
     sf->auto_mv_step_size = 1;
+    sf->reference_masking = 1;
 
     sf->disable_filter_search_var_thresh = 50;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
@@ -699,7 +871,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   // best quality defaults
   sf->RD = 1;
   sf->search_method = NSTEP;
-  sf->auto_filter = 1;
   sf->recode_loop = 1;
   sf->subpel_search_method = SUBPEL_TREE;
   sf->subpel_iters_per_step = 2;
@@ -744,179 +915,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
       cpi->diamond_search_sad = vp9_full_range_search;
       break;
     case 1:
-      sf->adaptive_rd_thresh = 1;
-      sf->recode_loop = (speed < 1);
-
-      if (speed == 1) {
-        sf->use_square_partition_only = !frame_is_intra_only(cm);
-        sf->less_rectangular_check  = 1;
-        sf->tx_size_search_method = frame_is_intra_only(cm)
-                                     ? USE_FULL_RD : USE_LARGESTALL;
-
-        if (MIN(cm->width, cm->height) >= 720)
-          sf->disable_split_mask = cm->show_frame ?
-              DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
-        else
-          sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
-
-        sf->use_rd_breakout = 1;
-        sf->adaptive_motion_search = 1;
-        sf->adaptive_pred_filter_type = 1;
-        sf->auto_mv_step_size = 1;
-        sf->adaptive_rd_thresh = 2;
-        sf->recode_loop = 2;
-        sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
-        sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
-        sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
-      }
-      if (speed == 2) {
-        sf->use_square_partition_only = !frame_is_intra_only(cm);
-        sf->less_rectangular_check  = 1;
-        sf->tx_size_search_method = frame_is_intra_only(cm)
-                                     ? USE_FULL_RD : USE_LARGESTALL;
-
-        if (MIN(cm->width, cm->height) >= 720)
-          sf->disable_split_mask = cm->show_frame ?
-              DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
-        else
-          sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
-
-
-        sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
-                                     FLAG_SKIP_INTRA_BESTINTER |
-                                     FLAG_SKIP_COMP_BESTINTRA |
-                                     FLAG_SKIP_INTRA_LOWVAR;
-
-        sf->use_rd_breakout = 1;
-        sf->adaptive_motion_search = 1;
-        sf->adaptive_pred_filter_type = 2;
-        sf->auto_mv_step_size = 1;
-
-        sf->disable_filter_search_var_thresh = 50;
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
-        sf->auto_min_max_partition_size = 1;
-        sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
-        sf->adjust_partitioning_from_last_frame = 1;
-        sf->last_partitioning_redo_frequency = 3;
-
-        sf->adaptive_rd_thresh = 2;
-        sf->recode_loop = 2;
-        sf->use_lp32x32fdct = 1;
-        sf->mode_skip_start = 11;
-        sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
-        sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
-        sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
-        sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
-      }
-      if (speed == 3) {
-        sf->use_square_partition_only = 1;
-        sf->tx_size_search_method = USE_LARGESTALL;
-
-        if (MIN(cm->width, cm->height) >= 720)
-          sf->disable_split_mask = DISABLE_ALL_SPLIT;
-        else
-          sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
-
-        sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
-                                     FLAG_SKIP_INTRA_BESTINTER |
-                                     FLAG_SKIP_COMP_BESTINTRA |
-                                     FLAG_SKIP_INTRA_LOWVAR;
-
-        sf->use_rd_breakout = 1;
-        sf->adaptive_motion_search = 1;
-        sf->adaptive_pred_filter_type = 2;
-        sf->auto_mv_step_size = 1;
-
-        sf->disable_filter_search_var_thresh = 100;
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
-        sf->auto_min_max_partition_size = 1;
-        sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
-        sf->adjust_partitioning_from_last_frame = 1;
-        sf->last_partitioning_redo_frequency = 3;
-
-        sf->use_uv_intra_rd_estimate = 1;
-        sf->skip_encode_sb = 1;
-        sf->use_lp32x32fdct = 1;
-        sf->subpel_iters_per_step = 1;
-        sf->use_fast_coef_updates = 2;
-
-        sf->adaptive_rd_thresh = 4;
-        sf->mode_skip_start = 6;
-      }
-      if (speed == 4) {
-        sf->use_square_partition_only = 1;
-        sf->tx_size_search_method = USE_LARGESTALL;
-        sf->disable_split_mask = DISABLE_ALL_SPLIT;
-
-        sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
-                                     FLAG_SKIP_INTRA_BESTINTER |
-                                     FLAG_SKIP_COMP_BESTINTRA |
-                                     FLAG_SKIP_COMP_REFMISMATCH |
-                                     FLAG_SKIP_INTRA_LOWVAR |
-                                     FLAG_EARLY_TERMINATE;
-
-        sf->use_rd_breakout = 1;
-        sf->adaptive_motion_search = 1;
-        sf->adaptive_pred_filter_type = 2;
-        sf->auto_mv_step_size = 1;
-
-        sf->disable_filter_search_var_thresh = 200;
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
-        sf->auto_min_max_partition_size = 1;
-        sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
-        sf->adjust_partitioning_from_last_frame = 1;
-        sf->last_partitioning_redo_frequency = 3;
-
-        sf->use_uv_intra_rd_estimate = 1;
-        sf->skip_encode_sb = 1;
-        sf->use_lp32x32fdct = 1;
-        sf->subpel_iters_per_step = 1;
-        sf->use_fast_coef_updates = 2;
-
-        sf->adaptive_rd_thresh = 4;
-        sf->mode_skip_start = 6;
-
-        /* sf->intra_y_mode_mask = INTRA_DC_ONLY;
-        sf->intra_uv_mode_mask = INTRA_DC_ONLY;
-        sf->search_method = BIGDIA;
-        sf->disable_split_var_thresh = 64;
-        sf->disable_filter_search_var_thresh = 64; */
-      }
-      if (speed == 5) {
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-        sf->use_one_partition_size_always = 1;
-        sf->always_this_block_size = BLOCK_16X16;
-        sf->tx_size_search_method = frame_is_intra_only(cm) ?
-                                     USE_FULL_RD : USE_LARGESTALL;
-        sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
-                                     FLAG_SKIP_INTRA_BESTINTER |
-                                     FLAG_SKIP_COMP_BESTINTRA |
-                                     FLAG_SKIP_COMP_REFMISMATCH |
-                                     FLAG_SKIP_INTRA_LOWVAR |
-                                     FLAG_EARLY_TERMINATE;
-        sf->use_rd_breakout = 1;
-        sf->use_lp32x32fdct = 1;
-        sf->optimize_coefficients = 0;
-        sf->auto_mv_step_size = 1;
-        // sf->reduce_first_step_size = 1;
-        // sf->reference_masking = 1;
-
-        sf->disable_split_mask = DISABLE_ALL_SPLIT;
-        sf->search_method = HEX;
-        sf->subpel_iters_per_step = 1;
-        sf->disable_split_var_thresh = 64;
-        sf->disable_filter_search_var_thresh = 500;
-        for (i = 0; i < TX_SIZES; i++) {
-          sf->intra_y_mode_mask[i] = INTRA_DC_ONLY;
-          sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
-        }
-        sf->use_fast_coef_updates = 2;
-        sf->adaptive_rd_thresh = 4;
-        sf->mode_skip_start = 6;
-      }
+      set_good_speed_feature(cm, sf, speed);
+      break;
       break;
     case 2:
       set_rt_speed_feature(cm, sf, speed);
@@ -1653,16 +1653,20 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
   cpi->bytes = 0;
 
   if (cpi->b_calculate_psnr) {
-    cpi->total_sq_error = 0.0;
-    cpi->total_sq_error2 = 0.0;
     cpi->total_y = 0.0;
     cpi->total_u = 0.0;
     cpi->total_v = 0.0;
     cpi->total = 0.0;
+    cpi->total_sq_error = 0;
+    cpi->total_samples = 0;
+
     cpi->totalp_y = 0.0;
     cpi->totalp_u = 0.0;
     cpi->totalp_v = 0.0;
     cpi->totalp = 0.0;
+    cpi->totalp_sq_error = 0;
+    cpi->totalp_samples = 0;
+
     cpi->tot_recode_hits = 0;
     cpi->summed_quality = 0;
     cpi->summed_weights = 0;
@@ -1897,21 +1901,20 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
                   / time_encoded;
 
       if (cpi->b_calculate_psnr) {
-        YV12_BUFFER_CONFIG *lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
-        double samples = 3.0 / 2 * cpi->count *
-                         lst_yv12->y_width * lst_yv12->y_height;
-        double total_psnr = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error);
-        double total_psnr2 = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error2);
-        double total_ssim = 100 * pow(cpi->summed_quality /
-                                      cpi->summed_weights, 8.0);
-        double total_ssimp = 100 * pow(cpi->summedp_quality /
-                                       cpi->summedp_weights, 8.0);
+        const double total_psnr = vp9_mse2psnr(cpi->total_samples, 255.0,
+                                               cpi->total_sq_error);
+        const double totalp_psnr = vp9_mse2psnr(cpi->totalp_samples, 255.0,
+                                                cpi->totalp_sq_error);
+        const double total_ssim = 100 * pow(cpi->summed_quality /
+                                                cpi->summed_weights, 8.0);
+        const double totalp_ssim = 100 * pow(cpi->summedp_quality /
+                                                cpi->summedp_weights, 8.0);
 
         fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
                 "VPXSSIM\tVPSSIMP\t  Time(ms)\n");
         fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
                 dr, cpi->total / cpi->count, total_psnr,
-                cpi->totalp / cpi->count, total_psnr2, total_ssim, total_ssimp,
+                cpi->totalp / cpi->count, totalp_psnr, total_ssim, totalp_ssim,
                 total_encode_time);
       }
 
@@ -2055,8 +2058,8 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
 }
 
 
-static uint64_t calc_plane_error(uint8_t *orig, int orig_stride,
-                                 uint8_t *recon, int recon_stride,
+static uint64_t calc_plane_error(const uint8_t *orig, int orig_stride,
+                                 const uint8_t *recon, int recon_stride,
                                  unsigned int cols, unsigned int rows) {
   unsigned int row, col;
   uint64_t total_sse = 0;
@@ -2073,8 +2076,8 @@ static uint64_t calc_plane_error(uint8_t *orig, int orig_stride,
     /* Handle odd-sized width */
     if (col < cols) {
       unsigned int border_row, border_col;
-      uint8_t *border_orig = orig;
-      uint8_t *border_recon = recon;
+      const uint8_t *border_orig = orig;
+      const uint8_t *border_recon = recon;
 
       for (border_row = 0; border_row < 16; border_row++) {
         for (border_col = col; border_col < cols; border_col++) {
@@ -2105,51 +2108,67 @@ static uint64_t calc_plane_error(uint8_t *orig, int orig_stride,
   return total_sse;
 }
 
+// Per-frame PSNR results. Index 0 holds the combined figure for all three
+// planes; indices 1..3 hold the Y, U and V planes respectively.
+typedef struct {
+  double psnr[4];       // total/y/u/v
+  uint64_t sse[4];      // total/y/u/v
+  uint32_t samples[4];  // total/y/u/v
+} PSNR_STATS;
+
+// Computes the sum of squared errors, the sample count and the PSNR between
+// frames a and b, for each plane and for the whole frame, and stores them in
+// *psnr. The dimensions of a are used for both frames.
+static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+                      PSNR_STATS *psnr) {
+  // Use the cropped picture dimensions for both the error and sample count.
+  const int widths[3] = {a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
+  const int heights[3] = {a->y_crop_height, a->uv_crop_height,
+                          a->uv_crop_height};
+  const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer};
+  const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
+  const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer};
+  const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride};
+  int i;
+  uint64_t total_sse = 0;
+  uint32_t total_samples = 0;
 
-static void generate_psnr_packet(VP9_COMP *cpi) {
-  YV12_BUFFER_CONFIG      *orig = cpi->Source;
-  YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
-  struct vpx_codec_cx_pkt  pkt;
-  uint64_t                 sse;
-  int                      i;
-  unsigned int             width = orig->y_crop_width;
-  unsigned int             height = orig->y_crop_height;
+  for (i = 0; i < 3; ++i) {
+    const int w = widths[i];
+    const int h = heights[i];
+    const uint32_t samples = w * h;
+    // calc_plane_error() returns uint64_t; keep the SSE integral.
+    const uint64_t sse = calc_plane_error(a_planes[i], a_strides[i],
+                                          b_planes[i], b_strides[i],
+                                          w, h);
+    psnr->sse[1 + i] = sse;
+    psnr->samples[1 + i] = samples;
+    psnr->psnr[1 + i] = vp9_mse2psnr(samples, 255.0, sse);
 
-  pkt.kind = VPX_CODEC_PSNR_PKT;
-  sse = calc_plane_error(orig->y_buffer, orig->y_stride,
-                         recon->y_buffer, recon->y_stride,
-                         width, height);
-  pkt.data.psnr.sse[0] = sse;
-  pkt.data.psnr.sse[1] = sse;
-  pkt.data.psnr.samples[0] = width * height;
-  pkt.data.psnr.samples[1] = width * height;
-
-  width = orig->uv_crop_width;
-  height = orig->uv_crop_height;
-
-  sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
-                         recon->u_buffer, recon->uv_stride,
-                         width, height);
-  pkt.data.psnr.sse[0] += sse;
-  pkt.data.psnr.sse[2] = sse;
-  pkt.data.psnr.samples[0] += width * height;
-  pkt.data.psnr.samples[2] = width * height;
-
-  sse = calc_plane_error(orig->v_buffer, orig->uv_stride,
-                         recon->v_buffer, recon->uv_stride,
-                         width, height);
-  pkt.data.psnr.sse[0] += sse;
-  pkt.data.psnr.sse[3] = sse;
-  pkt.data.psnr.samples[0] += width * height;
-  pkt.data.psnr.samples[3] = width * height;
-
-  for (i = 0; i < 4; i++)
-    pkt.data.psnr.psnr[i] = vp9_mse2psnr(pkt.data.psnr.samples[i], 255.0,
-                                         (double)pkt.data.psnr.sse[i]);
+    total_sse += sse;
+    total_samples += samples;
+  }
 
-  vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+  psnr->sse[0] = total_sse;
+  psnr->samples[0] = total_samples;
+  psnr->psnr[0] = vp9_mse2psnr(total_samples, 255.0, total_sse);
 }
 
+// Computes the PSNR of the frame to show against the source frame and
+// appends it to the encoder's output packet list as a VPX_CODEC_PSNR_PKT.
+static void generate_psnr_packet(VP9_COMP *cpi) {
+  struct vpx_codec_cx_pkt pkt;
+  int i;
+  PSNR_STATS psnr;
+  calc_psnr(cpi->Source, cpi->common.frame_to_show, &psnr);
+  for (i = 0; i < 4; ++i) {
+    pkt.data.psnr.samples[i] = psnr.samples[i];
+    pkt.data.psnr.sse[i] = psnr.sse[i];
+    pkt.data.psnr.psnr[i] = psnr.psnr[i];
+  }
+  pkt.kind = VPX_CODEC_PSNR_PKT;
+  vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+}
 
 int vp9_use_as_reference(VP9_PTR ptr, int ref_frame_flags) {
   VP9_COMP *cpi = (VP9_COMP *)(ptr);
@@ -3247,7 +3256,11 @@ static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
 
 static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
                         unsigned int *frame_flags) {
-  vp9_get_one_pass_params(cpi);
+  if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
+    vp9_get_one_pass_cbr_params(cpi);
+  } else {
+    vp9_get_one_pass_params(cpi);
+  }
   encode_frame_to_data_rate(cpi, size, dest, frame_flags);
 }
 
@@ -3404,7 +3417,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
 #endif
       frames_to_arf = cpi->rc.frames_till_gf_update_due;
 
-    assert(frames_to_arf < cpi->rc.frames_to_key);
+    assert(frames_to_arf <= cpi->rc.frames_to_key);
 
     if ((cpi->source = vp9_lookahead_peek(cpi->lookahead, frames_to_arf))) {
 #if CONFIG_MULTIPLE_ARF
@@ -3599,76 +3612,43 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
       cpi->count++;
 
       if (cpi->b_calculate_psnr) {
-        double ye, ue, ve;
-        double frame_psnr;
-        YV12_BUFFER_CONFIG      *orig = cpi->Source;
-        YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
-        YV12_BUFFER_CONFIG      *pp = &cm->post_proc_buffer;
-        int y_samples = orig->y_height * orig->y_width;
-        int uv_samples = orig->uv_height * orig->uv_width;
-        int t_samples = y_samples + 2 * uv_samples;
-        double sq_error;
-
-        ye = (double)calc_plane_error(orig->y_buffer, orig->y_stride,
-                              recon->y_buffer, recon->y_stride,
-                              orig->y_crop_width, orig->y_crop_height);
-
-        ue = (double)calc_plane_error(orig->u_buffer, orig->uv_stride,
-                              recon->u_buffer, recon->uv_stride,
-                              orig->uv_crop_width, orig->uv_crop_height);
-
-        ve = (double)calc_plane_error(orig->v_buffer, orig->uv_stride,
-                              recon->v_buffer, recon->uv_stride,
-                              orig->uv_crop_width, orig->uv_crop_height);
-
-        sq_error = ye + ue + ve;
-
-        frame_psnr = vp9_mse2psnr(t_samples, 255.0, sq_error);
-
-        cpi->total_y += vp9_mse2psnr(y_samples, 255.0, ye);
-        cpi->total_u += vp9_mse2psnr(uv_samples, 255.0, ue);
-        cpi->total_v += vp9_mse2psnr(uv_samples, 255.0, ve);
-        cpi->total_sq_error += sq_error;
-        cpi->total  += frame_psnr;
+        YV12_BUFFER_CONFIG *orig = cpi->Source;
+        YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+        YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
+        PSNR_STATS psnr;
+        calc_psnr(orig, recon, &psnr);
+
+        cpi->total += psnr.psnr[0];
+        cpi->total_y += psnr.psnr[1];
+        cpi->total_u += psnr.psnr[2];
+        cpi->total_v += psnr.psnr[3];
+        cpi->total_sq_error += psnr.sse[0];
+        cpi->total_samples += psnr.samples[0];
+
         {
-          double frame_psnr2, frame_ssim2 = 0;
-          double weight = 0;
+          PSNR_STATS psnr2;
+          double frame_ssim2 = 0, weight = 0;
 #if CONFIG_VP9_POSTPROC
           vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer,
                       cm->lf.filter_level * 10 / 6);
 #endif
           vp9_clear_system_state();
 
-          ye = (double)calc_plane_error(orig->y_buffer, orig->y_stride,
-                                pp->y_buffer, pp->y_stride,
-                                orig->y_crop_width, orig->y_crop_height);
-
-          ue = (double)calc_plane_error(orig->u_buffer, orig->uv_stride,
-                                pp->u_buffer, pp->uv_stride,
-                                orig->uv_crop_width, orig->uv_crop_height);
-
-          ve = (double)calc_plane_error(orig->v_buffer, orig->uv_stride,
-                                pp->v_buffer, pp->uv_stride,
-                                orig->uv_crop_width, orig->uv_crop_height);
-
-          sq_error = ye + ue + ve;
-
-          frame_psnr2 = vp9_mse2psnr(t_samples, 255.0, sq_error);
+          calc_psnr(orig, pp, &psnr2);
 
-          cpi->totalp_y += vp9_mse2psnr(y_samples, 255.0, ye);
-          cpi->totalp_u += vp9_mse2psnr(uv_samples, 255.0, ue);
-          cpi->totalp_v += vp9_mse2psnr(uv_samples, 255.0, ve);
-          cpi->total_sq_error2 += sq_error;
-          cpi->totalp  += frame_psnr2;
+          cpi->totalp += psnr2.psnr[0];
+          cpi->totalp_y += psnr2.psnr[1];
+          cpi->totalp_u += psnr2.psnr[2];
+          cpi->totalp_v += psnr2.psnr[3];
+          cpi->totalp_sq_error += psnr2.sse[0];
+          cpi->totalp_samples += psnr2.samples[0];
 
-          frame_ssim2 = vp9_calc_ssim(cpi->Source,
-                                      recon, 1, &weight);
+          frame_ssim2 = vp9_calc_ssim(orig, recon, 1, &weight);
 
           cpi->summed_quality += frame_ssim2 * weight;
           cpi->summed_weights += weight;
 
-          frame_ssim2 = vp9_calc_ssim(cpi->Source,
-                                      &cm->post_proc_buffer, 1, &weight);
+          frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, 1, &weight);
 
           cpi->summedp_quality += frame_ssim2 * weight;
           cpi->summedp_weights += weight;
@@ -3686,8 +3666,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
 
       if (cpi->b_calculate_ssimg) {
         double y, u, v, frame_all;
-        frame_all =  vp9_calc_ssimg(cpi->Source, cm->frame_to_show,
-                                    &y, &u, &v);
+        frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
         cpi->total_ssimg_y += y;
         cpi->total_ssimg_u += u;
         cpi->total_ssimg_v += v;
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 6dde3bea7..a5be0f424 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -232,57 +232,185 @@ typedef enum {
 } LAST_FRAME_PARTITION_METHOD;
 
 typedef struct {
+  // This flag refers to whether or not to perform rd optimization.
   int RD;
+
+  // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
   SEARCH_METHODS search_method;
-  int auto_filter;
+
+  // Recode_loop can be:
+  // 0 means we only encode a frame once
+  // 1 means we can re-encode based on bitrate constraints on any frame
+  // 2 means we can only recode gold, alt, and key frames.
   int recode_loop;
+
+  // Subpel_search_method can only be subpel_tree which does a subpixel
+  // logarithmic search that keeps stepping at 1/2 pixel units until
+  // you stop getting a gain, and then goes on to 1/4 and repeats
+  // the same process. Along the way it skips many diagonals.
   SUBPEL_SEARCH_METHODS subpel_search_method;
+
+  // Maximum number of steps in logarithmic subpel search before giving up.
   int subpel_iters_per_step;
+
+  // Thresh_mult is used to set a threshold for the rd score. A higher value
+  // means that we will accept the best mode so far more often. This number
+  // is used in combination with the current block size, and thresh_freq_fact
+  // to pick a threshold.
   int thresh_mult[MAX_MODES];
   int thresh_mult_sub8x8[MAX_REFS];
+
+  // This parameter controls the number of steps we'll do in a diamond
+  // search.
   int max_step_search_steps;
+
+  // This parameter controls which step in the n-step process we start at.
+  // It's changed adaptively based on circumstances.
   int reduce_first_step_size;
+
+  // If this is set to 1, we limit the motion search range to 2 times the
+  // largest motion vector found in the last frame.
   int auto_mv_step_size;
+
+  // Trellis (dynamic programming) optimization of quantized values (+1, 0).
   int optimize_coefficients;
+
+  // Always set to 0. If on, it enables 0 cost background transmission
+  // (except for the initial transmission of the segmentation). The feature is
+  // disabled because the addition of very large block sizes makes the
+  // backgrounds very cheap to encode, and the segmentation we have
+  // adds overhead.
   int static_segmentation;
+
+  // If 1 we iterate finding a best reference for 2 ref frames together - via
+  // a log search that iterates 4 times (check around mv for last for best
+  // error of combined predictor then check around mv for alt). If 0 we
+  // just use the best motion vector found for each frame by itself.
   int comp_inter_joint_search_thresh;
+
+  // This variable is used to cap the maximum number of times we skip testing a
+  // mode to be evaluated. A high value means we will be faster.
   int adaptive_rd_thresh;
+
+  // Enables skipping the reconstruction step (idct, recon) in the
+  // intermediate steps assuming the last frame didn't have too many intra
+  // blocks and the q is less than a threshold.
   int skip_encode_sb;
   int skip_encode_frame;
+
+  // This variable allows us to reuse the last frame's partition choices
+  // (64x64 v 32x32 etc) for this frame. It can be set to only use the last
+  // frame as a starting point in low motion scenes or always use it. If set
+  // we use last_partitioning_redo_frequency to determine how often to redo
+  // the partitioning from scratch. Adjust_partitioning_from_last_frame
+  // enables us to adjust up or down one partitioning from the last frame's
+  // partitioning.
   LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning;
+
+  // Determine which method we use to determine transform size. We can choose
+  // between options like full rd, largest for prediction size, largest
+  // for intra and model coefs for the rest.
   TX_SIZE_SEARCH_METHOD tx_size_search_method;
+
+  // Low precision 32x32 fdct keeps everything in 16 bits and thus is less
+  // precise but significantly faster than the non lp version.
   int use_lp32x32fdct;
+
+  // TODO(JBB): remove this as it's no longer used.
+
+  // If set partition size will always be always_this_block_size.
   int use_one_partition_size_always;
+
+  // Skip rectangular partition test when partition type none gives better
+  // rd than partition type split.
   int less_rectangular_check;
+
+  // Disable testing non square partitions. (eg 16x32)
   int use_square_partition_only;
+
+  // After looking at the first set of modes (set by index here), skip
+  // checking modes for reference frames that don't match the reference frame
+  // of the best so far.
   int mode_skip_start;
+
+  // TODO(JBB): Remove this.
   int reference_masking;
+
+  // Used in conjunction with use_one_partition_size_always.
   BLOCK_SIZE always_this_block_size;
+
+  // Sets min and max partition sizes for this 64x64 region based on the
+  // same superblock in last encoded frame, and the left and above neighbor
+  // in this block.
   int auto_min_max_partition_size;
+
+  // Min and max partition size we enable (block_size) as per auto
+  // min max, but also used by adjust partitioning, and pick_partitioning.
   BLOCK_SIZE min_partition_size;
   BLOCK_SIZE max_partition_size;
+
+  // Whether or not we allow partitions one smaller or one greater than the last
+  // frame's partitioning. Only used if use_lastframe_partitioning is set.
   int adjust_partitioning_from_last_frame;
+
+  // How frequently we redo the partitioning from scratch. Only used if
+  // use_lastframe_partitioning is set.
   int last_partitioning_redo_frequency;
+
+  // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
+  // it always, to allow it for only Last frame and Intra, disable it for all
+  // inter modes or to enable it always.
   int disable_split_mask;
+
+  // TODO(jbb): Remove this and everything that uses it. It's only valid if
+  // we were doing small to large partition checks. We currently do the
+  // reverse.
   int using_small_partition_info;
+
   // TODO(jingning): combine the related motion search speed features
+  // This allows us to use motion search at other sizes as a starting
+  // point for this motion search and limits the search range around it.
   int adaptive_motion_search;
+
+  // Allows sub 8x8 modes to use the prediction filter that was determined
+  // best for 8x8 mode. If set to 0 we always re check all the filters for
+  // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
+  // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
   int adaptive_pred_filter_type;
 
   // Implements various heuristics to skip searching modes
   // The heuristics selected are based on  flags
   // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
   unsigned int mode_search_skip_flags;
+
   // A source variance threshold below which the split mode is disabled
   unsigned int disable_split_var_thresh;
+
   // A source variance threshold below which filter search is disabled
   // Choose a very large value (UINT_MAX) to use 8-tap always
   unsigned int disable_filter_search_var_thresh;
+
+  // These bit masks allow you to enable or disable intra modes for each
+  // transform size separately.
   int intra_y_mode_mask[TX_SIZES];
   int intra_uv_mode_mask[TX_SIZES];
+
+  // This variable enables an early break out of mode testing if the model for
+  // rd built from the prediction signal indicates a value that's much
+  // higher than the best rd we've seen so far.
   int use_rd_breakout;
+
+  // This enables us to use an estimate for intra rd based on dc mode rather
+  // than choosing an actual uv mode in the stage of encoding before the actual
+  // final encode.
   int use_uv_intra_rd_estimate;
+
+  // This picks a loop filter strength by trying a small portion of the image
+  // with different values.
   int use_fast_lpf_pick;
+
+  // This feature limits the number of coefficient updates we actually do
+  // by only looking at counts from 1/2 the bands.
   int use_fast_coef_updates;  // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced
 } SPEED_FEATURES;
 
@@ -572,12 +700,16 @@ typedef struct VP9_COMP {
   double total_u;
   double total_v;
   double total;
-  double total_sq_error;
+  uint64_t total_sq_error;
+  uint64_t total_samples;
+
   double totalp_y;
   double totalp_u;
   double totalp_v;
   double totalp;
-  double total_sq_error2;
+  uint64_t totalp_sq_error;
+  uint64_t totalp_samples;
+
   int    bytes;
   double summed_quality;
   double summed_weights;
@@ -601,7 +733,7 @@ typedef struct VP9_COMP {
   int *mb_norm_activity_map;
   int output_partition;
 
-  /* force next frame to intra when kf_auto says so */
+  // Force next frame to intra when kf_auto says so.
   int force_next_frame_intra;
 
   int droppable;
@@ -643,7 +775,7 @@ typedef struct VP9_COMP {
   int64_t mode_test_hits[BLOCK_SIZES];
 #endif
 
-  /* Y,U,V,(A) */
+  // Y,U,V,(A)
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
   ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
 
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 728f238e4..aefef5319 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -866,36 +866,35 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
 
 void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
   // Update rate control heuristics
-  cpi->rc.projected_frame_size = (bytes_used << 3);
+  rc->projected_frame_size = (bytes_used << 3);
 
   // Post encode loop adjustment of Q prediction.
-  vp9_rc_update_rate_correction_factors(
-      cpi, (cpi->sf.recode_loop ||
+  vp9_rc_update_rate_correction_factors(cpi, (cpi->sf.recode_loop ||
             cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0);
 
   // Keep a record of last Q and ambient average Q.
   if (cm->frame_type == KEY_FRAME) {
-    cpi->rc.last_q[KEY_FRAME] = cm->base_qindex;
-    cpi->rc.avg_frame_qindex[KEY_FRAME] =
-        (2 + 3 * cpi->rc.avg_frame_qindex[KEY_FRAME] + cm->base_qindex) >> 2;
-  } else if (!cpi->rc.is_src_frame_alt_ref &&
+    rc->last_q[KEY_FRAME] = cm->base_qindex;
+    rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO(
+        3 * rc->avg_frame_qindex[KEY_FRAME] + cm->base_qindex, 2);
+  } else if (!rc->is_src_frame_alt_ref &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
-    cpi->rc.last_q[2] = cm->base_qindex;
-    cpi->rc.avg_frame_qindex[2] =
-        (2 + 3 * cpi->rc.avg_frame_qindex[2] + cm->base_qindex) >> 2;
+    rc->last_q[2] = cm->base_qindex;
+    rc->avg_frame_qindex[2] = ROUND_POWER_OF_TWO(
+        3 * rc->avg_frame_qindex[2] + cm->base_qindex, 2);
   } else {
-    cpi->rc.last_q[INTER_FRAME] = cm->base_qindex;
-    cpi->rc.avg_frame_qindex[INTER_FRAME] =
-        (2 + 3 * cpi->rc.avg_frame_qindex[INTER_FRAME] +
-         cm->base_qindex) >> 2;
-    cpi->rc.ni_frames++;
-    cpi->rc.tot_q += vp9_convert_qindex_to_q(cm->base_qindex);
-    cpi->rc.avg_q = cpi->rc.tot_q / (double)cpi->rc.ni_frames;
+    rc->last_q[INTER_FRAME] = cm->base_qindex;
+    rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO(
+        3 * rc->avg_frame_qindex[INTER_FRAME] + cm->base_qindex, 2);
+    rc->ni_frames++;
+    rc->tot_q += vp9_convert_qindex_to_q(cm->base_qindex);
+    rc->avg_q = rc->tot_q / (double)rc->ni_frames;
 
     // Calculate the average Q for normal inter frames (not key or GFU frames).
-    cpi->rc.ni_tot_qi += cm->base_qindex;
-    cpi->rc.ni_av_qi = cpi->rc.ni_tot_qi / cpi->rc.ni_frames;
+    rc->ni_tot_qi += cm->base_qindex;
+    rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
   }
 
   // Keep record of last boosted (KF/KF/ARF) Q value.
@@ -903,38 +902,34 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   // If all mbs in this group are skipped only update if the Q value is
   // better than that already stored.
   // This is used to help set quality in forced key frames to reduce popping
-  if ((cm->base_qindex < cpi->rc.last_boosted_qindex) ||
+  if ((cm->base_qindex < rc->last_boosted_qindex) ||
       ((cpi->static_mb_pct < 100) &&
        ((cm->frame_type == KEY_FRAME) || cpi->refresh_alt_ref_frame ||
-        (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)))) {
-    cpi->rc.last_boosted_qindex = cm->base_qindex;
+        (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
+    rc->last_boosted_qindex = cm->base_qindex;
   }
 
-  vp9_update_buffer_level(cpi, cpi->rc.projected_frame_size);
+  vp9_update_buffer_level(cpi, rc->projected_frame_size);
 
   // Rolling monitors of whether we are over or underspending used to help
   // regulate min and Max Q in two pass.
   if (cm->frame_type != KEY_FRAME) {
-    cpi->rc.rolling_target_bits =
-        ((cpi->rc.rolling_target_bits * 3) +
-         cpi->rc.this_frame_target + 2) / 4;
-    cpi->rc.rolling_actual_bits =
-        ((cpi->rc.rolling_actual_bits * 3) +
-         cpi->rc.projected_frame_size + 2) / 4;
-    cpi->rc.long_rolling_target_bits =
-        ((cpi->rc.long_rolling_target_bits * 31) +
-         cpi->rc.this_frame_target + 16) / 32;
-    cpi->rc.long_rolling_actual_bits =
-        ((cpi->rc.long_rolling_actual_bits * 31) +
-         cpi->rc.projected_frame_size + 16) / 32;
+    rc->rolling_target_bits = ROUND_POWER_OF_TWO(
+        rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+    rc->rolling_actual_bits = ROUND_POWER_OF_TWO(
+        rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+    rc->long_rolling_target_bits = ROUND_POWER_OF_TWO(
+        rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);
+    rc->long_rolling_actual_bits = ROUND_POWER_OF_TWO(
+        rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5);
   }
 
   // Actual bits spent
-  cpi->rc.total_actual_bits += cpi->rc.projected_frame_size;
+  rc->total_actual_bits += rc->projected_frame_size;
 
   // Debug stats
-  cpi->rc.total_target_vs_actual += (cpi->rc.this_frame_target -
-                                     cpi->rc.projected_frame_size);
+  rc->total_target_vs_actual += (rc->this_frame_target -
+                                 rc->projected_frame_size);
 
 #ifndef DISABLE_RC_LONG_TERM_MEM
   // Update bits left to the kf and gf groups to account for overshoot or
@@ -952,8 +947,8 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   }
 #endif
 
-  if (cpi->oxcf.play_alternate && cpi->refresh_alt_ref_frame
-      && (cm->frame_type != KEY_FRAME))
+  if (cpi->oxcf.play_alternate && cpi->refresh_alt_ref_frame &&
+      (cm->frame_type != KEY_FRAME))
     // Update the alternate reference frame stats as appropriate.
     update_alt_ref_frame_stats(cpi);
   else
@@ -961,14 +956,14 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
     update_golden_frame_stats(cpi);
 
   if (cm->frame_type == KEY_FRAME)
-    cpi->rc.frames_since_key = 0;
+    rc->frames_since_key = 0;
   if (cm->show_frame) {
-    cpi->rc.frames_since_key++;
-    cpi->rc.frames_to_key--;
+    rc->frames_since_key++;
+    rc->frames_to_key--;
   }
 }
 
 void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
   cpi->rc.frames_since_key++;
-  // cpi->rc.frames_to_key--;
+  cpi->rc.frames_to_key--;
 }
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 5ca34795d..b46e80891 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1418,9 +1418,10 @@ static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode,
   }
 }
 
-void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
-  x->e_mbd.mi_8x8[0]->mbmi.mode = mb;
-  x->e_mbd.mi_8x8[0]->mbmi.mv[0].as_int = mv->as_int;
+void vp9_set_mbmode_and_mvs(MACROBLOCKD *xd, MB_PREDICTION_MODE mode,
+                            const MV *mv) {
+  xd->mi_8x8[0]->mbmi.mode = mode;
+  xd->mi_8x8[0]->mbmi.mv[0].as_mv = *mv;
 }
 
 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
@@ -1638,6 +1639,12 @@ static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
     x->e_mbd.plane[0].pre[1] = orig_pre[1];
 }
 
+// Returns 1 when either the row or the column component of *mv has any of
+// its low four bits set, and 0 otherwise.
+static INLINE int mv_has_subpel(const MV *mv) {
+  return (mv->row & 0x0F) || (mv->col & 0x0F);
+}
+
 static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                     const TileInfo *const tile,
                                     BEST_SEG_INFO *bsi_buf, int filter_idx,
@@ -1931,15 +1936,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
 
         if (filter_idx > 0) {
           BEST_SEG_INFO *ref_bsi = bsi_buf;
-          subpelmv = (mode_mv[this_mode].as_mv.row & 0x0f) ||
-                     (mode_mv[this_mode].as_mv.col & 0x0f);
+          subpelmv = mv_has_subpel(&mode_mv[this_mode].as_mv);
           have_ref = mode_mv[this_mode].as_int ==
-                     ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
+                         ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
           if (has_second_rf) {
-            subpelmv |= (second_mode_mv[this_mode].as_mv.row & 0x0f) ||
-                        (second_mode_mv[this_mode].as_mv.col & 0x0f);
-            have_ref  &= second_mode_mv[this_mode].as_int ==
-                         ref_bsi->rdstat[i][mode_idx].mvs[1].as_int;
+            subpelmv |= mv_has_subpel(&second_mode_mv[this_mode].as_mv);
+            have_ref &= second_mode_mv[this_mode].as_int ==
+                            ref_bsi->rdstat[i][mode_idx].mvs[1].as_int;
           }
 
           if (filter_idx > 1 && !subpelmv && !have_ref) {
@@ -2276,14 +2279,14 @@ static void setup_pred_block(const MACROBLOCKD *xd,
   }
 }
 
-static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
-                               const TileInfo *const tile,
-                               int idx, MV_REFERENCE_FRAME frame_type,
-                               BLOCK_SIZE block_size,
-                               int mi_row, int mi_col,
-                               int_mv frame_nearest_mv[MAX_REF_FRAMES],
-                               int_mv frame_near_mv[MAX_REF_FRAMES],
-                               struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
+void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+                            const TileInfo *const tile,
+                            int idx, MV_REFERENCE_FRAME frame_type,
+                            BLOCK_SIZE block_size,
+                            int mi_row, int mi_col,
+                            int_mv frame_nearest_mv[MAX_REF_FRAMES],
+                            int_mv frame_near_mv[MAX_REF_FRAMES],
+                            struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
   VP9_COMMON *cm = &cpi->common;
   YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -2770,12 +2773,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   pred_exists = 0;
   // Are all MVs integer pel for Y and UV
-  intpel_mv = (mbmi->mv[0].as_mv.row & 15) == 0 &&
-      (mbmi->mv[0].as_mv.col & 15) == 0;
+  intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
   if (is_comp_pred)
-    intpel_mv &= (mbmi->mv[1].as_mv.row & 15) == 0 &&
-        (mbmi->mv[1].as_mv.col & 15) == 0;
-
+    intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
 
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
@@ -3175,17 +3175,29 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
   *returnrate = INT_MAX;
 
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
-                         ref_frame, block_size, mi_row, mi_col,
-                         frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+      vp9_setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
+                             ref_frame, block_size, mi_row, mi_col,
+                             frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
+  cpi->ref_frame_mask = 0;
+  for (ref_frame = LAST_FRAME;
+       ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
+    int i;
+    for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+      if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
+        cpi->ref_frame_mask |= (1 << ref_frame);
+        break;
+      }
+    }
+  }
+
   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
@@ -3235,8 +3247,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     // Skip if the current reference frame has been masked off
-    if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
-        (cpi->ref_frame_mask & (1 << ref_frame)))
+    if (cpi->ref_frame_mask & (1 << ref_frame) && this_mode != NEWMV)
       continue;
 
     // Test best rd so far against threshold for trying this mode.
@@ -3641,11 +3652,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  // If we are using reference masking and the set mask flag is set then
-  // create the reference frame mask.
-  if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
-    cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame[0]);
-
   // Flag all modes that have a distortion thats > 2x the best we found at
   // this level.
   for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
@@ -3797,15 +3803,27 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
-                         ref_frame, block_size, mi_row, mi_col,
-                         frame_mv[NEARESTMV], frame_mv[NEARMV],
-                         yv12_mb);
+      vp9_setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
+                             ref_frame, block_size, mi_row, mi_col,
+                             frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
+  // Flag references whose halved pred_mv_sad exceeds that of another one.
+  cpi->ref_frame_mask = 0;
+  for (ref_frame = LAST_FRAME;
+       ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
+    int i;
+    for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+      if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) {
+        cpi->ref_frame_mask |= (1 << ref_frame);
+        break;
+      }
+    }
+  }
+
   for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
@@ -3853,11 +3871,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         continue;
     }
 
-    // Skip if the current reference frame has been masked off
-    if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
-        (cpi->ref_frame_mask & (1 << ref_frame)))
-      continue;
-
     // Test best rd so far against threshold for trying this mode.
     if ((best_rd <
          ((int64_t)cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] *
@@ -4367,11 +4380,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  // If we are using reference masking and the set mask flag is set then
-  // create the reference frame mask.
-  if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
-    cpi->ref_frame_mask = ~(1 << vp9_ref_order[best_mode_index].ref_frame[0]);
-
   if (best_rd == INT64_MAX && bsize < BLOCK_8X8) {
     *returnrate = INT_MAX;
     *returndistortion = INT_MAX;
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index f0e8849c1..4b244a50a 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -27,6 +27,15 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi);
 
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
 
+// Invoked per enabled reference frame by the rd inter mode pickers.
+void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+                            const TileInfo *const tile, int idx,
+                            MV_REFERENCE_FRAME frame_type,
+                            BLOCK_SIZE block_size, int mi_row, int mi_col,
+                            int_mv frame_nearest_mv[MAX_REF_FRAMES],
+                            int_mv frame_near_mv[MAX_REF_FRAMES],
+                            struct buf_2d yv12_mb[4][MAX_MB_PLANE]);
+
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                int *r, int64_t *d, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd);
@@ -51,8 +60,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 
 void vp9_init_me_luts();
 
-void vp9_set_mbmode_and_mvs(MACROBLOCK *x,
-                            MB_PREDICTION_MODE mb, int_mv *mv);
+void vp9_set_mbmode_and_mvs(MACROBLOCKD *xd,
+                            MB_PREDICTION_MODE mode, const MV *mv);
 
 void vp9_get_entropy_contexts(TX_SIZE tx_size,
     ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16],