12 files changed, 233 insertions, 82 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index c6af7f6da..21e2b16a4 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -115,10 +115,6 @@ static INLINE int mi_width_log2(BLOCK_SIZE sb_type) {
   return mi_width_log2_lookup[sb_type];
 }
 
-static INLINE int mi_height_log2(BLOCK_SIZE sb_type) {
-  return mi_height_log2_lookup[sb_type];
-}
-
 // This structure now relates to 8x8 block regions.
 typedef struct {
   MB_PREDICTION_MODE mode, uv_mode;
diff --git a/vp9/common/vp9_common_data.c b/vp9/common/vp9_common_data.c
index 886c0afc6..a927823e0 100644
--- a/vp9/common/vp9_common_data.c
+++ b/vp9/common/vp9_common_data.c
@@ -26,8 +26,6 @@ const int mi_width_log2_lookup[BLOCK_SIZES] =
   {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
 const int num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
   {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8};
-const int mi_height_log2_lookup[BLOCK_SIZES] =
-  {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3};
 const int num_8x8_blocks_high_lookup[BLOCK_SIZES] =
   {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};
 
diff --git a/vp9/common/vp9_common_data.h b/vp9/common/vp9_common_data.h
index a367c65c6..5222d29c1 100644
--- a/vp9/common/vp9_common_data.h
+++ b/vp9/common/vp9_common_data.h
@@ -16,7 +16,6 @@
 extern const int b_width_log2_lookup[BLOCK_SIZES];
 extern const int b_height_log2_lookup[BLOCK_SIZES];
 extern const int mi_width_log2_lookup[BLOCK_SIZES];
-extern const int mi_height_log2_lookup[BLOCK_SIZES];
 extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZES];
 extern const int num_8x8_blocks_high_lookup[BLOCK_SIZES];
 extern const int num_4x4_blocks_high_lookup[BLOCK_SIZES];
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index b5ed959e1..f6fe4d3f1 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -346,7 +346,7 @@ static INLINE int partition_plane_context(
   const int bs = 1 << bsl;
   int above = 0, left = 0, i;
 
-  assert(mi_width_log2(bsize) == mi_height_log2(bsize));
+  assert(b_width_log2(bsize) == b_height_log2(bsize));
   assert(bsl >= 0);
 
   for (i = 0; i < bs; i++) {
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index d17952487..6894f553f 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1923,9 +1923,6 @@ static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
 
     vp9_zero(cpi->mb.pred_mv);
 
-    if (cpi->sf.reference_masking)
-      rd_pick_reference_frame(cpi, tile, mi_row, mi_col);
-
     if (cpi->sf.use_lastframe_partitioning ||
         cpi->sf.use_one_partition_size_always ) {
       const int idx_str = cm->mode_info_stride * mi_row + mi_col;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index c51ce9f54..075f0a7d4 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -2029,6 +2029,22 @@ void vp9_get_one_pass_params(VP9_COMP *cpi) {
   }
 }
 
+void vp9_get_one_pass_cbr_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  if ((cm->current_video_frame == 0 ||
+      cm->frame_flags & FRAMEFLAGS_KEY ||
+      cpi->rc.frames_to_key == 0 ||
+      (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) {
+    cm->frame_type = KEY_FRAME;
+    cpi->rc.frames_to_key = cpi->key_frame_frequency;
+  } else {
+    cm->frame_type = INTER_FRAME;
+  }
+  // Don't use gf_update by default in CBR mode.
+  cpi->rc.frames_till_gf_update_due = INT_MAX;
+  cpi->rc.baseline_gf_interval = INT_MAX;
+}
+
 void vp9_get_first_pass_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   if (!cpi->refresh_alt_ref_frame &&
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 43703c2c5..f89e4cb1c 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -22,6 +22,7 @@ void vp9_end_second_pass(VP9_COMP *cpi);
 
 void vp9_get_first_pass_params(VP9_COMP *cpi);
 void vp9_get_one_pass_params(VP9_COMP *cpi);
+void vp9_get_one_pass_cbr_params(VP9_COMP *cpi);
 void vp9_get_svc_params(VP9_COMP *cpi);
 
 #endif  // VP9_ENCODER_VP9_FIRSTPASS_H_
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 45fc13e69..d280055b7 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -699,7 +699,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   // best quality defaults
   sf->RD = 1;
   sf->search_method = NSTEP;
-  sf->auto_filter = 1;
   sf->recode_loop = 1;
   sf->subpel_search_method = SUBPEL_TREE;
   sf->subpel_iters_per_step = 2;
@@ -790,6 +789,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
         sf->use_rd_breakout = 1;
         sf->adaptive_motion_search = 1;
         sf->adaptive_pred_filter_type = 2;
+        sf->reference_masking = 1;
         sf->auto_mv_step_size = 1;
 
         sf->disable_filter_search_var_thresh = 50;
@@ -826,6 +826,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
         sf->use_rd_breakout = 1;
         sf->adaptive_motion_search = 1;
         sf->adaptive_pred_filter_type = 2;
+        sf->reference_masking = 1;
         sf->auto_mv_step_size = 1;
 
         sf->disable_filter_search_var_thresh = 100;
@@ -860,6 +861,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
         sf->use_rd_breakout = 1;
         sf->adaptive_motion_search = 1;
         sf->adaptive_pred_filter_type = 2;
+        sf->reference_masking = 1;
         sf->auto_mv_step_size = 1;
 
         sf->disable_filter_search_var_thresh = 200;
@@ -3250,7 +3252,11 @@ static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
 
 static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
                         unsigned int *frame_flags) {
-  vp9_get_one_pass_params(cpi);
+  if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
+    vp9_get_one_pass_cbr_params(cpi);
+  } else {
+    vp9_get_one_pass_params(cpi);
+  }
   encode_frame_to_data_rate(cpi, size, dest, frame_flags);
 }
 
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 6dde3bea7..4bcf10933 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -232,57 +232,185 @@ typedef enum {
 } LAST_FRAME_PARTITION_METHOD;
 
 typedef struct {
+  // This flag refers to whether or not to perform rd optimization.
   int RD;
+
+  // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
   SEARCH_METHODS search_method;
-  int auto_filter;
+
+  // Recode_loop can be:
+  // 0 means we only encode a frame once
+  // 1 means we can re-encode based on bitrate constraints on any frame
+  // 2 means we can only recode gold, alt, and key frames.
   int recode_loop;
+
+  // Subpel_search_method can only be subpel_tree which does a subpixel
+  // logarithmic search that keeps stepping at 1/2 pixel units until
+  // you stop getting a gain, and then goes on to 1/4 and repeats
+  // the same process. Along the way it skips many diagonals.
   SUBPEL_SEARCH_METHODS subpel_search_method;
+
+  // Maximum number of steps in logarithmic subpel search before giving up.
   int subpel_iters_per_step;
+
+  // Thresh_mult is used to set a threshold for the rd score. A higher value
+  // means that we will accept the best mode so far more often. This number
+  // is used in combination with the current block size, and thresh_freq_fact
+  // to pick a threshold.
   int thresh_mult[MAX_MODES];
   int thresh_mult_sub8x8[MAX_REFS];
+
+  // This parameter controls the number of steps we'll do in a diamond
+  // search.
   int max_step_search_steps;
+
+  // This parameter controls which step in the n-step process we start at.
+  // It's changed adaptively based on circumstances.
   int reduce_first_step_size;
+
+  // If this is set to 1, we limit the motion search range to 2 times the
+  // largest motion vector found in the last frame.
   int auto_mv_step_size;
+
+  // Trellis (dynamic programming) optimization of quantized values (+1, 0).
   int optimize_coefficients;
+
+  // Always set to 0. If on it enables 0 cost background transmission
+  // (except for the initial transmission of the segmentation). The feature is
+  // disabled because the addition of very large block sizes makes the
+  // backgrounds very cheap to encode, and the segmentation we have
+  // adds overhead.
   int static_segmentation;
+
+  // If 1 we iterate finding a best reference for 2 ref frames together - via
+  // a log search that iterates 4 times (check around mv for last for best
+  // error of combined predictor then check around mv for alt). If 0 we
+  // just use the best motion vector found for each frame by itself.
   int comp_inter_joint_search_thresh;
+
+  // This variable is used to cap the maximum number of times we skip testing a
+  // mode to be evaluated. A high value means we will be faster.
   int adaptive_rd_thresh;
+
+  // Enables skipping the reconstruction step (idct, recon) in the
+  // intermediate steps assuming the last frame didn't have too many intra
+  // blocks and the q is less than a threshold.
   int skip_encode_sb;
   int skip_encode_frame;
+
+  // This variable allows us to reuse the last frame's partition choices
+  // (64x64 v 32x32 etc) for this frame. It can be set to only use the last
+  // frame as a starting point in low motion scenes or always use it. If set
+  // we use last_partitioning_redo_frequency to determine how often to redo
+  // the partitioning from scratch. Adjust_partitioning_from_last_frame
+  // enables us to adjust up or down one partitioning from the last frame's
+  // partitioning.
   LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning;
+
+  // Determine which method we use to determine transform size. We can choose
+  // between options like full rd, largest for prediction size, largest
+  // for intra and model coefs for the rest.
   TX_SIZE_SEARCH_METHOD tx_size_search_method;
+
+  // Low precision 32x32 fdct keeps everything in 16 bits and thus is less
+  // precise but significantly faster than the non lp version.
   int use_lp32x32fdct;
+
+  // TODO(JBB): remove this as it's no longer used.
+
+  // If set partition size will always be always_this_block_size.
   int use_one_partition_size_always;
+
+  // Skip rectangular partition test when partition type none gives better
+  // rd than partition type split.
   int less_rectangular_check;
+
+  // Disable testing non square partitions. (eg 16x32)
   int use_square_partition_only;
+
+  // After looking at the first set of modes (set by index here), skip
+  // checking modes for reference frames that don't match the reference frame
+  // of the best so far.
   int mode_skip_start;
+
+  // TODO(JBB): Remove this.
   int reference_masking;
+
+  // Used in conjunction with use_one_partition_size_always.
   BLOCK_SIZE always_this_block_size;
+
+  // Sets min and max partition sizes for this 64x64 region based on the
+  // same superblock in last encoded frame, and the left and above neighbor
+  // in this block.
   int auto_min_max_partition_size;
+
+  // Min and max partition size we enable (block_size) as per auto
+  // min max, but also used by adjust partitioning, and pick_partitioning.
   BLOCK_SIZE min_partition_size;
   BLOCK_SIZE max_partition_size;
+
+  // Whether or not we allow partitions one smaller or one greater than the last
+  // frame's partitioning. Only used if use_lastframe_partitioning is set.
   int adjust_partitioning_from_last_frame;
+
+  // How frequently we redo the partitioning from scratch. Only used if
+  // use_lastframe_partitioning is set.
   int last_partitioning_redo_frequency;
+
+  // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
+  // it always, to allow it for only Last frame and Intra, disable it for all
+  // inter modes or to enable it always.
   int disable_split_mask;
+
+  // TODO(jbb): Remove this and everything that uses it. It's only valid if
+  // we were doing small to large partition checks. We currently do the
+  // reverse.
   int using_small_partition_info;
+
   // TODO(jingning): combine the related motion search speed features
+  // This allows us to use motion search at other sizes as a starting
+  // point for this motion search and limits the search range around it.
   int adaptive_motion_search;
+
+  // Allows sub 8x8 modes to use the prediction filter that was determined
+  // best for 8x8 mode. If set to 0 we always recheck all the filters for
+  // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
+  // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
   int adaptive_pred_filter_type;
 
   // Implements various heuristics to skip searching modes
   // The heuristics selected are based on  flags
   // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
   unsigned int mode_search_skip_flags;
+
   // A source variance threshold below which the split mode is disabled
   unsigned int disable_split_var_thresh;
+
   // A source variance threshold below which filter search is disabled
   // Choose a very large value (UINT_MAX) to use 8-tap always
   unsigned int disable_filter_search_var_thresh;
+
+  // These bit masks allow you to enable or disable intra modes for each
+  // transform size separately.
   int intra_y_mode_mask[TX_SIZES];
   int intra_uv_mode_mask[TX_SIZES];
+
+  // This variable enables an early break out of mode testing if the model for
+  // rd built from the prediction signal indicates a value that's much
+  // higher than the best rd we've seen so far.
   int use_rd_breakout;
+
+  // This enables us to use an estimate for intra rd based on dc mode rather
+  // than choosing an actual uv mode in the stage of encoding before the actual
+  // final encode.
   int use_uv_intra_rd_estimate;
+
+  // This picks a loop filter strength by trying a small portion of the image
+  // with different values.
   int use_fast_lpf_pick;
+
+  // This feature limits the number of coefficient updates we actually do
+  // by only looking at counts from 1/2 the bands.
   int use_fast_coef_updates;  // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced
 } SPEED_FEATURES;
 
@@ -601,7 +729,7 @@ typedef struct VP9_COMP {
   int *mb_norm_activity_map;
   int output_partition;
 
-  /* force next frame to intra when kf_auto says so */
+  // Force next frame to intra when kf_auto says so.
   int force_next_frame_intra;
 
   int droppable;
@@ -643,7 +771,7 @@ typedef struct VP9_COMP {
   int64_t mode_test_hits[BLOCK_SIZES];
 #endif
 
-  /* Y,U,V,(A) */
+  // Y,U,V,(A)
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
   ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
 
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 47c993f4b..728f238e4 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -430,10 +430,32 @@ static void calc_pframe_target_size(VP9_COMP *const cpi) {
   }
 }
 
+static double get_rate_correction_factor(const VP9_COMP *cpi) {
+  if (cpi->common.frame_type == KEY_FRAME) {
+    return cpi->rc.key_frame_rate_correction_factor;
+  } else {
+    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
+      return cpi->rc.gf_rate_correction_factor;
+    else
+      return cpi->rc.rate_correction_factor;
+  }
+}
+
+static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
+  if (cpi->common.frame_type == KEY_FRAME) {
+    cpi->rc.key_frame_rate_correction_factor = factor;
+  } else {
+    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
+      cpi->rc.gf_rate_correction_factor = factor;
+    else
+      cpi->rc.rate_correction_factor = factor;
+  }
+}
+
 void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
   const int q = cpi->common.base_qindex;
   int correction_factor = 100;
-  double rate_correction_factor;
+  double rate_correction_factor = get_rate_correction_factor(cpi);
   double adjustment_limit;
 
   int projected_size_based_on_q = 0;
@@ -441,15 +463,6 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
   // Clear down mmx registers to allow floating point in what follows
   vp9_clear_system_state();  // __asm emms;
 
-  if (cpi->common.frame_type == KEY_FRAME) {
-    rate_correction_factor = cpi->rc.key_frame_rate_correction_factor;
-  } else {
-    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
-      rate_correction_factor = cpi->rc.gf_rate_correction_factor;
-    else
-      rate_correction_factor = cpi->rc.rate_correction_factor;
-  }
-
   // Work out how big we would have expected the frame to be at this Q given
   // the current correction factor.
   // Stay in double to avoid int overflow when values are large
@@ -499,36 +512,16 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
       rate_correction_factor = MIN_BPB_FACTOR;
   }
 
-  if (cpi->common.frame_type == KEY_FRAME) {
-    cpi->rc.key_frame_rate_correction_factor = rate_correction_factor;
-  } else {
-    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
-      cpi->rc.gf_rate_correction_factor = rate_correction_factor;
-    else
-      cpi->rc.rate_correction_factor = rate_correction_factor;
-  }
+  set_rate_correction_factor(cpi, rate_correction_factor);
 }
 
 
 int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
                       int active_best_quality, int active_worst_quality) {
   int q = active_worst_quality;
-
-  int i;
   int last_error = INT_MAX;
-  int target_bits_per_mb;
-  int bits_per_mb_at_this_q;
-  double correction_factor;
-
-  // Select the appropriate correction factor based upon type of frame.
-  if (cpi->common.frame_type == KEY_FRAME) {
-    correction_factor = cpi->rc.key_frame_rate_correction_factor;
-  } else {
-    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
-      correction_factor = cpi->rc.gf_rate_correction_factor;
-    else
-      correction_factor = cpi->rc.rate_correction_factor;
-  }
+  int i, target_bits_per_mb, bits_per_mb_at_this_q;
+  const double correction_factor = get_rate_correction_factor(cpi);
 
   // Calculate required scaling factor based on target frame size and size of
   // frame produced using previous Q.
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 5ca34795d..81d47de92 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2276,14 +2276,14 @@ static void setup_pred_block(const MACROBLOCKD *xd,
   }
 }
 
-static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
-                               const TileInfo *const tile,
-                               int idx, MV_REFERENCE_FRAME frame_type,
-                               BLOCK_SIZE block_size,
-                               int mi_row, int mi_col,
-                               int_mv frame_nearest_mv[MAX_REF_FRAMES],
-                               int_mv frame_near_mv[MAX_REF_FRAMES],
-                               struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
+void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+                            const TileInfo *const tile,
+                            int idx, MV_REFERENCE_FRAME frame_type,
+                            BLOCK_SIZE block_size,
+                            int mi_row, int mi_col,
+                            int_mv frame_nearest_mv[MAX_REF_FRAMES],
+                            int_mv frame_near_mv[MAX_REF_FRAMES],
+                            struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
   VP9_COMMON *cm = &cpi->common;
   YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -3175,17 +3175,29 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
   *returnrate = INT_MAX;
 
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
-                         ref_frame, block_size, mi_row, mi_col,
-                         frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+      vp9_setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
+                             ref_frame, block_size, mi_row, mi_col,
+                             frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
+  cpi->ref_frame_mask = 0;
+  for (ref_frame = LAST_FRAME;
+       ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
+    int i;
+    for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+      if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
+        cpi->ref_frame_mask |= (1 << ref_frame);
+        break;
+      }
+    }
+  }
+
   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
@@ -3235,8 +3247,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     // Skip if the current reference frame has been masked off
-    if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
-        (cpi->ref_frame_mask & (1 << ref_frame)))
+    if (cpi->ref_frame_mask & (1 << ref_frame) && this_mode != NEWMV)
       continue;
 
     // Test best rd so far against threshold for trying this mode.
@@ -3641,11 +3652,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  // If we are using reference masking and the set mask flag is set then
-  // create the reference frame mask.
-  if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
-    cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame[0]);
-
   // Flag all modes that have a distortion thats > 2x the best we found at
   // this level.
   for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
@@ -3797,15 +3803,27 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
-                         ref_frame, block_size, mi_row, mi_col,
-                         frame_mv[NEARESTMV], frame_mv[NEARMV],
-                         yv12_mb);
+      vp9_setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
+                             ref_frame, block_size, mi_row, mi_col,
+                             frame_mv[NEARESTMV], frame_mv[NEARMV],
+                             yv12_mb);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
+  cpi->ref_frame_mask = 0;
+  for (ref_frame = LAST_FRAME;
+       ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
+    int i;
+    for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+      if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) {
+        cpi->ref_frame_mask |= (1 << ref_frame);
+        break;
+      }
+    }
+  }
+
   for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
@@ -3853,11 +3871,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         continue;
     }
 
-    // Skip if the current reference frame has been masked off
-    if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
-        (cpi->ref_frame_mask & (1 << ref_frame)))
-      continue;
-
     // Test best rd so far against threshold for trying this mode.
     if ((best_rd <
          ((int64_t)cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] *
@@ -4367,11 +4380,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  // If we are using reference masking and the set mask flag is set then
-  // create the reference frame mask.
-  if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
-    cpi->ref_frame_mask = ~(1 << vp9_ref_order[best_mode_index].ref_frame[0]);
-
   if (best_rd == INT64_MAX && bsize < BLOCK_8X8) {
     *returnrate = INT_MAX;
     *returndistortion = INT_MAX;
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index f0e8849c1..5732c2b2d 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -27,6 +27,15 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi);
 
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
 
+void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+                            const TileInfo *const tile,
+                            int idx, MV_REFERENCE_FRAME frame_type,
+                            BLOCK_SIZE block_size,
+                            int mi_row, int mi_col,
+                            int_mv frame_nearest_mv[MAX_REF_FRAMES],
+                            int_mv frame_near_mv[MAX_REF_FRAMES],
+                            struct buf_2d yv12_mb[4][MAX_MB_PLANE]);
+
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                int *r, int64_t *d, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd);