28 files changed, 511 insertions, 684 deletions
diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c
index 47ad8d8cc..0d6b41d15 100644
--- a/vp9/encoder/vp9_aq_complexity.c
+++ b/vp9/encoder/vp9_aq_complexity.c
@@ -47,11 +47,21 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
 
     // Use some of the segments for in frame Q adjustment.
     for (segment = 1; segment < 2; segment++) {
-      const int qindex_delta =
+      int qindex_delta =
           vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
                                      in_frame_q_adj_ratio[segment]);
-      vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
-      vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+
+      // For AQ mode 2, we dont allow Q0 in a segment if the base Q is not 0.
+      // Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment Q delta
+      // is sometimes applied without going back around the rd loop.
+      // This could lead to an illegal combination of partition size and q.
+      if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+        qindex_delta = -cm->base_qindex + 1;
+      }
+      if ((cm->base_qindex + qindex_delta) > 0) {
+        vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+        vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+      }
     }
   }
 }
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 8ef2b2eed..76f5e7bbe 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -20,7 +20,6 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_mvref_common.h"
-#include "vp9/common/vp9_pragmas.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_systemdependent.h"
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index c406860a0..c3cd93b78 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -109,6 +109,7 @@ struct macroblock {
   MV pred_mv[MAX_REF_FRAMES];
 
   void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
+  void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
 };
 
 #ifdef __cplusplus
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index ef33fcaf1..37fb0f3c6 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -76,18 +76,6 @@ typedef struct {
   unsigned int var;
 } diff;
 
-static void get_sse_sum_8x8(const uint8_t *src, int src_stride,
-                            const uint8_t *ref, int ref_stride,
-                            unsigned int *sse, int *sum) {
-  variance(src, src_stride, ref, ref_stride, 8, 8, sse, sum);
-}
-
-static void get_sse_sum_16x16(const uint8_t *src, int src_stride,
-                              const uint8_t *ref, int ref_stride,
-                              unsigned int *sse, int *sum) {
-  variance(src, src_stride, ref, ref_stride, 16, 16, sse, sum);
-}
-
 static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi,
                                               const struct buf_2d *ref,
                                               BLOCK_SIZE bs) {
@@ -490,8 +478,8 @@ static void choose_partitioning(VP9_COMP *cpi,
         unsigned int sse = 0;
         int sum = 0;
         if (x_idx < pixels_wide && y_idx < pixels_high)
-          get_sse_sum_8x8(s + y_idx * sp + x_idx, sp,
-                          d + y_idx * dp + x_idx, dp, &sse, &sum);
+          vp9_get8x8var(s + y_idx * sp + x_idx, sp,
+                        d + y_idx * dp + x_idx, dp, &sse, &sum);
         fill_variance(sse, sum, 64, &vst->split[k].part_variances.none);
       }
     }
@@ -1226,9 +1214,9 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
         int b_offset = b_mi_row * MI_SIZE * src_stride +
                        b_mi_col * MI_SIZE;
 
-        get_sse_sum_16x16(src + b_offset, src_stride,
-                          pre_src + b_offset, pre_stride,
-                          &d16[j].sse, &d16[j].sum);
+        vp9_get16x16var(src + b_offset, src_stride,
+                        pre_src + b_offset, pre_stride,
+                        &d16[j].sse, &d16[j].sum);
 
         d16[j].var = d16[j].sse -
             (((uint32_t)d16[j].sum * d16[j].sum) >> 8);
@@ -1303,14 +1291,14 @@ static int is_background(VP9_COMP *cpi, const TileInfo *const tile,
   if (row8x8_remaining >= MI_BLOCK_SIZE &&
       col8x8_remaining >= MI_BLOCK_SIZE) {
     this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride,
-                                            pre, pre_stride, 0x7fffffff);
+                                            pre, pre_stride);
     threshold = (1 << 12);
   } else {
     int r, c;
     for (r = 0; r < row8x8_remaining; r += 2)
       for (c = 0; c < col8x8_remaining; c += 2)
-        this_sad += cpi->fn_ptr[BLOCK_16X16].sdf(src, src_stride, pre,
-                                                 pre_stride, 0x7fffffff);
+        this_sad += cpi->fn_ptr[BLOCK_16X16].sdf(src, src_stride,
+                                                 pre, pre_stride);
     threshold = (row8x8_remaining * col8x8_remaining) << 6;
   }
 
@@ -1318,7 +1306,8 @@ static int is_background(VP9_COMP *cpi, const TileInfo *const tile,
   return x->in_static_area;
 }
 
-static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8) {
+static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8,
+                         const int motion_thresh) {
   const int mis = cm->mi_stride;
   int block_row, block_col;
 
@@ -1327,8 +1316,8 @@ static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8) {
       for (block_col = 0; block_col < 8; ++block_col) {
         const MODE_INFO *prev_mi = prev_mi_8x8[block_row * mis + block_col];
         if (prev_mi) {
-          if (abs(prev_mi->mbmi.mv[0].as_mv.row) >= 8 ||
-              abs(prev_mi->mbmi.mv[0].as_mv.col) >= 8)
+          if (abs(prev_mi->mbmi.mv[0].as_mv.row) > motion_thresh ||
+              abs(prev_mi->mbmi.mv[0].as_mv.col) > motion_thresh)
             return 1;
         }
       }
@@ -2324,7 +2313,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
             || cpi->rc.is_src_frame_alt_ref
             || ((sf->use_lastframe_partitioning ==
                  LAST_FRAME_PARTITION_LOW_MOTION) &&
-                 sb_has_motion(cm, prev_mi))) {
+                 sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))) {
           // If required set upper and lower partition size limits
           if (sf->auto_min_max_partition_size) {
             set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
@@ -2337,7 +2326,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
                             cpi->pc_root);
         } else {
           if (sf->constrain_copy_partition &&
-              sb_has_motion(cm, prev_mi))
+              sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))
             constrain_copy_partitioning(cpi, tile, mi, prev_mi,
                                         mi_row, mi_col, BLOCK_16X16);
           else
@@ -2384,7 +2373,7 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
   if (lossless) {
     // printf("Switching to lossless\n");
     cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
-    cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add;
+    cpi->mb.itxm_add = vp9_iwht4x4_add;
     cpi->mb.optimize = 0;
     cpi->common.lf.filter_level = 0;
     cpi->zbin_mode_boost_enabled = 0;
@@ -2392,7 +2381,7 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
   } else {
     // printf("Not lossless\n");
     cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
-    cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add;
+    cpi->mb.itxm_add = vp9_idct4x4_add;
   }
 }
 
@@ -2869,7 +2858,7 @@ static void nonrd_use_partition(VP9_COMP *cpi,
       if (mi_row + hbs < cm->mi_rows) {
         nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col,
                             &rate, &dist, subsize);
-        pc_tree->horizontal[1].mic.mbmi = mi[0]->mbmi;
+        pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
         if (rate != INT_MAX && dist != INT64_MAX &&
             *totrate != INT_MAX && *totdist != INT64_MAX) {
           *totrate += rate;
@@ -3368,7 +3357,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
       vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
     } else {
       mbmi->skip = 1;
-      if (output_enabled)
+      if (output_enabled &&
+          !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
         cm->counts.skip[vp9_get_skip_context(xd)][1]++;
       reset_skip_context(xd, MAX(bsize, BLOCK_8X8));
     }
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 3b231b7f2..8581e6117 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -406,7 +406,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
       // this is like vp9_short_idct4x4 but has a special case around eob<=1
       // which is significant (not just an optimization) for the lossless
       // case.
-      xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+      x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
       break;
     default:
       assert(0 && "Invalid transform size");
@@ -428,7 +428,7 @@ static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
   vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 
   if (p->eobs[block] > 0)
-    xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+    x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
 }
 
 void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
@@ -574,7 +574,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
           // this is like vp9_short_idct4x4 but has a special case around eob<=1
           // which is significant (not just an optimization) for the lossless
           // case.
-          xd->itxm_add(dqcoeff, dst, dst_stride, *eob);
+          x->itxm_add(dqcoeff, dst, dst_stride, *eob);
         else
           vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type);
       }
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 645503103..c955d27c0 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -602,9 +602,9 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
     // is set.
     cpi->oxcf.worst_allowed_q = 0;
     cpi->oxcf.best_allowed_q = 0;
-    cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add;
+    cpi->mb.itxm_add = vp9_iwht4x4_add;
   } else {
-    cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add;
+    cpi->mb.itxm_add = vp9_idct4x4_add;
   }
   rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
   cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
@@ -632,9 +632,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
     cpi->oxcf.maximum_buffer_size     = 240000;
   }
 
-  // Convert target bandwidth from Kbit/s to Bit/s
-  cpi->oxcf.target_bandwidth       *= 1000;
-
   cpi->oxcf.starting_buffer_level =
       vp9_rescale(cpi->oxcf.starting_buffer_level,
                   cpi->oxcf.target_bandwidth, 1000);
@@ -2103,8 +2100,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 
   vp9_clear_system_state();
 
-  vp9_zero(cpi->rd.tx_select_threshes);
-
 #if CONFIG_VP9_POSTPROC
   if (cpi->oxcf.noise_sensitivity > 0) {
     int l = 0;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index fce5b28f3..dc3832b16 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -64,8 +64,7 @@
 #define MIN_GF_INTERVAL             4
 #endif
 
-
-// #define LONG_TERM_VBR_CORRECTION
+#define LONG_TERM_VBR_CORRECTION
 
 static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
   YV12_BUFFER_CONFIG temp = *a;
@@ -170,7 +169,6 @@ static void zero_stats(FIRSTPASS_STATS *section) {
   section->intra_error = 0.0;
   section->coded_error = 0.0;
   section->sr_coded_error = 0.0;
-  section->ssim_weighted_pred_err = 0.0;
   section->pcnt_inter  = 0.0;
   section->pcnt_motion  = 0.0;
   section->pcnt_second_ref = 0.0;
@@ -195,7 +193,6 @@ static void accumulate_stats(FIRSTPASS_STATS *section,
   section->intra_error += frame->intra_error;
   section->coded_error += frame->coded_error;
   section->sr_coded_error += frame->sr_coded_error;
-  section->ssim_weighted_pred_err += frame->ssim_weighted_pred_err;
   section->pcnt_inter  += frame->pcnt_inter;
   section->pcnt_motion += frame->pcnt_motion;
   section->pcnt_second_ref += frame->pcnt_second_ref;
@@ -218,7 +215,6 @@ static void subtract_stats(FIRSTPASS_STATS *section,
   section->intra_error -= frame->intra_error;
   section->coded_error -= frame->coded_error;
   section->sr_coded_error -= frame->sr_coded_error;
-  section->ssim_weighted_pred_err -= frame->ssim_weighted_pred_err;
   section->pcnt_inter  -= frame->pcnt_inter;
   section->pcnt_motion -= frame->pcnt_motion;
   section->pcnt_second_ref -= frame->pcnt_second_ref;
@@ -242,7 +238,6 @@ static void avg_stats(FIRSTPASS_STATS *section) {
   section->intra_error /= section->count;
   section->coded_error /= section->count;
   section->sr_coded_error /= section->count;
-  section->ssim_weighted_pred_err /= section->count;
   section->pcnt_inter  /= section->count;
   section->pcnt_second_ref /= section->count;
   section->pcnt_neutral /= section->count;
@@ -259,86 +254,18 @@ static void avg_stats(FIRSTPASS_STATS *section) {
 
 // Calculate a modified Error used in distributing bits between easier and
 // harder frames.
-static double calculate_modified_err(const VP9_COMP *cpi,
+static double calculate_modified_err(const TWO_PASS *twopass,
+                                     const VP9EncoderConfig *oxcf,
                                      const FIRSTPASS_STATS *this_frame) {
-  const TWO_PASS *twopass = &cpi->twopass;
-  const SVC *const svc = &cpi->svc;
-  const FIRSTPASS_STATS *stats;
-  double av_err;
-  double modified_error;
-
-  if (svc->number_spatial_layers > 1 &&
-      svc->number_temporal_layers == 1) {
-    twopass = &svc->layer_context[svc->spatial_layer_id].twopass;
-  }
-
-  stats = &twopass->total_stats;
-  av_err = stats->ssim_weighted_pred_err / stats->count;
-  modified_error = av_err * pow(this_frame->ssim_weighted_pred_err /
-                   DOUBLE_DIVIDE_CHECK(av_err),
-                   cpi->oxcf.two_pass_vbrbias / 100.0);
-
+  const FIRSTPASS_STATS *const stats = &twopass->total_stats;
+  const double av_err = stats->coded_error / stats->count;
+  const double modified_error = av_err *
+      pow(this_frame->coded_error / DOUBLE_DIVIDE_CHECK(av_err),
+          oxcf->two_pass_vbrbias / 100.0);
   return fclamp(modified_error,
                 twopass->modified_error_min, twopass->modified_error_max);
 }
 
-static const double weight_table[256] = {
-  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
-  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
-  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
-  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
-  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.031250, 0.062500,
-  0.093750, 0.125000, 0.156250, 0.187500, 0.218750, 0.250000, 0.281250,
-  0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750, 0.500000,
-  0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
-  0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500,
-  0.968750, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000
-};
-
-static double simple_weight(const YV12_BUFFER_CONFIG *buf) {
-  int i, j;
-  double sum = 0.0;
-  const int w = buf->y_crop_width;
-  const int h = buf->y_crop_height;
-  const uint8_t *row = buf->y_buffer;
-
-  for (i = 0; i < h; ++i) {
-    const uint8_t *pixel = row;
-    for (j = 0; j < w; ++j)
-      sum += weight_table[*pixel++];
-    row += buf->y_stride;
-  }
-
-  return MAX(0.1, sum / (w * h));
-}
-
 // This function returns the maximum target rate per frame.
 static int frame_max_bits(const RATE_CONTROL *rc,
                           const VP9EncoderConfig *oxcf) {
@@ -533,6 +460,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
   TWO_PASS *twopass = &cpi->twopass;
   const MV zero_mv = {0, 0};
   const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+  FIRSTPASS_STATS fps;
 
   vp9_clear_system_state();
 
@@ -830,14 +758,11 @@ void vp9_first_pass(VP9_COMP *cpi) {
 
   vp9_clear_system_state();
   {
-    FIRSTPASS_STATS fps;
-
     fps.frame = cm->current_video_frame;
     fps.spatial_layer_id = cpi->svc.spatial_layer_id;
     fps.intra_error = (double)(intra_error >> 8);
     fps.coded_error = (double)(coded_error >> 8);
     fps.sr_coded_error = (double)(sr_coded_error >> 8);
-    fps.ssim_weighted_pred_err = fps.coded_error * simple_weight(cpi->Source);
     fps.count = 1.0;
     fps.pcnt_inter = (double)intercount / cm->MBs;
     fps.pcnt_second_ref = (double)second_ref_count / cm->MBs;
@@ -871,8 +796,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
     fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start);
 
     // Don't want to do output stats with a stack variable!
-    twopass->this_frame_stats = fps;
-    output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
+    output_stats(&fps, cpi->output_pkt_list);
     accumulate_stats(&twopass->total_stats, &fps);
   }
 
@@ -880,9 +804,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
   // the prediction is good enough... but also don't allow it to lag too far.
   if ((twopass->sr_update_lag > 3) ||
       ((cm->current_video_frame > 0) &&
-       (twopass->this_frame_stats.pcnt_inter > 0.20) &&
-       ((twopass->this_frame_stats.intra_error /
-         DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
+       (fps.pcnt_inter > 0.20) &&
+       ((fps.intra_error /
+         DOUBLE_DIVIDE_CHECK(fps.coded_error)) > 2.0))) {
     if (gld_yv12 != NULL) {
       vp8_yv12_copy_frame(lst_yv12, gld_yv12);
     }
@@ -1043,25 +967,19 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
   // Scan the first pass file and calculate a modified total error based upon
   // the bias/power function used to allocate bits.
   {
-    const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
-    FIRSTPASS_STATS this_frame;
-    const double av_error = stats->ssim_weighted_pred_err /
-                                DOUBLE_DIVIDE_CHECK(stats->count);
-
-
-    twopass->modified_error_total = 0.0;
-    twopass->modified_error_min =
-        (av_error * oxcf->two_pass_vbrmin_section) / 100;
-    twopass->modified_error_max =
-        (av_error * oxcf->two_pass_vbrmax_section) / 100;
-
-    while (input_stats(twopass, &this_frame) != EOF) {
-      twopass->modified_error_total +=
-          calculate_modified_err(cpi, &this_frame);
+    const double avg_error = stats->coded_error /
+                             DOUBLE_DIVIDE_CHECK(stats->count);
+    const FIRSTPASS_STATS *s = twopass->stats_in;
+    double modified_error_total = 0.0;
+    twopass->modified_error_min = (avg_error *
+                                      oxcf->two_pass_vbrmin_section) / 100;
+    twopass->modified_error_max = (avg_error *
+                                      oxcf->two_pass_vbrmax_section) / 100;
+    while (s < twopass->stats_in_end) {
+      modified_error_total += calculate_modified_err(twopass, oxcf, s);
+      ++s;
     }
-    twopass->modified_error_left = twopass->modified_error_total;
-
-    reset_fpf_position(twopass, start_pos);
+    twopass->modified_error_left = modified_error_total;
   }
 
   // Reset the vbr bits off target counter
@@ -1182,16 +1100,17 @@ static void accumulate_frame_motion_stats(
 }
 
 // Calculate a baseline boost number for the current frame.
-static double calc_frame_boost(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame,
+static double calc_frame_boost(const TWO_PASS *twopass,
+                               const FIRSTPASS_STATS *this_frame,
                                double this_frame_mv_in_out) {
   double frame_boost;
 
   // Underlying boost factor is based on inter intra error ratio.
-  if (this_frame->intra_error > cpi->twopass.gf_intra_err_min)
+  if (this_frame->intra_error > twopass->gf_intra_err_min)
     frame_boost = (IIFACTOR * this_frame->intra_error /
                    DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
   else
-    frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min /
+    frame_boost = (IIFACTOR * twopass->gf_intra_err_min /
                    DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
 
   // Increase boost for frames where new data coming into frame (e.g. zoom out).
@@ -1244,8 +1163,8 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
                           ? MIN_DECAY_FACTOR : decay_accumulator;
     }
 
-    boost_score += (decay_accumulator *
-                    calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
+    boost_score += decay_accumulator * calc_frame_boost(twopass, &this_frame,
+                                                        this_frame_mv_in_out);
   }
 
   *f_boost = (int)boost_score;
@@ -1281,8 +1200,8 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
                               ? MIN_DECAY_FACTOR : decay_accumulator;
     }
 
-    boost_score += (decay_accumulator *
-                    calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
+    boost_score += decay_accumulator * calc_frame_boost(twopass, &this_frame,
+                                                        this_frame_mv_in_out);
   }
   *b_boost = (int)boost_score;
 
@@ -1500,6 +1419,66 @@ static int calculate_boost_bits(int frame_count,
   return MAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), 0);
 }
 
+static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
+                                   double group_error, int gf_arf_bits) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *twopass = &cpi->twopass;
+  FIRSTPASS_STATS frame_stats;
+  int i;
+  int group_frame_index = 1;
+  int target_frame_size;
+  int key_frame;
+  const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
+  int64_t total_group_bits = gf_group_bits;
+  double modified_err = 0.0;
+  double err_fraction;
+
+  key_frame = cpi->common.frame_type == KEY_FRAME ||
+              vp9_is_upper_layer_key_frame(cpi);
+
+  // For key frames the frame target rate is already set and it
+  // is also the golden frame.
+  // NOTE: We dont bother to check for the special case of ARF overlay
+  // frames here, as there is clamping code for this in the function
+  // vp9_rc_clamp_pframe_target_size(), which applies to one and two pass
+  // encodes.
+  if (!key_frame) {
+    twopass->gf_group_bit_allocation[0] = gf_arf_bits;
+
+    // Step over the golden frame / overlay frame
+    if (EOF == input_stats(twopass, &frame_stats))
+      return;
+  }
+
+  // Store the bits to spend on the ARF if there is one.
+  if (rc->source_alt_ref_pending) {
+    twopass->gf_group_bit_allocation[group_frame_index++] = gf_arf_bits;
+  }
+
+  // Deduct the boost bits for arf or gf if it is not a key frame.
+  if (rc->source_alt_ref_pending || !key_frame)
+    total_group_bits -= gf_arf_bits;
+
+  // Allocate bits to the other frames in the group.
+  for (i = 0; i < rc->baseline_gf_interval - 1; ++i) {
+    if (EOF == input_stats(twopass, &frame_stats))
+      break;
+
+    modified_err = calculate_modified_err(twopass, oxcf, &frame_stats);
+
+    if (group_error > 0)
+      err_fraction = modified_err / DOUBLE_DIVIDE_CHECK(group_error);
+    else
+      err_fraction = 0.0;
+
+    target_frame_size = (int)((double)total_group_bits * err_fraction);
+    target_frame_size = clamp(target_frame_size, 0,
+                              MIN(max_bits, (int)total_group_bits));
+
+    twopass->gf_group_bit_allocation[group_frame_index++] = target_frame_size;
+  }
+}
 
 // Analyse and define a gf/arf group.
 static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
@@ -1509,6 +1488,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   FIRSTPASS_STATS next_frame;
   const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
   int i;
+
   double boost_score = 0.0;
   double old_boost_score = 0.0;
   double gf_group_err = 0.0;
@@ -1532,14 +1512,24 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   int b_boost = 0;
   int flash_detected;
   int active_max_gf_interval;
+  int64_t gf_group_bits;
+  double gf_group_error_left;
+  int gf_arf_bits;
+
+  // Reset the GF group data structures unless this is a key
+  // frame in which case it will already have been done.
+  if (cpi->common.frame_type != KEY_FRAME) {
+    twopass->gf_group_index = 0;
+    vp9_zero(twopass->gf_group_bit_allocation);
+  }
 
   vp9_clear_system_state();
   vp9_zero(next_frame);
 
-  twopass->gf_group_bits = 0;
+  gf_group_bits = 0;
 
   // Load stats for the current frame.
-  mod_frame_err = calculate_modified_err(cpi, this_frame);
+  mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame);
 
   // Note the error of the frame at the start of the group. This will be
   // the GF frame error if we code a normal gf.
@@ -1571,7 +1561,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     ++i;
 
     // Accumulate error score of frames in this gf group.
-    mod_frame_err = calculate_modified_err(cpi, this_frame);
+    mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame);
     gf_group_err += mod_frame_err;
 
     if (EOF == input_stats(twopass, &next_frame))
@@ -1610,12 +1600,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     }
 
     // Calculate a boost number for this frame.
-    boost_score += (decay_accumulator *
-       calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out));
+    boost_score += decay_accumulator * calc_frame_boost(twopass, &next_frame,
+                                                        this_frame_mv_in_out);
 
     // Break out conditions.
     if (
-      // Break at cpi->max_gf_interval unless almost totally static.
+      // Break at active_max_gf_interval unless almost totally static.
       (i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) ||
       (
         // Don't break out with a very short interval.
@@ -1646,7 +1636,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
         break;
 
       if (i < rc->frames_to_key) {
-        mod_frame_err = calculate_modified_err(cpi, this_frame);
+        mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame);
         gf_group_err += mod_frame_err;
       }
     }
@@ -1742,7 +1732,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   reset_fpf_position(twopass, start_pos);
 
   // Calculate the bits to be allocated to the gf/arf group as a whole
-  twopass->gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
+  gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
 
   // Calculate the extra bits to be used for boosted frame(s)
   {
@@ -1753,19 +1743,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     boost = clamp(boost, 125, (rc->baseline_gf_interval + 1) * 200);
 
     // Calculate the extra bits to be used for boosted frame(s)
-    twopass->gf_bits = calculate_boost_bits(rc->baseline_gf_interval,
-                                            boost, twopass->gf_group_bits);
-
-
-    // For key frames the frame target rate is set already.
-    // NOTE: We dont bother to check for the special case of ARF overlay
-    // frames here, as there is clamping code for this in the function
-    // vp9_rc_clamp_pframe_target_size(), which applies to one and two pass
-    // encodes.
-    if (cpi->common.frame_type != KEY_FRAME &&
-        !vp9_is_upper_layer_key_frame(cpi)) {
-      vp9_rc_set_frame_target(cpi, twopass->gf_bits);
-    }
+    gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
+                                       boost, gf_group_bits);
   }
 
   // Adjust KF group bits and error remaining.
@@ -1778,14 +1757,19 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // For normal GFs remove the score for the GF itself unless this is
   // also a key frame in which case it has already been accounted for.
   if (rc->source_alt_ref_pending) {
-    twopass->gf_group_error_left = (int64_t)(gf_group_err - mod_frame_err);
+    gf_group_error_left = gf_group_err - mod_frame_err;
   } else if (cpi->common.frame_type != KEY_FRAME) {
-    twopass->gf_group_error_left = (int64_t)(gf_group_err
-                                                 - gf_first_frame_err);
+    gf_group_error_left = gf_group_err - gf_first_frame_err;
   } else {
-    twopass->gf_group_error_left = (int64_t)gf_group_err;
+    gf_group_error_left = gf_group_err;
   }
 
+  // Allocate bits to each of the frames in the GF group.
+  allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits);
+
+  // Reset the file position.
+  reset_fpf_position(twopass, start_pos);
+
   // Calculate a section intra ratio used in setting max loop filter.
   if (cpi->common.frame_type != KEY_FRAME) {
     twopass->section_intra_rating =
@@ -1794,37 +1778,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   }
 }
 
-// Allocate bits to a normal frame that is neither a gf an arf or a key frame.
-static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  TWO_PASS *twopass = &cpi->twopass;
-  // For a single frame.
-  const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
-  // Calculate modified prediction error used in bit allocation.
-  const double modified_err = calculate_modified_err(cpi, this_frame);
-  int target_frame_size;
-  double err_fraction;
-
-  if (twopass->gf_group_error_left > 0)
-    // What portion of the remaining GF group error is used by this frame.
-    err_fraction = modified_err / twopass->gf_group_error_left;
-  else
-    err_fraction = 0.0;
-
-  // How many of those bits available for allocation should we give it?
-  target_frame_size = (int)((double)twopass->gf_group_bits * err_fraction);
-
-  // Clip target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at
-  // the top end.
-  target_frame_size = clamp(target_frame_size, 0,
-                            MIN(max_bits, (int)twopass->gf_group_bits));
-
-  // Adjust error and bits remaining.
-  twopass->gf_group_error_left -= (int64_t)modified_err;
-
-  // Per frame bit target for this frame.
-  vp9_rc_set_frame_target(cpi, target_frame_size);
-}
-
 static int test_candidate_kf(TWO_PASS *twopass,
                              const FIRSTPASS_STATS *last_frame,
                              const FIRSTPASS_STATS *this_frame,
@@ -1906,10 +1859,12 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   int i, j;
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   const FIRSTPASS_STATS first_frame = *this_frame;
   const FIRSTPASS_STATS *const start_position = twopass->stats_in;
   FIRSTPASS_STATS next_frame;
   FIRSTPASS_STATS last_frame;
+  int kf_bits = 0;
   double decay_accumulator = 1.0;
   double zero_motion_accumulator = 1.0;
   double boost_score = 0.0;
@@ -1921,6 +1876,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
   cpi->common.frame_type = KEY_FRAME;
 
+  // Reset the GF group data structures.
+  twopass->gf_group_index = 0;
+  vp9_zero(twopass->gf_group_bit_allocation);
+
   // Is this a forced key frame by interval.
   rc->this_key_frame_forced = rc->next_key_frame_forced;
 
@@ -1935,14 +1894,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   twopass->kf_group_bits = 0;        // Total bits available to kf group
   twopass->kf_group_error_left = 0;  // Group modified error score.
 
-  kf_mod_err = calculate_modified_err(cpi, this_frame);
+  kf_mod_err = calculate_modified_err(twopass, oxcf, this_frame);
 
   // Find the next keyframe.
   i = 0;
   while (twopass->stats_in < twopass->stats_in_end &&
          rc->frames_to_key < cpi->oxcf.key_freq) {
     // Accumulate kf group error.
-    kf_group_err += calculate_modified_err(cpi, this_frame);
+    kf_group_err += calculate_modified_err(twopass, oxcf, this_frame);
 
     // Load the next frame's stats.
     last_frame = *this_frame;
@@ -2004,7 +1963,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
     // Rescan to get the correct error data for the forced kf group.
     for (i = 0; i < rc->frames_to_key; ++i) {
-      kf_group_err += calculate_modified_err(cpi, &tmp_frame);
+      kf_group_err += calculate_modified_err(twopass, oxcf, &tmp_frame);
       input_stats(twopass, &tmp_frame);
     }
     rc->next_key_frame_forced = 1;
@@ -2018,7 +1977,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // Special case for the last key frame of the file.
   if (twopass->stats_in >= twopass->stats_in_end) {
     // Accumulate kf group error.
-    kf_group_err += calculate_modified_err(cpi, this_frame);
+    kf_group_err += calculate_modified_err(twopass, oxcf, this_frame);
   }
 
   // Calculate the number of bits that should be assigned to the kf group.
@@ -2086,6 +2045,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     }
   }
 
+  reset_fpf_position(twopass, start_position);
+
   // Store the zero motion percentage
   twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
 
@@ -2102,13 +2063,13 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   if (rc->kf_boost   < MIN_KF_BOOST)
     rc->kf_boost = MIN_KF_BOOST;
 
-  twopass->kf_bits = calculate_boost_bits((rc->frames_to_key - 1),
-                                          rc->kf_boost, twopass->kf_group_bits);
+  kf_bits = calculate_boost_bits((rc->frames_to_key - 1),
+                                  rc->kf_boost, twopass->kf_group_bits);
 
-  twopass->kf_group_bits -= twopass->kf_bits;
+  twopass->kf_group_bits -= kf_bits;
 
-  // Per frame bit target for this frame.
-  vp9_rc_set_frame_target(cpi, twopass->kf_bits);
+  // Save the bits to spend on the key frame.
+  twopass->gf_group_bit_allocation[0] = kf_bits;
 
   // Note the total error score of the kf group minus the key frame itself.
   twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
@@ -2144,7 +2105,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   FIRSTPASS_STATS this_frame;
   FIRSTPASS_STATS this_frame_copy;
 
-  int target;
+  int target_rate;
   LAYER_CONTEXT *lc = NULL;
   const int is_spatial_svc = (cpi->use_svc &&
                               cpi->svc.number_temporal_layers == 1);
@@ -2160,16 +2121,23 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   if (!twopass->stats_in)
     return;
 
+  // Increment the gf group index.
+  ++twopass->gf_group_index;
+
+  // If this is an arf frame then we dont want to read the stats file or
+  // advance the input pointer as we already have what we need.
   if (cpi->refresh_alt_ref_frame) {
-    int modified_target = twopass->gf_bits;
-    rc->base_frame_target = twopass->gf_bits;
-    cm->frame_type = INTER_FRAME;
+    int target_rate;
+    target_rate = twopass->gf_group_bit_allocation[twopass->gf_group_index];
+    target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
+    rc->base_frame_target = target_rate;
 #ifdef LONG_TERM_VBR_CORRECTION
     // Correction to rate target based on prior over or under shoot.
     if (cpi->oxcf.rc_mode == RC_MODE_VBR)
-      vbr_rate_correction(&modified_target, rc->vbr_bits_off_target);
+      vbr_rate_correction(&target_rate, rc->vbr_bits_off_target);
 #endif
-    vp9_rc_set_frame_target(cpi, modified_target);
+    vp9_rc_set_frame_target(cpi, target_rate);
+    cm->frame_type = INTER_FRAME;
     return;
   }
 
@@ -2197,11 +2165,13 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   if (EOF == input_stats(twopass, &this_frame))
     return;
 
+  // Local copy of the current frame's first pass stats.
+  this_frame_copy = this_frame;
+
   // Keyframe and section processing.
   if (rc->frames_to_key == 0 ||
       (cpi->frame_flags & FRAMEFLAGS_KEY)) {
     // Define next KF group and assign bits to it.
-    this_frame_copy = this_frame;
     find_next_key_frame(cpi, &this_frame_copy);
   } else {
     cm->frame_type = INTER_FRAME;
@@ -2220,11 +2190,8 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
     }
   }
 
-  // Is this frame a GF / ARF? (Note: a key frame is always also a GF).
+  // Define a new GF/ARF group. (Should always enter here for key frames).
   if (rc->frames_till_gf_update_due == 0) {
-    // Define next gf group and assign bits to it.
-    this_frame_copy = this_frame;
-
 #if CONFIG_MULTIPLE_ARF
     if (cpi->multi_arf_enabled) {
       define_fixed_arf_period(cpi);
@@ -2247,11 +2214,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
 
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
     cpi->refresh_golden_frame = 1;
-  } else {
-    // Otherwise this is an ordinary frame.
-    // Assign bits from those allocated to the GF group.
-    this_frame_copy =  this_frame;
-    assign_std_frame_bits(cpi, &this_frame_copy);
   }
 
   {
@@ -2262,18 +2224,19 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
     }
   }
 
+  target_rate = twopass->gf_group_bit_allocation[twopass->gf_group_index];
   if (cpi->common.frame_type == KEY_FRAME)
-    target = vp9_rc_clamp_iframe_target_size(cpi, rc->this_frame_target);
+    target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
   else
-    target = vp9_rc_clamp_pframe_target_size(cpi, rc->this_frame_target);
+    target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
 
-  rc->base_frame_target = target;
+  rc->base_frame_target = target_rate;
 #ifdef LONG_TERM_VBR_CORRECTION
   // Correction to rate target based on prior over or under shoot.
   if (cpi->oxcf.rc_mode == RC_MODE_VBR)
-    vbr_rate_correction(&target, rc->vbr_bits_off_target);
+    vbr_rate_correction(&target_rate, rc->vbr_bits_off_target);
 #endif
-  vp9_rc_set_frame_target(cpi, target);
+  vp9_rc_set_frame_target(cpi, target_rate);
 
   // Update the total stats remaining structure.
   subtract_stats(&twopass->total_left_stats, &this_frame);
@@ -2322,8 +2285,6 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
   } else {
 #endif
     twopass->kf_group_bits -= bits_used;
-    twopass->gf_group_bits -= bits_used;
-    twopass->gf_group_bits = MAX(twopass->gf_group_bits, 0);
   }
   twopass->kf_group_bits = MAX(twopass->kf_group_bits, 0);
 }
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index d84793e28..c89cfaf8d 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -11,6 +11,8 @@
 #ifndef VP9_ENCODER_VP9_FIRSTPASS_H_
 #define VP9_ENCODER_VP9_FIRSTPASS_H_
 
+#include "vp9/encoder/vp9_lookahead.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -42,7 +44,6 @@ typedef struct {
   unsigned int section_intra_rating;
   unsigned int next_iiratio;
   FIRSTPASS_STATS total_stats;
-  FIRSTPASS_STATS this_frame_stats;
   const FIRSTPASS_STATS *stats_in;
   const FIRSTPASS_STATS *stats_in_start;
   const FIRSTPASS_STATS *stats_in_end;
@@ -51,31 +52,24 @@ typedef struct {
   int64_t bits_left;
   double modified_error_min;
   double modified_error_max;
-  double modified_error_total;
   double modified_error_left;
   double kf_intra_err_min;
   double gf_intra_err_min;
-  int kf_bits;
-  // Remaining error from uncoded frames in a gf group. Two pass use only
-  int64_t gf_group_error_left;
 
   // Projected total bits available for a key frame group of frames
   int64_t kf_group_bits;
 
   // Error score of frames still to be coded in kf group
   int64_t kf_group_error_left;
-
-  // Projected Bits available for a group of frames including 1 GF or ARF
-  int64_t gf_group_bits;
-  // Bits for the golden frame or ARF - 2 pass only
-  int gf_bits;
-
   int sr_update_lag;
 
   int kf_zeromotion_pct;
   int gf_zeromotion_pct;
 
   int active_worst_quality;
+
+  int gf_group_index;
+  int gf_group_bit_allocation[MAX_LAG_BUFFERS * 2];
 } TWO_PASS;
 
 struct VP9_COMP;
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 041e583fd..842bc5b9d 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -72,8 +72,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
   x->mv_row_max = tmp_row_max;
 
   return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-          xd->plane[0].dst.buf, xd->plane[0].dst.stride,
-          INT_MAX);
+          xd->plane[0].dst.buf, xd->plane[0].dst.stride);
 }
 
 static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
@@ -86,8 +85,7 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
   err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
-                     INT_MAX);
+                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
   dst_mv->as_int = 0;
 
   // Test last reference frame using the previous best mv as the
@@ -123,8 +121,7 @@ static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) {
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
   err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
-                     INT_MAX);
+                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
 
   dst_mv->as_int = 0;
 
@@ -147,7 +144,7 @@ static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) {
                             xd->plane[0].dst.buf, xd->plane[0].dst.stride,
                             0, 0, 0);
     err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                       xd->plane[0].dst.buf, xd->plane[0].dst.stride, best_err);
+                       xd->plane[0].dst.buf, xd->plane[0].dst.stride);
 
     // find best
     if (err < best_err) {
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 4f7d6f17c..dbd19a2d6 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -524,9 +524,8 @@ static int vp9_pattern_search(const MACROBLOCK *x,
 
   // Work out the start point for the search
   bestsad = vfp->sdf(what->buf, what->stride,
-                     get_buf_from_mv(in_what, ref_mv), in_what->stride,
-                     0x7fffffff) + mvsad_err_cost(x, ref_mv, &fcenter_mv,
-                                                  sad_per_bit);
+                     get_buf_from_mv(in_what, ref_mv), in_what->stride) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
 
   // Search all possible scales upto the search param around the center point
   // pick the scale of the point that is best as the starting scale of
@@ -542,7 +541,7 @@ static int vp9_pattern_search(const MACROBLOCK *x,
                               bc + candidates[t][i].col};
           thissad = vfp->sdf(what->buf, what->stride,
                              get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride, bestsad);
+                             in_what->stride);
           CHECK_BETTER
         }
       } else {
@@ -553,7 +552,7 @@ static int vp9_pattern_search(const MACROBLOCK *x,
             continue;
           thissad = vfp->sdf(what->buf, what->stride,
                              get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride, bestsad);
+                             in_what->stride);
           CHECK_BETTER
         }
       }
@@ -585,7 +584,7 @@ static int vp9_pattern_search(const MACROBLOCK *x,
                                 bc + candidates[s][i].col};
             thissad = vfp->sdf(what->buf, what->stride,
                                get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride, bestsad);
+                               in_what->stride);
             CHECK_BETTER
           }
         } else {
@@ -596,7 +595,7 @@ static int vp9_pattern_search(const MACROBLOCK *x,
               continue;
             thissad = vfp->sdf(what->buf, what->stride,
                                get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride, bestsad);
+                               in_what->stride);
             CHECK_BETTER
           }
         }
@@ -623,7 +622,7 @@ static int vp9_pattern_search(const MACROBLOCK *x,
                                 bc + candidates[s][next_chkpts_indices[i]].col};
             thissad = vfp->sdf(what->buf, what->stride,
                                get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride, bestsad);
+                               in_what->stride);
             CHECK_BETTER
           }
         } else {
@@ -634,7 +633,7 @@ static int vp9_pattern_search(const MACROBLOCK *x,
               continue;
             thissad = vfp->sdf(what->buf, what->stride,
                                get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride, bestsad);
+                               in_what->stride);
             CHECK_BETTER
           }
         }
@@ -661,7 +660,7 @@ static int vp9_pattern_search(const MACROBLOCK *x,
                               bc + neighbors[i].col};
           thissad = vfp->sdf(what->buf, what->stride,
                              get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride, bestsad);
+                             in_what->stride);
           CHECK_BETTER
         }
       } else {
@@ -672,7 +671,7 @@ static int vp9_pattern_search(const MACROBLOCK *x,
             continue;
           thissad = vfp->sdf(what->buf, what->stride,
                              get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride, bestsad);
+                             in_what->stride);
           CHECK_BETTER
         }
       }
@@ -894,8 +893,7 @@ int vp9_full_range_search_c(const MACROBLOCK *x,
   *best_mv = *ref_mv;
   *num00 = 11;
   best_sad = fn_ptr->sdf(what->buf, what->stride,
-                         get_buf_from_mv(in_what, ref_mv), in_what->stride,
-                         0x7fffffff) +
+                         get_buf_from_mv(in_what, ref_mv), in_what->stride) +
                  mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
   start_row = MAX(-range, x->mv_row_min - ref_mv->row);
   start_col = MAX(-range, x->mv_col_min - ref_mv->col);
@@ -929,7 +927,7 @@ int vp9_full_range_search_c(const MACROBLOCK *x,
         for (i = 0; i < end_col - c; ++i) {
           const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
           unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
-              get_buf_from_mv(in_what, &mv), in_what->stride, best_sad);
+              get_buf_from_mv(in_what, &mv), in_what->stride);
           if (sad < best_sad) {
             sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
             if (sad < best_sad) {
@@ -975,7 +973,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
 
   // Check the starting position
   best_sad = fn_ptr->sdf(what->buf, what->stride,
-                         best_address, in_what->stride, 0x7fffffff) +
+                         best_address, in_what->stride) +
       mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
 
   i = 1;
@@ -986,8 +984,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
                      best_mv->col + ss[i].mv.col};
       if (is_mv_in(x, &mv)) {
        int sad = fn_ptr->sdf(what->buf, what->stride,
-                             best_address + ss[i].offset, in_what->stride,
-                             best_sad);
+                             best_address + ss[i].offset, in_what->stride);
         if (sad < best_sad) {
           sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
           if (sad < best_sad) {
@@ -1012,7 +1009,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
         if (is_mv_in(x, &this_mv)) {
           int sad = fn_ptr->sdf(what->buf, what->stride,
                                 best_address + ss[best_site].offset,
-                                in_what->stride, best_sad);
+                                in_what->stride);
           if (sad < best_sad) {
             sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
             if (sad < best_sad) {
@@ -1077,7 +1074,7 @@ int vp9_diamond_search_sadx4(const MACROBLOCK *x,
   best_address = in_what;
 
   // Check the starting position
-  bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff)
+  bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
                 + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
 
   i = 1;
@@ -1129,7 +1126,7 @@ int vp9_diamond_search_sadx4(const MACROBLOCK *x,
         if (is_mv_in(x, &this_mv)) {
           const uint8_t *const check_here = ss[i].offset + best_address;
           unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
-                                             in_what_stride, bestsad);
+                                             in_what_stride);
 
           if (thissad < bestsad) {
             thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
@@ -1154,7 +1151,7 @@ int vp9_diamond_search_sadx4(const MACROBLOCK *x,
         if (is_mv_in(x, &this_mv)) {
           const uint8_t *const check_here = ss[best_site].offset + best_address;
           unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
-                                             in_what_stride, bestsad);
+                                             in_what_stride);
           if (thissad < bestsad) {
             thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
                                       mvjsadcost, mvsadcost, sad_per_bit);
@@ -1253,7 +1250,7 @@ int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
   const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   int best_sad = fn_ptr->sdf(what->buf, what->stride,
-      get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) +
+      get_buf_from_mv(in_what, ref_mv), in_what->stride) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
   *best_mv = *ref_mv;
 
@@ -1261,7 +1258,7 @@ int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
     for (c = col_min; c < col_max; ++c) {
       const MV mv = {r, c};
       const int sad = fn_ptr->sdf(what->buf, what->stride,
-          get_buf_from_mv(in_what, &mv), in_what->stride, best_sad) +
+          get_buf_from_mv(in_what, &mv), in_what->stride) +
               mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
       if (sad < best_sad) {
         best_sad = sad;
@@ -1286,7 +1283,7 @@ int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
   const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
-      get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) +
+      get_buf_from_mv(in_what, ref_mv), in_what->stride) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
   *best_mv = *ref_mv;
 
@@ -1320,7 +1317,7 @@ int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
 
     while (c < col_max) {
       unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
-                                     check_here, in_what->stride, best_sad);
+                                     check_here, in_what->stride);
       if (sad < best_sad) {
         const MV mv = {r, c};
         sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
@@ -1351,7 +1348,7 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
   const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
-      get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) +
+      get_buf_from_mv(in_what, ref_mv), in_what->stride) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
   *best_mv = *ref_mv;
 
@@ -1409,7 +1406,7 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
 
     while (c < col_max) {
       unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
-                                     check_here, in_what->stride, best_sad);
+                                     check_here, in_what->stride);
       if (sad < best_sad) {
         const MV mv = {r, c};
         sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
@@ -1438,7 +1435,7 @@ int vp9_refining_search_sad_c(const MACROBLOCK *x,
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
                                      get_buf_from_mv(in_what, ref_mv),
-                                     in_what->stride, 0x7fffffff) +
+                                     in_what->stride) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
   int i, j;
 
@@ -1450,7 +1447,7 @@ int vp9_refining_search_sad_c(const MACROBLOCK *x,
                      ref_mv->col + neighbors[j].col};
       if (is_mv_in(x, &mv)) {
         unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
-            get_buf_from_mv(in_what, &mv), in_what->stride, best_sad);
+            get_buf_from_mv(in_what, &mv), in_what->stride);
         if (sad < best_sad) {
           sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
           if (sad < best_sad) {
@@ -1483,7 +1480,7 @@ int vp9_refining_search_sadx4(const MACROBLOCK *x,
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv);
   unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride, best_address,
-                                    in_what->stride, 0x7fffffff) +
+                                    in_what->stride) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
   int i, j;
 
@@ -1524,7 +1521,7 @@ int vp9_refining_search_sadx4(const MACROBLOCK *x,
         if (is_mv_in(x, &mv)) {
           unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
                                          get_buf_from_mv(in_what, &mv),
-                                         in_what->stride, best_sad);
+                                         in_what->stride);
           if (sad < best_sad) {
             sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
             if (sad < best_sad) {
@@ -1563,8 +1560,7 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x,
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   unsigned int best_sad = fn_ptr->sdaf(what->buf, what->stride,
-      get_buf_from_mv(in_what, ref_mv), in_what->stride,
-      second_pred, 0x7fffffff) +
+      get_buf_from_mv(in_what, ref_mv), in_what->stride, second_pred) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
   int i, j;
 
@@ -1577,8 +1573,7 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x,
 
       if (is_mv_in(x, &mv)) {
         unsigned int sad = fn_ptr->sdaf(what->buf, what->stride,
-            get_buf_from_mv(in_what, &mv), in_what->stride,
-            second_pred, best_sad);
+            get_buf_from_mv(in_what, &mv), in_what->stride, second_pred);
         if (sad < best_sad) {
           sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
           if (sad < best_sad) {
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 437b68078..913b8ead4 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -280,13 +280,11 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
       int rate_mv = 0;
 
-      if (cpi->sf.disable_inter_mode_mask[bsize] &
-          (1 << INTER_OFFSET(this_mode)))
+      if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
         continue;
 
-      if (best_rd < ((int64_t)rd_threshes[mode_idx[this_mode]] *
-          rd_thresh_freq_fact[this_mode] >> 5) ||
-          rd_threshes[mode_idx[this_mode]] == INT_MAX)
+      if (rd_less_than_thresh(best_rd, rd_threshes[mode_idx[this_mode]],
+                              rd_thresh_freq_fact[this_mode]))
         continue;
 
       if (this_mode == NEWMV) {
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index a04622c8c..b58eac981 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -200,9 +200,8 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
                                    oxcf->best_allowed_q) / 2;
   }
 
-  rc->last_q[0] = oxcf->best_allowed_q;
-  rc->last_q[1] = oxcf->best_allowed_q;
-  rc->last_q[2] = oxcf->best_allowed_q;
+  rc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
+  rc->last_q[INTER_FRAME] = oxcf->best_allowed_q;
 
   rc->buffer_level =    oxcf->starting_buffer_level;
   rc->bits_off_target = oxcf->starting_buffer_level;
@@ -213,6 +212,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
   rc->long_rolling_actual_bits = rc->avg_frame_bandwidth;
 
   rc->total_actual_bits = 0;
+  rc->total_target_bits = 0;
   rc->total_target_vs_actual = 0;
 
   rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
@@ -607,13 +607,27 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
   return q;
 }
 
+static int get_active_cq_level(const RATE_CONTROL *rc,
+                               const VP9EncoderConfig *const oxcf) {
+  static const double cq_adjust_threshold = 0.5;
+  int active_cq_level = oxcf->cq_level;
+  if (oxcf->rc_mode == RC_MODE_CONSTRAINED_QUALITY &&
+      rc->total_target_bits > 0) {
+    const double x = (double)rc->total_actual_bits / rc->total_target_bits;
+    if (x < cq_adjust_threshold) {
+      active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold);
+    }
+  }
+  return active_cq_level;
+}
+
 static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
                                              int *bottom_index,
                                              int *top_index) {
   const VP9_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  const int cq_level = oxcf->cq_level;
+  const int cq_level = get_active_cq_level(rc, oxcf);
   int active_best_quality;
   int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
   int q;
@@ -791,7 +805,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
   const VP9_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  const int cq_level = oxcf->cq_level;
+  const int cq_level = get_active_cq_level(rc, oxcf);
   int active_best_quality;
   int active_worst_quality = cpi->twopass.active_worst_quality;
   int q;
@@ -1075,7 +1089,6 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   } else if (!rc->is_src_frame_alt_ref &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) &&
              !(cpi->use_svc && oxcf->rc_mode == RC_MODE_CBR)) {
-    rc->last_q[2] = qindex;
     rc->avg_frame_qindex[2] =
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[2] + qindex, 2);
   } else {
@@ -1375,6 +1388,24 @@ int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
   return target_index - qindex;
 }
 
+void vp9_rc_set_gf_max_interval(const VP9EncoderConfig *const oxcf,
+                                RATE_CONTROL *const rc) {
+  // Set Maximum gf/arf interval
+  rc->max_gf_interval = 16;
+
+  // Extended interval for genuinely static scenes
+  rc->static_scene_max_gf_interval = oxcf->key_freq >> 1;
+
+  // Special conditions when alt ref frame enabled
+  if (oxcf->play_alternate && oxcf->lag_in_frames) {
+    if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
+      rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
+  }
+
+  if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
+    rc->max_gf_interval = rc->static_scene_max_gf_interval;
+}
+
 void vp9_rc_update_framerate(VP9_COMP *cpi) {
   const VP9_COMMON *const cm = &cpi->common;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
@@ -1399,21 +1430,5 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) {
   rc->max_frame_bandwidth = MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P),
                                     vbr_max_bits);
 
-  // Set Maximum gf/arf interval
-  rc->max_gf_interval = 16;
-
-  // Extended interval for genuinely static scenes
-  rc->static_scene_max_gf_interval = cpi->oxcf.key_freq >> 1;
-
-  // Special conditions when alt ref frame enabled in lagged compress mode
-  if (oxcf->play_alternate && oxcf->lag_in_frames) {
-    if (rc->max_gf_interval > oxcf->lag_in_frames - 1)
-      rc->max_gf_interval = oxcf->lag_in_frames - 1;
-
-    if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
-      rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
-  }
-
-  if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
-    rc->max_gf_interval = rc->static_scene_max_gf_interval;
+  vp9_rc_set_gf_max_interval(oxcf, rc);
 }
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index b1cc67609..d6a0151b6 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -30,7 +30,7 @@ typedef struct {
   int this_frame_target;           // Actual frame target after rc adjustment.
   int projected_frame_size;
   int sb64_target_rate;
-  int last_q[3];                   // Separate values for Intra/Inter/ARF-GF
+  int last_q[FRAME_TYPES];         // Separate values for Intra/Inter
   int last_boosted_qindex;         // Last boosted GF/KF/ARF q
 
   int gfu_boost;
@@ -178,6 +178,9 @@ int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
 
 void vp9_rc_update_framerate(struct VP9_COMP *cpi);
 
+void vp9_rc_set_gf_max_interval(const struct VP9EncoderConfig *const oxcf,
+                                RATE_CONTROL *const rc);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 601e64d39..f68aa2738 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -9,7 +9,6 @@
  */
 
 #include <assert.h>
-#include <limits.h>
 #include <math.h>
 #include <stdio.h>
 
@@ -22,7 +21,6 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_mvref_common.h"
-#include "vp9/common/vp9_pragmas.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_reconinter.h"
@@ -1677,9 +1675,9 @@ static INLINE int mv_has_subpel(const MV *mv) {
 static int check_best_zero_mv(
     const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
     int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
-    int disable_inter_mode_mask, int this_mode,
+    int inter_mode_mask, int this_mode,
     const MV_REFERENCE_FRAME ref_frames[2]) {
-  if (!(disable_inter_mode_mask & (1 << INTER_OFFSET(ZEROMV))) &&
+  if ((inter_mode_mask & (1 << ZEROMV)) &&
       (this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
       frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
       (ref_frames[1] == NONE ||
@@ -1745,7 +1743,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
   ENTROPY_CONTEXT t_above[2], t_left[2];
   int subpelmv = 1, have_ref = 0;
   const int has_second_rf = has_second_ref(mbmi);
-  const int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize];
+  const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
 
   vp9_zero(*bsi);
 
@@ -1794,11 +1792,11 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
         mode_idx = INTER_OFFSET(this_mode);
         bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
-        if (disable_inter_mode_mask & (1 << mode_idx))
+        if (!(inter_mode_mask & (1 << this_mode)))
           continue;
 
         if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
-                                disable_inter_mode_mask,
+                                inter_mode_mask,
                                 this_mode, mbmi->ref_frame))
           continue;
 
@@ -2131,8 +2129,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
 
     // Find sad for current vector.
     this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
-                                           ref_y_ptr, ref_y_stride,
-                                           0x7fffffff);
+                                           ref_y_ptr, ref_y_stride);
 
     // Note if it is the best so far.
     if (this_sad < best_sad) {
@@ -2210,8 +2207,6 @@ static void estimate_ref_frame_costs(const VP9_COMMON *cm,
 
 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
                          int mode_index,
-                         int_mv *ref_mv,
-                         int_mv *second_ref_mv,
                          int64_t comp_pred_diff[REFERENCE_MODES],
                          const int64_t tx_size_diff[TX_MODES],
                          int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
@@ -2991,11 +2986,6 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   ctx->mic = *xd->mi[0];
 }
 
-static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
-                                      int thresh_fact) {
-    return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
-}
-
 // Updating rd_thresh_freq_fact[] here means that the different
 // partition/block sizes are handled independently based on the best
 // choice for the current partition. It may well be better to keep a scaled
@@ -3073,7 +3063,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
   const int intra_y_mode_mask =
       cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
-  int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize];
+  int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
   vp9_zero(best_mbmode);
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
 
@@ -3139,6 +3129,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
     const int inter_non_zero_mode_mask = 0x1F7F7;
     mode_skip_mask |= inter_non_zero_mode_mask;
+    mode_skip_mask &= ~(1 << THR_ZEROMV);
+    inter_mode_mask = (1 << ZEROMV);
   }
 
   // Disable this drop out case if the ref frame
@@ -3172,7 +3164,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   }
 
   if (bsize > cpi->sf.max_intra_bsize) {
-    mode_skip_mask |= 0xFF30808;
+    const int all_intra_modes = (1 << THR_DC) | (1 << THR_TM) |
+        (1 << THR_H_PRED) | (1 << THR_V_PRED) | (1 << THR_D135_PRED) |
+        (1 << THR_D207_PRED) | (1 << THR_D153_PRED) | (1 << THR_D63_PRED) |
+        (1 << THR_D117_PRED) | (1 << THR_D45_PRED);
+    mode_skip_mask |= all_intra_modes;
   }
 
   if (!x->in_active_map) {
@@ -3186,7 +3182,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       mode_index = THR_ZEROMV;
     mode_skip_mask = ~(1 << mode_index);
     mode_skip_start = MAX_MODES;
-    disable_inter_mode_mask = 0;
+    inter_mode_mask = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) |
+                      (1 << NEWMV);
   }
 
   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
@@ -3228,13 +3225,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
     // Test best rd so far against threshold for trying this mode.
     if (rd_less_than_thresh(best_rd, rd_threshes[mode_index],
-        rd_thresh_freq_fact[mode_index]))
+                            rd_thresh_freq_fact[mode_index]))
       continue;
 
     this_mode = vp9_mode_order[mode_index].mode;
     ref_frame = vp9_mode_order[mode_index].ref_frame[0];
-    if (ref_frame != INTRA_FRAME &&
-        disable_inter_mode_mask & (1 << INTER_OFFSET(this_mode)))
+    if (ref_frame != INTRA_FRAME && !(inter_mode_mask & (1 << this_mode)))
       continue;
     second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
 
@@ -3283,7 +3279,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
           !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
         const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
         if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
-                                disable_inter_mode_mask, this_mode, ref_frames))
+                                inter_mode_mask, this_mode, ref_frames))
           continue;
       }
     }
@@ -3621,9 +3617,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
   set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
   store_coding_context(x, ctx, best_mode_index,
-                       &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
-                       &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
-                                      mbmi->ref_frame[1]][0],
                        best_pred_diff, best_tx_diff, best_filter_diff);
 
   return best_rd;
@@ -3672,7 +3665,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
   int_mv seg_mvs[4][MAX_REF_FRAMES];
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
-  int ref_frame_mask = 0;
   int mode_skip_mask = 0;
 
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
@@ -3707,17 +3699,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
-  for (ref_frame = LAST_FRAME;
-       ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
-    int i;
-    for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
-      if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) {
-        ref_frame_mask |= (1 << ref_frame);
-        break;
-      }
-    }
-  }
-
   for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) {
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
@@ -3812,11 +3793,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
             (int)ref_frame) {
       continue;
-    // If the segment skip feature is enabled....
-    // then do nothing if the current mode is not allowed..
-    } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
-               ref_frame != INTRA_FRAME) {
-      continue;
     // Disable this drop out case if the ref frame
     // segment level feature is enabled for this segment. This is to
     // prevent the possibility that we end up unable to pick any mode.
@@ -4041,15 +4017,10 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     if (!disable_skip) {
-      // Test for the condition where skip block will be activated
-      // because there are no non zero coefficients and make any
-      // necessary adjustment for rate. Ignore if skip is coded at
-      // segment level as the cost wont have been added in.
-      // Is Mb level skip allowed (i.e. not coded at segment level).
-      const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
-                                                         SEG_LVL_SKIP);
+      // Skip is never coded at the segment level for sub8x8 blocks and instead
+      // always coded in the bitstream at the mode info level.
 
-      if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
+      if (ref_frame != INTRA_FRAME && !xd->lossless) {
         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
           // Add in the cost of the no skip flag.
@@ -4064,7 +4035,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
           rate_uv = 0;
           this_skip2 = 1;
         }
-      } else if (mb_skip_allowed) {
+      } else {
         // Add in the cost of the no skip flag.
         rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
       }
@@ -4254,9 +4225,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 
   set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
   store_coding_context(x, ctx, best_ref_index,
-                       &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
-                       &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
-                                      mbmi->ref_frame[1]][0],
                        best_pred_diff, best_tx_diff, best_filter_diff);
 
   return best_rd;
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index b6b51e553..5ea09a8a7 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -11,6 +11,8 @@
 #ifndef VP9_ENCODER_VP9_RDOPT_H_
 #define VP9_ENCODER_VP9_RDOPT_H_
 
+#include <limits.h>
+
 #include "vp9/encoder/vp9_encoder.h"
 
 #ifdef __cplusplus
@@ -87,6 +89,11 @@ void vp9_set_rd_speed_thresholds(VP9_COMP *cpi);
 
 void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi);
 
+static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
+                                      int thresh_fact) {
+    return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
+}
+
 static INLINE int full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
                                     BLOCK_SIZE bsize, MV *mvp_full,
                                     int step_param, int error_per_bit,
diff --git a/vp9/encoder/vp9_sad.c b/vp9/encoder/vp9_sad.c
index 892e90551..d06263676 100644
--- a/vp9/encoder/vp9_sad.c
+++ b/vp9/encoder/vp9_sad.c
@@ -35,14 +35,12 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride,
 
 #define sadMxN(m, n) \
 unsigned int vp9_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
-                                  const uint8_t *ref, int ref_stride, \
-                                  unsigned int max_sad) { \
+                                  const uint8_t *ref, int ref_stride) { \
   return sad(src, src_stride, ref, ref_stride, m, n); \
 } \
 unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
                                       const uint8_t *ref, int ref_stride, \
-                                      const uint8_t *second_pred, \
-                                      unsigned int max_sad) { \
+                                      const uint8_t *second_pred) { \
   uint8_t comp_pred[m * n]; \
   vp9_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
   return sad(src, src_stride, comp_pred, m, m, n); \
@@ -54,8 +52,7 @@ void vp9_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
                                 unsigned int *sads) { \
   int i; \
   for (i = 0; i < k; ++i) \
-    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride, \
-                                   0x7fffffff); \
+    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \
 }
 
 #define sadMxNx4D(m, n) \
@@ -64,8 +61,7 @@ void vp9_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
                              unsigned int *sads) { \
   int i; \
   for (i = 0; i < 4; ++i) \
-    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride, \
-                                   0x7fffffff); \
+    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \
 }
 
 // 64x64
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 7b2d1e2f0..15b986197 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -14,20 +14,23 @@
 #include "vp9/encoder/vp9_speed_features.h"
 
 enum {
-  ALL_INTRA_MODES = (1 << DC_PRED) |
+  INTRA_ALL       = (1 << DC_PRED) |
                     (1 << V_PRED) | (1 << H_PRED) |
                     (1 << D45_PRED) | (1 << D135_PRED) |
                     (1 << D117_PRED) | (1 << D153_PRED) |
                     (1 << D207_PRED) | (1 << D63_PRED) |
                     (1 << TM_PRED),
-
-  INTRA_DC_ONLY   = (1 << DC_PRED),
-
-  INTRA_DC_TM     = (1 << TM_PRED) | (1 << DC_PRED),
-
+  INTRA_DC        = (1 << DC_PRED),
+  INTRA_DC_TM     = (1 << DC_PRED) | (1 << TM_PRED),
   INTRA_DC_H_V    = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+  INTRA_DC_TM_H_V = (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) |
+                    (1 << H_PRED)
+};
 
-  INTRA_DC_TM_H_V = INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED)
+enum {
+  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
+  INTER_NEAREST = (1 << NEARESTMV),
+  INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV)
 };
 
 enum {
@@ -80,12 +83,16 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
   }
 
   if (speed >= 2) {
-    if (MIN(cm->width, cm->height) >= 720)
+    if (MIN(cm->width, cm->height) >= 720) {
+      sf->lf_motion_threshold = LOW_MOITION_THRESHOLD;
+      sf->last_partitioning_redo_frequency = 3;
       sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
                                               : DISABLE_ALL_INTER_SPLIT;
-    else
+    } else {
       sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
-
+      sf->last_partitioning_redo_frequency = 2;
+      sf->lf_motion_threshold = NO_MOITION_THRESHOLD;
+    }
     sf->adaptive_pred_interp_filter = 2;
     sf->reference_masking = 1;
     sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
@@ -97,7 +104,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
     sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
     sf->adjust_partitioning_from_last_frame = 1;
-    sf->last_partitioning_redo_frequency = 3;
   }
 
   if (speed >= 3) {
@@ -108,6 +114,8 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
     else
       sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
 
+    sf->lf_motion_threshold = LOW_MOITION_THRESHOLD;
+    sf->last_partitioning_redo_frequency = 3;
     sf->recode_loop = ALLOW_RECODE_KFMAXBW;
     sf->adaptive_rd_thresh = 3;
     sf->mode_skip_start = 6;
@@ -135,20 +143,23 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
     sf->search_method = HEX;
     sf->disable_filter_search_var_thresh = 500;
     for (i = 0; i < TX_SIZES; ++i) {
-      sf->intra_y_mode_mask[i] = INTRA_DC_ONLY;
-      sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
+      sf->intra_y_mode_mask[i] = INTRA_DC;
+      sf->intra_uv_mode_mask[i] = INTRA_DC;
     }
     cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
   }
 }
 
-static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf,
+static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
                                  int speed) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int frames_since_key =
+      cm->frame_type == KEY_FRAME ? 0 : cpi->rc.frames_since_key;
   sf->static_segmentation = 0;
   sf->adaptive_rd_thresh = 1;
   sf->use_fast_coef_costing = 1;
 
-  if (speed == 1) {
+  if (speed >= 1) {
     sf->use_square_partition_only = !frame_is_intra_only(cm);
     sf->less_rectangular_check = 1;
     sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
@@ -171,13 +182,9 @@ static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf,
   }
 
   if (speed >= 2) {
-    sf->use_square_partition_only = !frame_is_intra_only(cm);
-    sf->less_rectangular_check = 1;
-    sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
-                                                        : USE_LARGESTALL;
     if (MIN(cm->width, cm->height) >= 720)
-      sf->disable_split_mask = cm->show_frame ?
-        DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                              : DISABLE_ALL_INTER_SPLIT;
     else
       sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
 
@@ -185,27 +192,18 @@ static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf,
                                  FLAG_SKIP_INTRA_BESTINTER |
                                  FLAG_SKIP_COMP_BESTINTRA |
                                  FLAG_SKIP_INTRA_LOWVAR;
-    sf->use_rd_breakout = 1;
-    sf->adaptive_motion_search = 1;
     sf->adaptive_pred_interp_filter = 2;
-    sf->auto_mv_step_size = 1;
     sf->reference_masking = 1;
-
     sf->disable_filter_search_var_thresh = 50;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
     sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
+    sf->lf_motion_threshold = LOW_MOITION_THRESHOLD;
     sf->adjust_partitioning_from_last_frame = 1;
     sf->last_partitioning_redo_frequency = 3;
-
-    sf->adaptive_rd_thresh = 2;
     sf->use_lp32x32fdct = 1;
     sf->mode_skip_start = 11;
-    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
     sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
   }
 
   if (speed >= 3) {
@@ -233,19 +231,19 @@ static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf,
     sf->auto_min_max_partition_size = STRICT_NEIGHBORING_MIN_MAX;
     sf->adjust_partitioning_from_last_frame =
         cm->last_frame_type != cm->frame_type || (0 ==
-        (cm->current_video_frame + 1) % sf->last_partitioning_redo_frequency);
+        (frames_since_key + 1) % sf->last_partitioning_redo_frequency);
     sf->subpel_force_stop = 1;
     for (i = 0; i < TX_SIZES; i++) {
       sf->intra_y_mode_mask[i] = INTRA_DC_H_V;
-      sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
+      sf->intra_uv_mode_mask[i] = INTRA_DC;
     }
-    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_ONLY;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
     sf->frame_parameter_update = 0;
     sf->search_method = FAST_HEX;
-    sf->disable_inter_mode_mask[BLOCK_32X32] = 1 << INTER_OFFSET(ZEROMV);
-    sf->disable_inter_mode_mask[BLOCK_32X64] = ~(1 << INTER_OFFSET(NEARESTMV));
-    sf->disable_inter_mode_mask[BLOCK_64X32] = ~(1 << INTER_OFFSET(NEARESTMV));
-    sf->disable_inter_mode_mask[BLOCK_64X64] = ~(1 << INTER_OFFSET(NEARESTMV));
+    sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW;
+    sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST;
+    sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST;
+    sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST;
     sf->max_intra_bsize = BLOCK_32X32;
     sf->allow_skip_recode = 1;
   }
@@ -254,9 +252,9 @@ static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf,
     sf->max_partition_size = BLOCK_32X32;
     sf->min_partition_size = BLOCK_8X8;
     sf->partition_check =
-        (cm->current_video_frame % sf->last_partitioning_redo_frequency == 1);
+        (frames_since_key % sf->last_partitioning_redo_frequency == 1);
     sf->force_frame_boost = cm->frame_type == KEY_FRAME ||
-        (cm->current_video_frame %
+        (frames_since_key %
             (sf->last_partitioning_redo_frequency << 1) == 1);
     sf->max_delta_qindex = (cm->frame_type == KEY_FRAME) ? 20 : 15;
     sf->partition_search_type = REFERENCE_PARTITION;
@@ -276,7 +274,7 @@ static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf,
   if (speed >= 7) {
     int i;
     for (i = 0; i < BLOCK_SIZES; ++i)
-      sf->disable_inter_mode_mask[i] = ~(1 << INTER_OFFSET(NEARESTMV));
+      sf->inter_mode_mask[i] = INTER_NEAREST;
   }
 }
 
@@ -321,8 +319,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->disable_split_var_thresh = 0;
   sf->disable_filter_search_var_thresh = 0;
   for (i = 0; i < TX_SIZES; i++) {
-    sf->intra_y_mode_mask[i] = ALL_INTRA_MODES;
-    sf->intra_uv_mode_mask[i] = ALL_INTRA_MODES;
+    sf->intra_y_mode_mask[i] = INTRA_ALL;
+    sf->intra_uv_mode_mask[i] = INTRA_ALL;
   }
   sf->use_rd_breakout = 0;
   sf->skip_encode_sb = 0;
@@ -334,7 +332,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->mode_skip_start = MAX_MODES;  // Mode index at which mode skip mask set
   sf->use_nonrd_pick_mode = 0;
   for (i = 0; i < BLOCK_SIZES; ++i)
-    sf->disable_inter_mode_mask[i] = 0;
+    sf->inter_mode_mask[i] = INTER_ALL;
   sf->max_intra_bsize = BLOCK_64X64;
   // This setting only takes effect when partition_search_type is set
   // to FIXED_PARTITION.
@@ -356,7 +354,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
       set_good_speed_feature(cpi, cm, sf, oxcf->speed);
       break;
     case REALTIME:
-      set_rt_speed_feature(cm, sf, oxcf->speed);
+      set_rt_speed_feature(cpi, sf, oxcf->speed);
       break;
   }
 
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index d8c1a8be2..3e7cd27d8 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -44,6 +44,11 @@ typedef enum {
 } SUBPEL_SEARCH_METHODS;
 
 typedef enum {
+  NO_MOITION_THRESHOLD = 0,
+  LOW_MOITION_THRESHOLD = 7
+} MOTION_THRESHOLD;
+
+typedef enum {
   LAST_FRAME_PARTITION_OFF = 0,
   LAST_FRAME_PARTITION_LOW_MOTION = 1,
   LAST_FRAME_PARTITION_ALL = 2
@@ -200,6 +205,10 @@ typedef struct SPEED_FEATURES {
   // partitioning.
   LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning;
 
+  // The threshold is to determine how slow the motino is, it is used when
+  // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION
+  MOTION_THRESHOLD lf_motion_threshold;
+
   // Determine which method we use to determine transform size. We can choose
   // between options like full rd, largest for prediction size, largest
   // for intra and model coefs for the rest.
@@ -322,8 +331,8 @@ typedef struct SPEED_FEATURES {
   int use_nonrd_pick_mode;
 
   // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV
-  // modes are disabled in order from LSB to MSB for each BLOCK_SIZE.
-  int disable_inter_mode_mask[BLOCK_SIZES];
+  // modes are used in order from LSB to MSB for each BLOCK_SIZE.
+  int inter_mode_mask[BLOCK_SIZES];
 
   // This feature controls whether we do the expensive context update and
   // calculation in the rd coefficient costing loop.
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index dd28496be..c25314b42 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -46,13 +46,12 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
     lrc->key_frame_rate_correction_factor = 1.0;
 
     if (svc->number_temporal_layers > 1) {
-      lc->target_bandwidth = oxcf->ts_target_bitrate[layer] * 1000;
+      lc->target_bandwidth = oxcf->ts_target_bitrate[layer];
       lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
     } else {
-      lc->target_bandwidth = oxcf->ss_target_bitrate[layer] * 1000;
-      lrc->last_q[0] = oxcf->best_allowed_q;
-      lrc->last_q[1] = oxcf->best_allowed_q;
-      lrc->last_q[2] = oxcf->best_allowed_q;
+      lc->target_bandwidth = oxcf->ss_target_bitrate[layer];
+      lrc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
+      lrc->last_q[INTER_FRAME] = oxcf->best_allowed_q;
     }
 
     lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level),
@@ -82,9 +81,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
     RATE_CONTROL *const lrc = &lc->rc;
 
     if (svc->number_temporal_layers > 1) {
-      lc->target_bandwidth = oxcf->ts_target_bitrate[layer] * 1000;
+      lc->target_bandwidth = oxcf->ts_target_bitrate[layer];
     } else {
-      lc->target_bandwidth = oxcf->ss_target_bitrate[layer] * 1000;
+      lc->target_bandwidth = oxcf->ss_target_bitrate[layer];
     }
     bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
     // Update buffer-related quantities.
@@ -132,8 +131,7 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) {
   } else {
     const double prev_layer_framerate =
         oxcf->framerate / oxcf->ts_rate_decimator[layer - 1];
-    const int prev_layer_target_bandwidth =
-        oxcf->ts_target_bitrate[layer - 1] * 1000;
+    const int prev_layer_target_bandwidth = oxcf->ts_target_bitrate[layer - 1];
     lc->avg_frame_size =
         (int)((lc->target_bandwidth - prev_layer_target_bandwidth) /
               (lc->framerate - prev_layer_framerate));
@@ -151,20 +149,7 @@ void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) {
                                    oxcf->two_pass_vbrmin_section / 100);
   lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth *
                                    oxcf->two_pass_vbrmax_section) / 100);
-  lrc->max_gf_interval = 16;
-
-  lrc->static_scene_max_gf_interval = cpi->oxcf.key_freq >> 1;
-
-  if (oxcf->play_alternate && oxcf->lag_in_frames) {
-    if (lrc->max_gf_interval > oxcf->lag_in_frames - 1)
-      lrc->max_gf_interval = oxcf->lag_in_frames - 1;
-
-    if (lrc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
-      lrc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
-  }
-
-  if (lrc->max_gf_interval > lrc->static_scene_max_gf_interval)
-    lrc->max_gf_interval = lrc->static_scene_max_gf_interval;
+  vp9_rc_set_gf_max_interval(oxcf, lrc);
 }
 
 void vp9_restore_layer_context(VP9_COMP *const cpi) {
diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c
index 91d8ea4dc..eb5ae2e41 100644
--- a/vp9/encoder/vp9_variance.c
+++ b/vp9/encoder/vp9_variance.c
@@ -156,6 +156,18 @@ unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
   return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
 }
 
+void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride,
+                       const uint8_t *ref_ptr, int ref_stride,
+                       unsigned int *sse, int *sum) {
+  variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
+}
+
+void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride,
+                     const uint8_t *ref_ptr, int ref_stride,
+                     unsigned int *sse, int *sum) {
+  variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
+}
+
 unsigned int vp9_mse16x16_c(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse) {
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index c47fe1335..4a194b72c 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -25,15 +25,13 @@ void variance(const uint8_t *a, int a_stride,
 typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
                                     int source_stride,
                                     const uint8_t *ref_ptr,
-                                    int ref_stride,
-                                    unsigned int max_sad);
+                                    int ref_stride);
 
 typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr,
                                         int source_stride,
                                         const uint8_t *ref_ptr,
                                         int ref_stride,
-                                        const uint8_t *second_pred,
-                                        unsigned int max_sad);
+                                        const uint8_t *second_pred);
 
 typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
                                    int source_stride,
diff --git a/vp9/encoder/x86/vp9_dct_ssse3.asm b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
index 8723a7114..8723a7114 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3.asm
+++ b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index 48ccef8cc..48ccef8cc 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
diff --git a/vp9/encoder/x86/vp9_ssim_opt.asm b/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm
index 455d10d2c..455d10d2c 100644
--- a/vp9/encoder/x86/vp9_ssim_opt.asm
+++ b/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm
diff --git a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
index 673e0b3a6..21aaa9383 100644
--- a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
+++ b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
@@ -43,9 +43,9 @@ sym(vp9_temporal_filter_apply_sse2):
     mov         [rsp + rbp_backup], rbp
     ; end prolog
 
-        mov         rdx,            arg(3)
+        mov         edx,            arg(3)
         mov         [rsp + block_width], rdx
-        mov         rdx,            arg(4)
+        mov         edx,            arg(4)
         mov         [rsp + block_height], rdx
         movd        xmm6,           arg(5)
         movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c
index 835c51957..7f81f46b8 100644
--- a/vp9/encoder/x86/vp9_variance_avx2.c
+++ b/vp9/encoder/x86/vp9_variance_avx2.c
@@ -10,7 +10,6 @@
 #include "./vpx_config.h"
 
 #include "vp9/encoder/vp9_variance.h"
-#include "vp9/common/vp9_pragmas.h"
 #include "vpx_ports/mem.h"
 
 typedef void (*get_var_avx2) (
diff --git a/vp9/encoder/x86/vp9_variance_mmx.c b/vp9/encoder/x86/vp9_variance_mmx.c
index c4d17fc0f..ae2f976af 100644
--- a/vp9/encoder/x86/vp9_variance_mmx.c
+++ b/vp9/encoder/x86/vp9_variance_mmx.c
@@ -10,7 +10,6 @@
 
 #include "./vpx_config.h"
 #include "vp9/encoder/vp9_variance.h"
-#include "vp9/common/vp9_pragmas.h"
 #include "vpx_ports/mem.h"
 
 extern unsigned int vp9_get8x8var_mmx
diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c
index 41f225922..e935a233a 100644
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -11,53 +11,29 @@
 #include "./vpx_config.h"
 
 #include "vp9/encoder/vp9_variance.h"
-#include "vp9/common/vp9_pragmas.h"
 #include "vpx_ports/mem.h"
 
-extern unsigned int vp9_get4x4var_mmx
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-
-unsigned int vp9_get16x16var_sse2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-unsigned int vp9_get8x8var_sse2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-
-typedef unsigned int (*get_var_sse2) (
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-
-static void variance_sse2(const unsigned char *src_ptr, int  source_stride,
-                        const unsigned char *ref_ptr, int  recon_stride,
-                        int  w, int  h, unsigned int *sse, int *sum,
-                        get_var_sse2 var_fn, int block_size) {
-  unsigned int sse0;
-  int sum0;
+typedef unsigned int (*variance_fn_t) (const unsigned char *src, int src_stride,
+                                       const unsigned char *ref, int ref_stride,
+                                       unsigned int *sse, int *sum);
+
+unsigned int vp9_get4x4var_mmx(const unsigned char *src, int src_stride,
+                               const unsigned char *ref, int ref_stride,
+                               unsigned int *sse, int *sum);
+
+
+unsigned int vp9_get8x8var_sse2(const unsigned char *src, int src_stride,
+                                const unsigned char *ref, int ref_stride,
+                                unsigned int *sse, int *sum);
+
+unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride,
+                                  const unsigned char *ref, int ref_stride,
+                                  unsigned int *sse, int *sum);
+
+static void variance_sse2(const unsigned char *src, int src_stride,
+                          const unsigned char *ref, int ref_stride,
+                          int w, int h, unsigned int *sse, int *sum,
+                          variance_fn_t var_fn, int block_size) {
   int i, j;
 
   *sse = 0;
@@ -65,217 +41,139 @@ static void variance_sse2(const unsigned char *src_ptr, int  source_stride,
 
   for (i = 0; i < h; i += block_size) {
     for (j = 0; j < w; j += block_size) {
-      var_fn(src_ptr + source_stride * i + j, source_stride,
-             ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
       *sse += sse0;
       *sum += sum0;
     }
   }
 }
 
-unsigned int vp9_variance4x4_sse2(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4,
-                  &var, &avg, vp9_get4x4var_mmx, 4);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 4));
+unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride,
+                                  const unsigned char *ref, int ref_stride,
+                                  unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 4, 4,
+                sse, &sum, vp9_get4x4var_mmx, 4);
+  return *sse - (((unsigned int)sum * sum) >> 4);
 }
 
-unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr,
-                                  int  source_stride,
-                                  const uint8_t *ref_ptr,
-                                  int  recon_stride,
+unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride,
+                                  const uint8_t *ref, int ref_stride,
                                   unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4,
-                  &var, &avg, vp9_get4x4var_mmx, 4);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 5));
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
+                sse, &sum, vp9_get4x4var_mmx, 4);
+  return *sse - (((unsigned int)sum * sum) >> 5);
 }
 
-unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr,
-                                  int  source_stride,
-                                  const uint8_t *ref_ptr,
-                                  int  recon_stride,
+unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride,
+                                  const uint8_t *ref, int ref_stride,
                                   unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8,
-                  &var, &avg, vp9_get4x4var_mmx, 4);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 5));
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
+                sse, &sum, vp9_get4x4var_mmx, 4);
+  return *sse - (((unsigned int)sum * sum) >> 5);
 }
 
-unsigned int vp9_variance8x8_sse2
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8,
-                  &var, &avg, vp9_get8x8var_sse2, 8);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 6));
+unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride,
+                                  const unsigned char *ref, int ref_stride,
+                                  unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+                sse, &sum, vp9_get8x8var_sse2, 8);
+  return *sse - (((unsigned int)sum * sum) >> 6);
 }
 
-unsigned int vp9_variance16x8_sse2
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8,
-                  &var, &avg, vp9_get8x8var_sse2, 8);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 7));
+unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride,
+                                   const unsigned char *ref, int ref_stride,
+                                   unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
+                sse, &sum, vp9_get8x8var_sse2, 8);
+  return *sse - (((unsigned int)sum * sum) >> 7);
 }
 
-unsigned int vp9_variance8x16_sse2
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16,
-                &var, &avg, vp9_get8x8var_sse2, 8);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 7));
+unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride,
+                                   const unsigned char *ref, int ref_stride,
+                                   unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
+                sse, &sum, vp9_get8x8var_sse2, 8);
+  return *sse - (((unsigned int)sum * sum) >> 7);
 }
 
-unsigned int vp9_variance16x16_sse2
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
-                &var, &avg, vp9_get16x16var_sse2, 16);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 8));
+unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
+                                    const unsigned char *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+                sse, &sum, vp9_get16x16var_sse2, 16);
+  return *sse - (((unsigned int)sum * sum) >> 8);
 }
 
-unsigned int vp9_mse16x16_sse2(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0;
-  int sum0;
-  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
-                       &sum0);
-  *sse = sse0;
-  return sse0;
+unsigned int vp9_mse16x16_sse2(const unsigned char *src, int src_stride,
+                               const unsigned char *ref, int ref_stride,
+                               unsigned int *sse) {
+  int sum;
+  vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse;
 }
 
-unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr,
-                                    int  source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int  recon_stride,
+unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
-                &var, &avg, vp9_get16x16var_sse2, 16);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 10));
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
+                sse, &sum, vp9_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 10);
 }
 
-unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr,
-                                    int  source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int  recon_stride,
+unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
-                &var, &avg, vp9_get16x16var_sse2, 16);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 9));
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
+                sse, &sum, vp9_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 9);
 }
 
-unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr,
-                                    int  source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int  recon_stride,
+unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32,
-                &var, &avg, vp9_get16x16var_sse2, 16);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 9));
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
+                sse, &sum, vp9_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 9);
 }
 
-unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr,
-                                    int  source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int  recon_stride,
+unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
-                &var, &avg, vp9_get16x16var_sse2, 16);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 12));
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
+                sse, &sum, vp9_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 12);
 }
 
-unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr,
-                                    int  source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int  recon_stride,
+unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
-                &var, &avg, vp9_get16x16var_sse2, 16);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 11));
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
+                sse, &sum, vp9_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 11);
 }
 
-unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
-                                    int  source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int  recon_stride,
+unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64,
-                &var, &avg, vp9_get16x16var_sse2, 16);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 11));
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
+                sse, &sum, vp9_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 11);
 }
 
 #define DECL(w, opt) \