18 files changed, 768 insertions, 1394 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index c6af7f6da..21e2b16a4 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -115,10 +115,6 @@ static INLINE int mi_width_log2(BLOCK_SIZE sb_type) {
   return mi_width_log2_lookup[sb_type];
 }
 
-static INLINE int mi_height_log2(BLOCK_SIZE sb_type) {
-  return mi_height_log2_lookup[sb_type];
-}
-
 // This structure now relates to 8x8 block regions.
 typedef struct {
   MB_PREDICTION_MODE mode, uv_mode;
diff --git a/vp9/common/vp9_common_data.c b/vp9/common/vp9_common_data.c
index 886c0afc6..a927823e0 100644
--- a/vp9/common/vp9_common_data.c
+++ b/vp9/common/vp9_common_data.c
@@ -26,8 +26,6 @@ const int mi_width_log2_lookup[BLOCK_SIZES] =
   {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
 const int num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
   {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8};
-const int mi_height_log2_lookup[BLOCK_SIZES] =
-  {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3};
 const int num_8x8_blocks_high_lookup[BLOCK_SIZES] =
   {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};
 
diff --git a/vp9/common/vp9_common_data.h b/vp9/common/vp9_common_data.h
index a367c65c6..5222d29c1 100644
--- a/vp9/common/vp9_common_data.h
+++ b/vp9/common/vp9_common_data.h
@@ -16,7 +16,6 @@
 extern const int b_width_log2_lookup[BLOCK_SIZES];
 extern const int b_height_log2_lookup[BLOCK_SIZES];
 extern const int mi_width_log2_lookup[BLOCK_SIZES];
-extern const int mi_height_log2_lookup[BLOCK_SIZES];
 extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZES];
 extern const int num_8x8_blocks_high_lookup[BLOCK_SIZES];
 extern const int num_4x4_blocks_high_lookup[BLOCK_SIZES];
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index b5ed959e1..f6fe4d3f1 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -346,7 +346,7 @@ static INLINE int partition_plane_context(
   const int bs = 1 << bsl;
   int above = 0, left = 0, i;
 
-  assert(mi_width_log2(bsize) == mi_height_log2(bsize));
+  assert(b_width_log2(bsize) == b_height_log2(bsize));
   assert(bsl >= 0);
 
   for (i = 0; i < bs; i++) {
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 9f379399c..3cc16d94e 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -58,34 +58,34 @@ static void setup_pred_plane(struct buf_2d *dst,
 static void setup_dst_planes(MACROBLOCKD *xd,
                              const YV12_BUFFER_CONFIG *src,
                              int mi_row, int mi_col) {
-  uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
-                         src->alpha_buffer};
-  int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
-                    src->alpha_stride};
+  uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                               src->alpha_buffer};  // one pointer per plane
+  const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                          src->alpha_stride};  // U and V share uv_stride
   int i;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
-    struct macroblockd_plane *pd = &xd->plane[i];
+    struct macroblockd_plane *const pd = &xd->plane[i];  // dst set up below
     setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL,
                      pd->subsampling_x, pd->subsampling_y);
   }
 }
 
-static void setup_pre_planes(MACROBLOCKD *xd, int i,
+static void setup_pre_planes(MACROBLOCKD *xd, int idx,  // idx -> pd->pre[idx]
                              const YV12_BUFFER_CONFIG *src,
                              int mi_row, int mi_col,
                              const struct scale_factors *sf) {
-  if (src) {
-    int j;
-    uint8_t* buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
-                           src->alpha_buffer};
-    int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
-                      src->alpha_stride};
+  if (src != NULL) {  // nothing is done when src is NULL
+    int i;  // plane index
+    uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                                 src->alpha_buffer};  // one pointer per plane
+    const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                            src->alpha_stride};  // U and V share uv_stride
 
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      struct macroblockd_plane *pd = &xd->plane[j];
-      setup_pred_plane(&pd->pre[i], buffers[j], strides[j],
-                     mi_row, mi_col, sf, pd->subsampling_x, pd->subsampling_y);
+    for (i = 0; i < MAX_MB_PLANE; ++i) {  // needs MAX_MB_PLANE <= 4
+      struct macroblockd_plane *const pd = &xd->plane[i];
+      setup_pred_plane(&pd->pre[idx], buffers[i], strides[i], mi_row, mi_col,
+                       sf, pd->subsampling_x, pd->subsampling_y);
     }
   }
 }
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index f4f758297..f95423678 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -23,105 +23,20 @@ typedef void filter8_1dfunction (
   const short *filter
 );
 
-#if (HAVE_SSSE3)
+#if HAVE_SSSE3
+filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
-#if (ARCH_X86_64)
-filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  /* Ensure the filter can be compressed to int16_t. */
-  if (x_step_q4 == 16 && filter_x[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_h8_intrin_ssse3(src, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_x);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_h8_intrin_ssse3(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_h8_intrin_ssse3(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
-                          filter_x, x_step_q4, filter_y, y_step_q4,
-                          w, h);
-  }
-}
 
-void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h) {
-  if (y_step_q4 == 16 && filter_y[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_v8_intrin_ssse3(src - src_stride * 3, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_y);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_v8_intrin_ssse3(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_v8_intrin_ssse3(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
-                         filter_x, x_step_q4, filter_y, y_step_q4,
-                         w, h);
-  }
-}
-
-#else
-filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
@@ -198,7 +113,6 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                          w, h);
   }
 }
-#endif
 
 void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 501bed5a8..2f6149464 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -380,17 +380,13 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
     out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
   }
 
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \
+#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) /* out0/out1 unused */ \
   {                                                     \
     const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
     const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
-    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
-    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
                                                         \
     in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
     in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
-    in2 = _mm_unpacklo_epi32(tr0_2, tr0_3);  /* i5 i4 */  \
-    in3 = _mm_unpackhi_epi32(tr0_2, tr0_3);  /* i7 i6 */  \
   }
 
 #define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
@@ -662,7 +658,6 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
 }
 
 static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
-  const __m128i zero = _mm_setzero_si128();
   const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
   const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
   const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
@@ -677,7 +672,6 @@ static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
   out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
   out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
   out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
-  out[4] = out[5] = out[6] = out[7] = zero;
 }
 
 static void idct8_1d_sse2(__m128i *in) {
@@ -1270,6 +1264,114 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
                            stp2_10, stp2_13, stp2_11, stp2_12) \
   }
 
+#define IDCT16_10_1D /* reads only in[0..3]; relies on caller locals */ \
+    /* Stage2: in[1] and in[3], each interleaved with zero */ \
+    { \
+      const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
+      const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
+      const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
+      const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
+      \
+      MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
+                             stg2_0, stg2_1, stg2_6, stg2_7, \
+                             stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
+    } \
+      \
+    /* Stage3: in[2] with zero; stp1_9/10/13/14 copy Stage2 results */ \
+    { \
+      const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
+      const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
+      \
+      MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
+                               stg3_0, stg3_1,  \
+                               stp2_4, stp2_7) \
+      \
+      stp1_9  =  stp1_8_0; \
+      stp1_10 =  stp1_11;  \
+      \
+      stp1_13 = stp1_12_0; \
+      stp1_14 = stp1_15;   \
+    } \
+    \
+    /* Stage4: in[0] with zero; stp2_5/stp2_6 copy stp2_4/stp2_7 */ \
+    { \
+      const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
+      const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
+      \
+      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+      \
+      MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
+                               stg4_0, stg4_1, \
+                               stp1_0, stp1_1) \
+      stp2_5 = stp2_4; \
+      stp2_6 = stp2_7; \
+      \
+      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+                             stg4_4, stg4_5, stg4_6, stg4_7, \
+                             stp2_9, stp2_14, stp2_10, stp2_13) \
+    } \
+      \
+    /* Stage5: stp1_5/stp1_6 via madd, round, shift; add/sub 8..15 */ \
+    { \
+      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+      \
+      stp1_2 = stp1_1; \
+      stp1_3 = stp1_0; \
+      \
+      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+      \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      \
+      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+      \
+      stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
+      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
+      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
+      stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+      \
+      stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
+      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
+      stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+    } \
+      \
+    /* Stage6: add/sub pairs for stp2_0..7; rotate 10/13 and 11/12 */ \
+    { \
+      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+      \
+      stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+      stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+      stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+      stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+      stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+      stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+      \
+      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                             stg6_0, stg4_0, stg6_0, stg4_0, \
+                             stp2_10, stp2_13, stp2_11, stp2_12) \
+    }
+
 void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
                                 int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -2433,22 +2535,14 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
 
   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
 
   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
 
   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
@@ -2456,119 +2550,72 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
 
   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   __m128i in[16], l[16];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
           stp1_8_0, stp1_12_0;
   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   int i;
-  in[4] = in[5] = in[6] = in[7] = in[12] = in[13] = in[14] = in[15] = zero;
-  // 1-D idct. Load input data.
+  // First 1-D inverse DCT
+  // Load input data.
   in[0] = _mm_load_si128((const __m128i *)input);
-  in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
   in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
-  in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
   in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
-  in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
   in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
-  in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
 
-  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1], in[2], in[3]);
-  TRANSPOSE_8X4(in[8], in[9], in[10], in[11], in[8], in[9], in[10], in[11]);
+  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
 
   // Stage2
   {
-    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], in[11]);
-    const __m128i lo_9_7 = _mm_unpackhi_epi16(in[8], in[3]);
-    const __m128i lo_5_11 = _mm_unpackhi_epi16(in[2], in[9]);
-    const __m128i lo_13_3 = _mm_unpackhi_epi16(in[10], in[1]);
+    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
+    const __m128i lo_13_3 =  _mm_unpackhi_epi16(zero, in[1]);
 
     tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
     tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
-    tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);
-    tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);
-    tmp1 = _mm_madd_epi16(lo_5_11, stg2_4);
-    tmp3 = _mm_madd_epi16(lo_5_11, stg2_5);
     tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
     tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
 
     tmp0 = _mm_add_epi32(tmp0, rounding);
     tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
     tmp5 = _mm_add_epi32(tmp5, rounding);
     tmp7 = _mm_add_epi32(tmp7, rounding);
 
     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
     tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
     tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
 
-    stp2_8 = _mm_packs_epi32(tmp0, zero);
-    stp2_15 = _mm_packs_epi32(tmp2, zero);
-    stp2_9 = _mm_packs_epi32(tmp4, zero);
-    stp2_14 = _mm_packs_epi32(tmp6, zero);
-
-    stp2_10 = _mm_packs_epi32(tmp1, zero);
-    stp2_13 = _mm_packs_epi32(tmp3, zero);
-    stp2_11 = _mm_packs_epi32(tmp5, zero);
-    stp2_12 = _mm_packs_epi32(tmp7, zero);
+    stp2_8  = _mm_packs_epi32(tmp0, tmp2);
+    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
   }
 
   // Stage3
   {
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], in[11]);
-    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[9], in[3]);
+    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
 
     tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
     tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
-    tmp4 = _mm_madd_epi16(lo_10_6, stg3_2);
-    tmp6 = _mm_madd_epi16(lo_10_6, stg3_3);
 
     tmp0 = _mm_add_epi32(tmp0, rounding);
     tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-
     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_4 = _mm_packs_epi32(tmp0, zero);
-    stp1_7 = _mm_packs_epi32(tmp2, zero);
-    stp1_5 = _mm_packs_epi32(tmp4, zero);
-    stp1_6 = _mm_packs_epi32(tmp6, zero);
 
-    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);
-    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
-    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
-    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
+    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
+    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
 
-    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);
-    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
-    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
+    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
   }
 
   // Stage4
   {
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);
-    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[2], in[10]);
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
 
     tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
     tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
-    tmp4 = _mm_madd_epi16(lo_4_12, stg4_2);
-    tmp6 = _mm_madd_epi16(lo_4_12, stg4_3);
     tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
     tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
     tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
@@ -2576,8 +2623,6 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
 
     tmp0 = _mm_add_epi32(tmp0, rounding);
     tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
     tmp1 = _mm_add_epi32(tmp1, rounding);
     tmp3 = _mm_add_epi32(tmp3, rounding);
     tmp5 = _mm_add_epi32(tmp5, rounding);
@@ -2585,49 +2630,40 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
 
     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
     tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
     tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
 
-    stp2_0 = _mm_packs_epi32(tmp0, zero);
-    stp2_1 = _mm_packs_epi32(tmp2, zero);
-    stp2_2 = _mm_packs_epi32(tmp4, zero);
-    stp2_3 = _mm_packs_epi32(tmp6, zero);
-    stp2_9 = _mm_packs_epi32(tmp1, zero);
-    stp2_14 = _mm_packs_epi32(tmp3, zero);
-    stp2_10 = _mm_packs_epi32(tmp5, zero);
-    stp2_13 = _mm_packs_epi32(tmp7, zero);
-
-    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
-    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
-    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
-    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
+    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
+    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
+    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
+    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
+
+    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
   }
 
   // Stage5 and Stage6
   {
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
-    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
-    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
-    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
-    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);
-
-    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
-    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
+    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
+    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
+    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
+    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
+
+    stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
+    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
+    stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
+    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
+
+    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
+    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
+    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
+    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
   }
 
   // Stage6
   {
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
+    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
 
@@ -2652,21 +2688,26 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
 
-    stp1_5 = _mm_packs_epi32(tmp1, zero);
-    stp1_6 = _mm_packs_epi32(tmp3, zero);
+    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
+
     stp2_10 = _mm_packs_epi32(tmp0, zero);
     stp2_13 = _mm_packs_epi32(tmp2, zero);
     stp2_11 = _mm_packs_epi32(tmp4, zero);
     stp2_12 = _mm_packs_epi32(tmp6, zero);
 
-    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
-    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
-    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
-    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
+    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
+    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
+    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
+    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
+
+    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
+    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
+    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
+    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
+    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
+    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
+    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
+    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
   }
 
   // Stage7. Left 8x16 only.
@@ -2687,12 +2728,11 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
   l[14] = _mm_sub_epi16(stp2_1, stp1_14);
   l[15] = _mm_sub_epi16(stp2_0, stp1_15);
 
-  // 2-D idct. We do 2 8x16 blocks.
+  // Second 1-D inverse transform, performed per 8x16 block
   for (i = 0; i < 2; i++) {
     array_transpose_4X8(l + 8*i, in);
-    in[8] = in[9] = in[10] = in[11] = in[12] = in[13] = in[14] = in[15] = zero;
 
-    IDCT16_1D
+    IDCT16_10_1D
 
     // Stage7
     in[0] = _mm_add_epi16(stp2_0, stp1_15);
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
deleted file mode 100644
index 303fced3b..000000000
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
+++ /dev/null
@@ -1,591 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <tmmintrin.h>
-#include "vpx_ports/mem.h"
-#include "vpx_ports/emmintrin_compat.h"
-
-
-// filters only for the 4_h8 convolution
-DECLARE_ALIGNED(16, const unsigned char,
-filt1_4_h8[16])= {0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt2_4_h8[16])= {4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10};
-
-// filters for 8_h8 and 16_h8
-DECLARE_ALIGNED(16, const unsigned char,
-filt1_global[16])= {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt2_global[16])= {2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt3_global[16])= {4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt4_global[16])= {6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14};
-
-
-
-void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pixels_per_line,
-                                         unsigned char *output_ptr,
-                                         unsigned int output_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-    __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-    __m128i addFilterReg64, filtersReg, srcReg, minReg;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits in the filter into the first lane
-    firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
-    // duplicate only the third 16 bit in the filter into the first lane
-    secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
-    // duplicate only the seconds 16 bits in the filter into the second lane
-    firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
-    // duplicate only the forth 16 bits in the filter into the second lane
-    secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
-    // loading the local filters
-    thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8);
-    forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8);
-
-    for (i = 0; i < output_height; i++) {
-        srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
-        // filter the source buffer
-        srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-        // extract the higher half of the lane
-        srcRegFilt3 =  _mm_srli_si128(srcRegFilt1, 8);
-        srcRegFilt4 =  _mm_srli_si128(srcRegFilt2, 8);
-
-        minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
-
-        // add and saturate all the results together
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
-        srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bits
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-        src_ptr+=src_pixels_per_line;
-
-        // save only 4 bytes
-        *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
-
-        output_ptr+=output_pitch;
-    }
-}
-
-
-void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pixels_per_line,
-                                         unsigned char *output_ptr,
-                                         unsigned int output_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
-    __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-    __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-    __m128i addFilterReg64, filtersReg, minReg;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits (first and second byte)
-    // across 128 bit register
-    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-    // duplicate only the second 16 bits (third and forth byte)
-    // across 128 bit register
-    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-    // duplicate only the third 16 bits (fifth and sixth byte)
-    // across 128 bit register
-    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-    // duplicate only the forth 16 bits (seventh and eighth byte)
-    // across 128 bit register
-    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-    filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
-    filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
-    filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
-    filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
-    for (i = 0; i < output_height; i++) {
-        srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
-        // filter the source buffer
-        srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-        // filter the source buffer
-        srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
-        srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
-        srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
-
-        // add and saturate all the results together
-        minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-
-        srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bits
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-        src_ptr+=src_pixels_per_line;
-
-       // save only 8 bytes
-        _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
-
-        output_ptr+=output_pitch;
-    }
-}
-
-void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
-                                          unsigned int src_pixels_per_line,
-                                          unsigned char *output_ptr,
-                                          unsigned int output_pitch,
-                                          unsigned int output_height,
-                                          int16_t *filter) {
-    __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
-    __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-    __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits (first and second byte)
-    // across 128 bit register
-    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-    // duplicate only the second 16 bits (third and forth byte)
-    // across 128 bit register
-    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-    // duplicate only the third 16 bits (fifth and sixth byte)
-    // across 128 bit register
-    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-    // duplicate only the forth 16 bits (seventh and eighth byte)
-    // across 128 bit register
-    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-    filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
-    filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
-    filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
-    filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
-    for (i = 0; i < output_height; i++) {
-        srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
-        // filter the source buffer
-        srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-        // add and saturate the results together
-        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
-        // filter the source buffer
-        srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-
-        // add and saturate the results together
-        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
-        _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-
-        // reading the next 16 bytes.
-        // (part of it was being read by earlier read)
-        srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
-
-        // add and saturate the results together
-        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
-        _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
-        // filter the source buffer
-        srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-        // add and saturate the results together
-        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
-
-        // filter the source buffer
-        srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-
-        // add and saturate the results together
-        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
-        _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
-        _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
-        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
-
-        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
-
-        // shift by 7 bit each 16 bit
-        srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
-        srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
-
-        // shrink to 8 bit each 16 bits, the first lane contain the first
-        // convolve result and the second lane contain the second convolve
-        // result
-        srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
-
-        src_ptr+=src_pixels_per_line;
-
-        // save 16 bytes
-        _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
-
-        output_ptr+=output_pitch;
-    }
-}
-
-
-
-void vp9_filter_block1d4_v8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int out_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-    __m128i addFilterReg64, filtersReg, firstFilters, secondFilters;
-    __m128i minReg, srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits in the filter into the first lane
-    firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
-    // duplicate only the second 16 bits in the filter into the second lane
-    firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
-    // duplicate only the third 16 bits in the filter into the first lane
-    secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
-    // duplicate only the forth 16 bits in the filter into the second lane
-    secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
-    for (i = 0; i < output_height; i++) {
-        // load the first 4 byte
-        srcRegFilt1 = _mm_cvtsi32_si128(*((int*)&src_ptr[0]));
-        // load the next 4 bytes in stride of src_pitch
-        srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch)[0]));
-
-        // merge the result together
-        srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-
-
-        srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*2)[0]));
-        srcRegFilt3 =  _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*3)[0]));
-
-        // merge the result together
-        srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-
-        srcRegFilt3 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*4)[0]));
-        srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*5)[0]));
-
-        // merge the result together
-        srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-        srcRegFilt1 = _mm_unpacklo_epi64(srcRegFilt1, srcRegFilt2);
-
-        srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*6)[0]));
-        srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*7)[0]));
-
-        // merge the result together
-        srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt4, srcRegFilt2);
-        srcRegFilt3 = _mm_unpacklo_epi64(srcRegFilt3, srcRegFilt4);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-
-        // extract the second lane of the 128 bit register
-        srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);
-
-        // add and saturate the results together
-        minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-        _mm_srli_si128(srcRegFilt3, 8));
-        srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bit
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-        src_ptr+=src_pitch;
-
-        // save only 4 bytes convolve result
-        *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
-
-        output_ptr+=out_pitch;
-    }
-}
-
-void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int out_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-    __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-    __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits in the filter
-    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-    // duplicate only the second 16 bits in the filter
-    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-    // duplicate only the third 16 bits in the filter
-    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-    // duplicate only the forth 16 bits in the filter
-    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-    for (i = 0; i < output_height; i++) {
-        // load the first 8 bytes
-        srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
-        // load the next 8 bytes in stride of src_pitch
-        srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
-        srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
-        srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
-
-        // merge the result together
-        srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-        srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-
-        // load the next 8 bytes in stride of src_pitch
-        srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
-        srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
-        srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
-        srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
-
-        // merge the result together
-        srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
-        srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-        srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
-
-        // add and saturate the results together
-        minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
-        srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bit
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-        src_ptr+=src_pitch;
-
-        // save only 8 bytes convolve result
-        _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
-
-        output_ptr+=out_pitch;
-    }
-}
-
-
-void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
-                                          unsigned int src_pitch,
-                                          unsigned char *output_ptr,
-                                          unsigned int out_pitch,
-                                          unsigned int output_height,
-                                          int16_t *filter) {
-    __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-    __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits in the filter
-    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-    // duplicate only the second 16 bits in the filter
-    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-    // duplicate only the third 16 bits in the filter
-    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-    // duplicate only the forth 16 bits in the filter
-    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-
-    for (i = 0; i < output_height; i++) {
-        // load the first 16 bytes
-        srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
-        // load the next 16 bytes in stride of src_pitch
-        srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
-        srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
-        srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
-
-        // merge the result together
-        srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-        srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-        srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
-        srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
-        srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
-
-
-        // add and saturate the results together
-        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-
-        // load the next 16 bytes in stride of two/three src_pitch
-        srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
-        srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
-
-        // merge the result together
-        srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-        srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
-        srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
-
-        // load the next 16 bytes in stride of four/five src_pitch
-        srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
-        srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
-
-        // merge the result together
-        srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-        srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
-        srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
-
-
-        // add and saturate the results together
-        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-        _mm_min_epi16(srcRegFilt4, srcRegFilt7));
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-        _mm_min_epi16(srcRegFilt6, srcRegFilt8));
-
-
-        // add and saturate the results together
-        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-        _mm_max_epi16(srcRegFilt4, srcRegFilt7));
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-        _mm_max_epi16(srcRegFilt6, srcRegFilt8));
-        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bit
-        srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits, the first lane contain the first
-        // convolve result and the second lane contain the second convolve
-        // result
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
-
-        src_ptr+=src_pitch;
-
-        // save 16 bytes convolve result
-        _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
-
-        output_ptr+=out_pitch;
-    }
-}
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index d17952487..6894f553f 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1923,9 +1923,6 @@ static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
 
     vp9_zero(cpi->mb.pred_mv);
 
-    if (cpi->sf.reference_masking)
-      rd_pick_reference_frame(cpi, tile, mi_row, mi_col);
-
     if (cpi->sf.use_lastframe_partitioning ||
         cpi->sf.use_one_partition_size_always ) {
       const int idx_str = cm->mode_info_stride * mi_row + mi_col;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 5317661c0..812ffa96d 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -106,6 +106,7 @@ static int lookup_next_frame_stats(const struct twopass_rc *p,
   return 1;
 }
 
+
 // Read frame stats at an offset from the current position
 static int read_frame_stats(const struct twopass_rc *p,
                             FIRSTPASS_STATS *frame_stats, int offset) {
@@ -149,7 +150,7 @@ static void output_stats(const VP9_COMP            *cpi,
     FILE *fpfile;
     fpfile = fopen("firstpass.stt", "a");
 
-    fprintf(stdout, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
+    fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
             "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
             "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
             stats->frame,
@@ -349,17 +350,14 @@ static double simple_weight(YV12_BUFFER_CONFIG *source) {
 }
 
 
-// This function returns the current per frame maximum bitrate target.
+// This function returns the maximum target rate per frame.
 static int frame_max_bits(VP9_COMP *cpi) {
-  // Max allocation for a single frame based on the max section guidelines
-  // passed in and how many bits are left.
-  // For VBR base this on the bits and frames left plus the
-  // two_pass_vbrmax_section rate passed in by the user.
-  const double max_bits = (1.0 * cpi->twopass.bits_left /
-      (cpi->twopass.total_stats.count - cpi->common.current_video_frame)) *
-      (cpi->oxcf.two_pass_vbrmax_section / 100.0);
+  int64_t max_bits =
+     ((int64_t)cpi->rc.av_per_frame_bandwidth *
+      (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
+
   if (max_bits < 0)
-      return 0;
+    return 0;
   if (max_bits >= INT_MAX)
     return INT_MAX;
   return (int)max_bits;
@@ -1662,7 +1660,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
   // Don't allow a gf too near the next kf
   if ((cpi->rc.frames_to_key - i) < MIN_GF_INTERVAL) {
-    while (i < cpi->rc.frames_to_key) {
+    while (i < (cpi->rc.frames_to_key + !cpi->rc.next_key_frame_forced)) {
       i++;
 
       if (EOF == input_stats(&cpi->twopass, this_frame))
@@ -1697,6 +1695,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   if (allow_alt_ref &&
       (i < cpi->oxcf.lag_in_frames) &&
       (i >= MIN_GF_INTERVAL) &&
+      // For real scene cuts (not forced kfs) don't allow arf very near kf.
+      (cpi->rc.next_key_frame_forced ||
+        (i <= (cpi->rc.frames_to_key - MIN_GF_INTERVAL))) &&
       ((next_frame.pcnt_inter > 0.75) ||
        (next_frame.pcnt_second_ref > 0.5)) &&
       ((mv_in_out_accumulator / (double)i > -0.2) ||
@@ -1765,18 +1766,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 #endif
 #endif
 
-  // Now decide how many bits should be allocated to the GF group as  a
-  // proportion of those remaining in the kf group.
-  // The final key frame group in the clip is treated as a special case
-  // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
-  // This is also important for short clips where there may only be one
-  // key frame.
-  if (cpi->rc.frames_to_key >= (int)(cpi->twopass.total_stats.count -
-                                          cpi->common.current_video_frame)) {
-    cpi->twopass.kf_group_bits =
-      (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
-  }
-
   // Calculate the bits to be allocated to the group as a whole
   if ((cpi->twopass.kf_group_bits > 0) &&
       (cpi->twopass.kf_group_error_left > 0)) {
@@ -1863,9 +1852,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     if (gf_bits < 0)
       gf_bits = 0;
 
-    // Add in minimum for a frame
-    gf_bits += cpi->rc.min_frame_bandwidth;
-
     if (i == 0) {
       cpi->twopass.gf_bits = gf_bits;
     }
@@ -1899,8 +1885,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       cpi->twopass.gf_group_error_left = (int64_t)gf_group_err;
     }
 
-    cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits
-        - cpi->rc.min_frame_bandwidth;
+    cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits;
 
     if (cpi->twopass.gf_group_bits < 0)
       cpi->twopass.gf_group_bits = 0;
@@ -1985,9 +1970,6 @@ static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   if (cpi->twopass.gf_group_bits < 0)
     cpi->twopass.gf_group_bits = 0;
 
-  // Add in the minimum number of bits that is set aside for every frame.
-  target_frame_size += cpi->rc.min_frame_bandwidth;
-
   // Per frame bit target for this frame.
   cpi->rc.per_frame_bandwidth = target_frame_size;
 }
@@ -2029,6 +2011,22 @@ void vp9_get_one_pass_params(VP9_COMP *cpi) {
   }
 }
 
+void vp9_get_one_pass_cbr_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  if ((cm->current_video_frame == 0 ||
+      cm->frame_flags & FRAMEFLAGS_KEY ||
+      cpi->rc.frames_to_key == 0 ||
+      (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) {
+    cm->frame_type = KEY_FRAME;
+    cpi->rc.frames_to_key = cpi->key_frame_frequency;
+  } else {
+    cm->frame_type = INTER_FRAME;
+  }
+  // Don't use gf_update by default in CBR mode.
+  cpi->rc.frames_till_gf_update_due = INT_MAX;
+  cpi->rc.baseline_gf_interval = INT_MAX;
+}
+
 void vp9_get_first_pass_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   if (!cpi->refresh_alt_ref_frame &&
@@ -2038,8 +2036,8 @@ void vp9_get_first_pass_params(VP9_COMP *cpi) {
   } else {
     cm->frame_type = INTER_FRAME;
   }
-  cpi->rc.frames_to_key = INT_MAX;
   // Do not use periodic key frames
+  cpi->rc.frames_to_key = INT_MAX;
 }
 
 void vp9_get_second_pass_params(VP9_COMP *cpi) {
@@ -2265,8 +2263,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   vp9_zero(next_frame);
 
   vp9_clear_system_state();  // __asm emms;
-  start_position = cpi->twopass.stats_in;
 
+  start_position = cpi->twopass.stats_in;
   cpi->common.frame_type = KEY_FRAME;
 
   // is this a forced key frame by interval
@@ -2348,7 +2346,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // interval is between 1x and 2x
   if (cpi->oxcf.auto_key
       && cpi->rc.frames_to_key > (int)cpi->key_frame_frequency) {
-    FIRSTPASS_STATS *current_pos = cpi->twopass.stats_in;
     FIRSTPASS_STATS tmp_frame;
 
     cpi->rc.frames_to_key /= 2;
@@ -2373,15 +2370,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       // Load a the next frame's stats
       input_stats(&cpi->twopass, &tmp_frame);
     }
-
-    // Reset to the start of the group
-    reset_fpf_position(&cpi->twopass, current_pos);
-
+    cpi->rc.next_key_frame_forced = 1;
+  } else if (cpi->twopass.stats_in == cpi->twopass.stats_in_end) {
     cpi->rc.next_key_frame_forced = 1;
   } else {
     cpi->rc.next_key_frame_forced = 0;
   }
-  // Special case for the last frame of the file
+
+  // Special case for the last key frame of the file
   if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) {
     // Accumulate kf group error
     kf_group_err += calculate_modified_err(cpi, this_frame);
@@ -2566,8 +2562,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     }
 
     cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
-    // Add in the minimum frame allowance
-    cpi->twopass.kf_bits += cpi->rc.min_frame_bandwidth;
 
     // Peer frame bit target for this frame
     cpi->rc.per_frame_bandwidth = cpi->twopass.kf_bits;
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 43703c2c5..f89e4cb1c 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -22,6 +22,7 @@ void vp9_end_second_pass(VP9_COMP *cpi);
 
 void vp9_get_first_pass_params(VP9_COMP *cpi);
 void vp9_get_one_pass_params(VP9_COMP *cpi);
+void vp9_get_one_pass_cbr_params(VP9_COMP *cpi);
 void vp9_get_svc_params(VP9_COMP *cpi);
 
 #endif  // VP9_ENCODER_VP9_FIRSTPASS_H_
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 946ed2818..8e60bc96d 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -581,22 +581,21 @@ static void set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi, int mode) {
     sf->thresh_mult_sub8x8[THR_COMP_GA] = INT_MAX;
 }
 
-static void set_rt_speed_feature(VP9_COMMON *cm,
-                                 SPEED_FEATURES *sf,
-                                 int speed) {
-  sf->static_segmentation = 0;
+static void set_good_speed_feature(VP9_COMMON *cm,
+                                   SPEED_FEATURES *sf,
+                                   int speed) {
+  int i;
   sf->adaptive_rd_thresh = 1;
   sf->recode_loop = (speed < 1);
-  if (speed >= 1) {
+  if (speed == 1) {
     sf->use_square_partition_only = !frame_is_intra_only(cm);
-    sf->less_rectangular_check = 1;
-    sf->tx_size_search_method =
-        frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
+    sf->less_rectangular_check  = 1;
+    sf->tx_size_search_method = frame_is_intra_only(cm)
+      ? USE_FULL_RD : USE_LARGESTALL;
 
     if (MIN(cm->width, cm->height) >= 720)
       sf->disable_split_mask = cm->show_frame ?
-      DISABLE_ALL_SPLIT :
-                                                DISABLE_ALL_INTER_SPLIT;
+        DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
     else
       sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
 
@@ -610,26 +609,26 @@ static void set_rt_speed_feature(VP9_COMMON *cm,
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
     sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
   }
-  if (speed >= 2) {
+  if (speed == 2) {
     sf->use_square_partition_only = !frame_is_intra_only(cm);
-    sf->less_rectangular_check = 1;
-    sf->tx_size_search_method =
-        frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
+    sf->less_rectangular_check  = 1;
+    sf->tx_size_search_method = frame_is_intra_only(cm)
+      ? USE_FULL_RD : USE_LARGESTALL;
 
     if (MIN(cm->width, cm->height) >= 720)
       sf->disable_split_mask = cm->show_frame ?
-      DISABLE_ALL_SPLIT :
-                                                DISABLE_ALL_INTER_SPLIT;
+        DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
     else
       sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
 
-    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH
-        | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA
-        | FLAG_SKIP_INTRA_LOWVAR;
-
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+                                 FLAG_SKIP_INTRA_BESTINTER |
+                                 FLAG_SKIP_COMP_BESTINTRA |
+                                 FLAG_SKIP_INTRA_LOWVAR;
     sf->use_rd_breakout = 1;
     sf->adaptive_motion_search = 1;
     sf->adaptive_pred_filter_type = 2;
+    sf->reference_masking = 1;
     sf->auto_mv_step_size = 1;
 
     sf->disable_filter_search_var_thresh = 50;
@@ -649,7 +648,7 @@ static void set_rt_speed_feature(VP9_COMMON *cm,
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
     sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
   }
-  if (speed >= 3) {
+  if (speed == 3) {
     sf->use_square_partition_only = 1;
     sf->tx_size_search_method = USE_LARGESTALL;
 
@@ -658,13 +657,15 @@ static void set_rt_speed_feature(VP9_COMMON *cm,
     else
       sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
 
-    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH
-        | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA
-        | FLAG_SKIP_INTRA_LOWVAR;
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+      FLAG_SKIP_INTRA_BESTINTER |
+      FLAG_SKIP_COMP_BESTINTRA |
+      FLAG_SKIP_INTRA_LOWVAR;
 
     sf->use_rd_breakout = 1;
     sf->adaptive_motion_search = 1;
     sf->adaptive_pred_filter_type = 2;
+    sf->reference_masking = 1;
     sf->auto_mv_step_size = 1;
 
     sf->disable_filter_search_var_thresh = 100;
@@ -684,19 +685,22 @@ static void set_rt_speed_feature(VP9_COMMON *cm,
     sf->adaptive_rd_thresh = 4;
     sf->mode_skip_start = 6;
   }
-  if (speed >= 4) {
+  if (speed == 4) {
     sf->use_square_partition_only = 1;
     sf->tx_size_search_method = USE_LARGESTALL;
     sf->disable_split_mask = DISABLE_ALL_SPLIT;
 
-    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH
-        | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA
-        | FLAG_SKIP_COMP_REFMISMATCH | FLAG_SKIP_INTRA_LOWVAR
-        | FLAG_EARLY_TERMINATE;
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+      FLAG_SKIP_INTRA_BESTINTER |
+      FLAG_SKIP_COMP_BESTINTRA |
+      FLAG_SKIP_COMP_REFMISMATCH |
+      FLAG_SKIP_INTRA_LOWVAR |
+      FLAG_EARLY_TERMINATE;
 
     sf->use_rd_breakout = 1;
     sf->adaptive_motion_search = 1;
     sf->adaptive_pred_filter_type = 2;
+    sf->reference_masking = 1;
     sf->auto_mv_step_size = 1;
 
     sf->disable_filter_search_var_thresh = 200;
@@ -715,30 +719,24 @@ static void set_rt_speed_feature(VP9_COMMON *cm,
 
     sf->adaptive_rd_thresh = 4;
     sf->mode_skip_start = 6;
-
-    /* sf->intra_y_mode_mask = INTRA_DC_ONLY;
-     sf->intra_uv_mode_mask = INTRA_DC_ONLY;
-     sf->search_method = BIGDIA;
-     sf->disable_split_var_thresh = 64;
-     sf->disable_filter_search_var_thresh = 64; */
   }
-  if (speed >= 5) {
-    int i;
+  if (speed == 5) {
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
     sf->use_one_partition_size_always = 1;
     sf->always_this_block_size = BLOCK_16X16;
-    sf->tx_size_search_method =
-        frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
-    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH
-        | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA
-        | FLAG_SKIP_COMP_REFMISMATCH | FLAG_SKIP_INTRA_LOWVAR
-        | FLAG_EARLY_TERMINATE;
+    sf->tx_size_search_method = frame_is_intra_only(cm) ?
+      USE_FULL_RD : USE_LARGESTALL;
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+                                 FLAG_SKIP_INTRA_BESTINTER |
+                                 FLAG_SKIP_COMP_BESTINTRA |
+                                 FLAG_SKIP_COMP_REFMISMATCH |
+                                 FLAG_SKIP_INTRA_LOWVAR |
+                                 FLAG_EARLY_TERMINATE;
     sf->use_rd_breakout = 1;
     sf->use_lp32x32fdct = 1;
     sf->optimize_coefficients = 0;
     sf->auto_mv_step_size = 1;
-    // sf->reduce_first_step_size = 1;
-    // sf->reference_masking = 1;
+    sf->reference_masking = 1;
 
     sf->disable_split_mask = DISABLE_ALL_SPLIT;
     sf->search_method = HEX;
@@ -754,6 +752,107 @@ static void set_rt_speed_feature(VP9_COMMON *cm,
     sf->mode_skip_start = 6;
   }
 }
+static void set_rt_speed_feature(VP9_COMMON *cm,
+                                 SPEED_FEATURES *sf,
+                                 int speed) {
+  sf->static_segmentation = 0;
+  sf->adaptive_rd_thresh = 1;
+  sf->recode_loop = (speed < 1);
+  if (speed == 1) {
+    sf->use_square_partition_only = !frame_is_intra_only(cm);
+    sf->less_rectangular_check = 1;
+    sf->tx_size_search_method =
+        frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
+
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->disable_split_mask = cm->show_frame ?
+        DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+    else
+      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+
+    sf->use_rd_breakout = 1;
+    sf->adaptive_motion_search = 1;
+    sf->adaptive_pred_filter_type = 1;
+    sf->auto_mv_step_size = 1;
+    sf->adaptive_rd_thresh = 2;
+    sf->recode_loop = 2;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+  }
+  if (speed >= 2) {
+    sf->use_square_partition_only = !frame_is_intra_only(cm);
+    sf->less_rectangular_check = 1;
+    sf->tx_size_search_method =
+        frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
+
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->disable_split_mask = cm->show_frame ?
+        DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+    else
+      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH
+        | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA
+        | FLAG_SKIP_INTRA_LOWVAR;
+
+    sf->use_rd_breakout = 1;
+    sf->adaptive_motion_search = 1;
+    sf->adaptive_pred_filter_type = 2;
+    sf->auto_mv_step_size = 1;
+    sf->reference_masking = 1;
+
+    sf->disable_filter_search_var_thresh = 50;
+    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+
+    sf->auto_min_max_partition_size = 1;
+    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
+    sf->adjust_partitioning_from_last_frame = 1;
+    sf->last_partitioning_redo_frequency = 3;
+
+    sf->adaptive_rd_thresh = 2;
+    sf->recode_loop = 2;
+    sf->use_lp32x32fdct = 1;
+    sf->mode_skip_start = 11;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+  }
+  if (speed >= 3) {
+    sf->use_square_partition_only = 1;
+    sf->tx_size_search_method = USE_LARGESTALL;
+
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->disable_split_mask = DISABLE_ALL_SPLIT;
+    else
+      sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH
+        | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA
+        | FLAG_SKIP_INTRA_LOWVAR;
+
+    sf->disable_filter_search_var_thresh = 100;
+    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
+    sf->use_uv_intra_rd_estimate = 1;
+    sf->skip_encode_sb = 1;
+    sf->subpel_iters_per_step = 1;
+    sf->use_fast_coef_updates = 2;
+    sf->adaptive_rd_thresh = 4;
+    sf->mode_skip_start = 6;
+  }
+  if (speed >= 4) {
+    sf->optimize_coefficients = 0;
+  }
+  if (speed >= 5) {
+    int i;
+    sf->disable_split_mask = DISABLE_ALL_SPLIT;
+    for (i = 0; i < TX_SIZES; i++) {
+      sf->intra_y_mode_mask[i] = INTRA_DC_H_V;
+      sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
+    }
+  }
+}
 
 void vp9_set_speed_features(VP9_COMP *cpi) {
   SPEED_FEATURES *sf = &cpi->sf;
@@ -772,7 +871,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   // best quality defaults
   sf->RD = 1;
   sf->search_method = NSTEP;
-  sf->auto_filter = 1;
   sf->recode_loop = 1;
   sf->subpel_search_method = SUBPEL_TREE;
   sf->subpel_iters_per_step = 2;
@@ -812,199 +910,13 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->using_small_partition_info = 0;
   sf->mode_skip_start = MAX_MODES;  // Mode index at which mode skip mask set
 
-#if CONFIG_MULTIPLE_ARF
-  // Switch segmentation off.
-  sf->static_segmentation = 0;
-#else
-  sf->static_segmentation = 0;
-#endif
-
   switch (mode) {
     case 0:  // This is the best quality mode.
       cpi->diamond_search_sad = vp9_full_range_search;
       break;
-
     case 1:
-
-#if CONFIG_MULTIPLE_ARF
-      // Switch segmentation off.
-      sf->static_segmentation = 0;
-#else
-      sf->static_segmentation = 0;
-#endif
-      sf->adaptive_rd_thresh = 1;
-      sf->recode_loop = (speed < 1);
-
-      if (speed == 1) {
-        sf->use_square_partition_only = !frame_is_intra_only(cm);
-        sf->less_rectangular_check  = 1;
-        sf->tx_size_search_method = frame_is_intra_only(cm)
-                                     ? USE_FULL_RD : USE_LARGESTALL;
-
-        if (MIN(cm->width, cm->height) >= 720)
-          sf->disable_split_mask = cm->show_frame ?
-              DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
-        else
-          sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
-
-        sf->use_rd_breakout = 1;
-        sf->adaptive_motion_search = 1;
-        sf->adaptive_pred_filter_type = 1;
-        sf->auto_mv_step_size = 1;
-        sf->adaptive_rd_thresh = 2;
-        sf->recode_loop = 2;
-        sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
-        sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
-        sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
-      }
-      if (speed == 2) {
-        sf->use_square_partition_only = !frame_is_intra_only(cm);
-        sf->less_rectangular_check  = 1;
-        sf->tx_size_search_method = frame_is_intra_only(cm)
-                                     ? USE_FULL_RD : USE_LARGESTALL;
-
-        if (MIN(cm->width, cm->height) >= 720)
-          sf->disable_split_mask = cm->show_frame ?
-              DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
-        else
-          sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
-
-
-        sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
-                                     FLAG_SKIP_INTRA_BESTINTER |
-                                     FLAG_SKIP_COMP_BESTINTRA |
-                                     FLAG_SKIP_INTRA_LOWVAR;
-
-        sf->use_rd_breakout = 1;
-        sf->adaptive_motion_search = 1;
-        sf->adaptive_pred_filter_type = 2;
-        sf->auto_mv_step_size = 1;
-
-        sf->disable_filter_search_var_thresh = 50;
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
-        sf->auto_min_max_partition_size = 1;
-        sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
-        sf->adjust_partitioning_from_last_frame = 1;
-        sf->last_partitioning_redo_frequency = 3;
-
-        sf->adaptive_rd_thresh = 2;
-        sf->recode_loop = 2;
-        sf->use_lp32x32fdct = 1;
-        sf->mode_skip_start = 11;
-        sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
-        sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
-        sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
-        sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
-      }
-      if (speed == 3) {
-        sf->use_square_partition_only = 1;
-        sf->tx_size_search_method = USE_LARGESTALL;
-
-        if (MIN(cm->width, cm->height) >= 720)
-          sf->disable_split_mask = DISABLE_ALL_SPLIT;
-        else
-          sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
-
-        sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
-                                     FLAG_SKIP_INTRA_BESTINTER |
-                                     FLAG_SKIP_COMP_BESTINTRA |
-                                     FLAG_SKIP_INTRA_LOWVAR;
-
-        sf->use_rd_breakout = 1;
-        sf->adaptive_motion_search = 1;
-        sf->adaptive_pred_filter_type = 2;
-        sf->auto_mv_step_size = 1;
-
-        sf->disable_filter_search_var_thresh = 100;
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
-        sf->auto_min_max_partition_size = 1;
-        sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
-        sf->adjust_partitioning_from_last_frame = 1;
-        sf->last_partitioning_redo_frequency = 3;
-
-        sf->use_uv_intra_rd_estimate = 1;
-        sf->skip_encode_sb = 1;
-        sf->use_lp32x32fdct = 1;
-        sf->subpel_iters_per_step = 1;
-        sf->use_fast_coef_updates = 2;
-
-        sf->adaptive_rd_thresh = 4;
-        sf->mode_skip_start = 6;
-      }
-      if (speed == 4) {
-        sf->use_square_partition_only = 1;
-        sf->tx_size_search_method = USE_LARGESTALL;
-        sf->disable_split_mask = DISABLE_ALL_SPLIT;
-
-        sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
-                                     FLAG_SKIP_INTRA_BESTINTER |
-                                     FLAG_SKIP_COMP_BESTINTRA |
-                                     FLAG_SKIP_COMP_REFMISMATCH |
-                                     FLAG_SKIP_INTRA_LOWVAR |
-                                     FLAG_EARLY_TERMINATE;
-
-        sf->use_rd_breakout = 1;
-        sf->adaptive_motion_search = 1;
-        sf->adaptive_pred_filter_type = 2;
-        sf->auto_mv_step_size = 1;
-
-        sf->disable_filter_search_var_thresh = 200;
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
-        sf->auto_min_max_partition_size = 1;
-        sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
-        sf->adjust_partitioning_from_last_frame = 1;
-        sf->last_partitioning_redo_frequency = 3;
-
-        sf->use_uv_intra_rd_estimate = 1;
-        sf->skip_encode_sb = 1;
-        sf->use_lp32x32fdct = 1;
-        sf->subpel_iters_per_step = 1;
-        sf->use_fast_coef_updates = 2;
-
-        sf->adaptive_rd_thresh = 4;
-        sf->mode_skip_start = 6;
-
-        /* sf->intra_y_mode_mask = INTRA_DC_ONLY;
-        sf->intra_uv_mode_mask = INTRA_DC_ONLY;
-        sf->search_method = BIGDIA;
-        sf->disable_split_var_thresh = 64;
-        sf->disable_filter_search_var_thresh = 64; */
-      }
-      if (speed == 5) {
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-        sf->use_one_partition_size_always = 1;
-        sf->always_this_block_size = BLOCK_16X16;
-        sf->tx_size_search_method = frame_is_intra_only(cm) ?
-                                     USE_FULL_RD : USE_LARGESTALL;
-        sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
-                                     FLAG_SKIP_INTRA_BESTINTER |
-                                     FLAG_SKIP_COMP_BESTINTRA |
-                                     FLAG_SKIP_COMP_REFMISMATCH |
-                                     FLAG_SKIP_INTRA_LOWVAR |
-                                     FLAG_EARLY_TERMINATE;
-        sf->use_rd_breakout = 1;
-        sf->use_lp32x32fdct = 1;
-        sf->optimize_coefficients = 0;
-        sf->auto_mv_step_size = 1;
-        // sf->reduce_first_step_size = 1;
-        // sf->reference_masking = 1;
-
-        sf->disable_split_mask = DISABLE_ALL_SPLIT;
-        sf->search_method = HEX;
-        sf->subpel_iters_per_step = 1;
-        sf->disable_split_var_thresh = 64;
-        sf->disable_filter_search_var_thresh = 500;
-        for (i = 0; i < TX_SIZES; i++) {
-          sf->intra_y_mode_mask[i] = INTRA_DC_ONLY;
-          sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
-        }
-        sf->use_fast_coef_updates = 2;
-        sf->adaptive_rd_thresh = 4;
-        sf->mode_skip_start = 6;
-      }
+      set_good_speed_feature(cm, sf, speed);
+      break;
       break;
     case 2:
       set_rt_speed_feature(cm, sf, speed);
@@ -1350,8 +1262,6 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
 
   cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
 
-  // cpi->use_golden_frame_only = 0;
-  // cpi->use_last_frame_only = 0;
   cpi->refresh_golden_frame = 0;
   cpi->refresh_last_frame = 1;
   cm->refresh_frame_context = 1;
@@ -1392,6 +1302,12 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
   else
     cpi->oxcf.maximum_buffer_size = rescale(cpi->oxcf.maximum_buffer_size,
                                             cpi->oxcf.target_bandwidth, 1000);
+  // Under a configuration change, where maximum_buffer_size may change,
+  // keep buffer level clipped to the maximum allowed buffer size.
+  if (cpi->rc.bits_off_target > cpi->oxcf.maximum_buffer_size) {
+    cpi->rc.bits_off_target = cpi->oxcf.maximum_buffer_size;
+    cpi->rc.buffer_level = cpi->rc.bits_off_target;
+  }
 
   // Set up frame rate and related parameters rate control values.
   vp9_new_framerate(cpi, cpi->oxcf.framerate);
@@ -1453,6 +1369,9 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
 #endif
 
   set_tile_limits(cpi);
+
+  cpi->ext_refresh_frame_flags_pending = 0;
+  cpi->ext_refresh_frame_context_pending = 0;
 }
 
 #define M_LOG2_E 0.693147180559945309417
@@ -2257,19 +2176,20 @@ int vp9_update_reference(VP9_PTR ptr, int ref_frame_flags) {
   if (ref_frame_flags > 7)
     return -1;
 
-  cpi->refresh_golden_frame = 0;
-  cpi->refresh_alt_ref_frame = 0;
-  cpi->refresh_last_frame   = 0;
+  cpi->ext_refresh_golden_frame = 0;
+  cpi->ext_refresh_alt_ref_frame = 0;
+  cpi->ext_refresh_last_frame   = 0;
 
   if (ref_frame_flags & VP9_LAST_FLAG)
-    cpi->refresh_last_frame = 1;
+    cpi->ext_refresh_last_frame = 1;
 
   if (ref_frame_flags & VP9_GOLD_FLAG)
-    cpi->refresh_golden_frame = 1;
+    cpi->ext_refresh_golden_frame = 1;
 
   if (ref_frame_flags & VP9_ALT_FLAG)
-    cpi->refresh_alt_ref_frame = 1;
+    cpi->ext_refresh_alt_ref_frame = 1;
 
+  cpi->ext_refresh_frame_flags_pending = 1;
   return 0;
 }
 
@@ -2325,7 +2245,8 @@ int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
 }
 
 int vp9_update_entropy(VP9_PTR comp, int update) {
-  ((VP9_COMP *)comp)->common.refresh_frame_context = update;
+  ((VP9_COMP *)comp)->ext_refresh_frame_context = update;
+  ((VP9_COMP *)comp)->ext_refresh_frame_context_pending = 1;
   return 0;
 }
 
@@ -2975,6 +2896,23 @@ static void get_ref_frame_flags(VP9_COMP *cpi) {
     cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
 }
 
+static void set_ext_overrides(VP9_COMP *cpi) {
+  // Overrides the defaults with the values supplied externally through
+  // vp9_update_reference() and vp9_update_entropy() calls.
+  // Note: The overrides are valid only for the next frame passed
+  // to the encode_frame_to_data_rate() function.
+  if (cpi->ext_refresh_frame_context_pending) {
+    cpi->common.refresh_frame_context = cpi->ext_refresh_frame_context;
+    cpi->ext_refresh_frame_context_pending = 0;
+  }
+  if (cpi->ext_refresh_frame_flags_pending) {
+    cpi->refresh_last_frame = cpi->ext_refresh_last_frame;
+    cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame;
+    cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
+    cpi->ext_refresh_frame_flags_pending = 0;
+  }
+}
+
 static void encode_frame_to_data_rate(VP9_COMP *cpi,
                                       size_t *size,
                                       uint8_t *dest,
@@ -2991,6 +2929,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height);
   struct segmentation *const seg = &cm->seg;
 
+  set_ext_overrides(cpi);
+
   /* Scale the source buffer, if required. */
   if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width ||
       cm->mi_rows * 8 != cpi->un_scaled_source->y_height) {
@@ -3166,6 +3106,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     cpi->ambient_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
   }
 
+  // If the encoder forced a KEY_FRAME decision, always refresh the last frame.
   if (cm->frame_type == KEY_FRAME)
     cpi->refresh_last_frame = 1;
 
@@ -3318,7 +3259,11 @@ static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
 
 static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
                         unsigned int *frame_flags) {
-  vp9_get_one_pass_params(cpi);
+  if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
+    vp9_get_one_pass_cbr_params(cpi);
+  } else {
+    vp9_get_one_pass_params(cpi);
+  }
   encode_frame_to_data_rate(cpi, size, dest, frame_flags);
 }
 
@@ -3396,6 +3341,44 @@ int is_next_frame_arf(VP9_COMP *cpi) {
 }
 #endif
 
+void adjust_frame_rate(VP9_COMP *cpi) {
+  int64_t this_duration;
+  int step = 0;
+
+  if (cpi->source->ts_start == cpi->first_time_stamp_ever) {
+    this_duration = cpi->source->ts_end - cpi->source->ts_start;
+    step = 1;
+  } else {
+    int64_t last_duration = cpi->last_end_time_stamp_seen
+        - cpi->last_time_stamp_seen;
+
+    this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
+
+    // Do a step update if the duration changes by 10% or more.
+    if (last_duration)
+      step = (int)((this_duration - last_duration) * 10 / last_duration);
+  }
+
+  if (this_duration) {
+    if (step) {
+      vp9_new_framerate(cpi, 10000000.0 / this_duration);
+    } else {
+      // Average this frame's rate into the last second's average
+      // frame rate. If we haven't seen 1 second yet, then average
+      // over the whole interval seen.
+      const double interval = MIN((double)(cpi->source->ts_end
+                                   - cpi->first_time_stamp_ever), 10000000.0);
+      double avg_duration = 10000000.0 / cpi->oxcf.framerate;
+      avg_duration *= (interval - avg_duration + this_duration);
+      avg_duration /= interval;
+
+      vp9_new_framerate(cpi, 10000000.0 / avg_duration);
+    }
+  }
+  cpi->last_time_stamp_seen = cpi->source->ts_start;
+  cpi->last_end_time_stamp_seen = cpi->source->ts_end;
+}
+
 int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest,
                             int64_t *time_stamp, int64_t *time_end, int flush) {
@@ -3415,6 +3398,13 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
 
   set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
 
+  // Normal defaults
+  cm->reset_frame_context = 0;
+  cm->refresh_frame_context = 1;
+  cpi->refresh_last_frame = 1;
+  cpi->refresh_golden_frame = 0;
+  cpi->refresh_alt_ref_frame = 0;
+
   // Should we code an alternate reference frame.
   if (cpi->oxcf.play_alternate && cpi->rc.source_alt_ref_pending) {
     int frames_to_arf;
@@ -3425,7 +3415,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
 
     if (cpi->multi_arf_enabled && (cpi->pass == 2))
       frames_to_arf = (-cpi->frame_coding_order[cpi->sequence_number])
-        - cpi->next_frame_in_order;
+          - cpi->next_frame_in_order;
     else
 #endif
       frames_to_arf = cpi->rc.frames_till_gf_update_due;
@@ -3509,18 +3499,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
     *time_end = cpi->source->ts_end;
     *frame_flags = cpi->source->flags;
 
-    // fprintf(fp_out, "   Frame:%d", cm->current_video_frame);
-#if CONFIG_MULTIPLE_ARF
-    if (cpi->multi_arf_enabled) {
-      // fprintf(fp_out, "   seq_no:%d  this_frame_weight:%d",
-      //         cpi->sequence_number, cpi->this_frame_weight);
-    } else {
-      // fprintf(fp_out, "\n");
-    }
-#else
-    // fprintf(fp_out, "\n");
-#endif
-
 #if CONFIG_MULTIPLE_ARF
     if ((cm->frame_type != KEY_FRAME) && (cpi->pass == 2))
       cpi->rc.source_alt_ref_pending = is_next_frame_arf(cpi);
@@ -3542,43 +3520,8 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
   }
 
   // adjust frame rates based on timestamps given
-  if (!cpi->refresh_alt_ref_frame) {
-    int64_t this_duration;
-    int step = 0;
-
-    if (cpi->source->ts_start == cpi->first_time_stamp_ever) {
-      this_duration = cpi->source->ts_end - cpi->source->ts_start;
-      step = 1;
-    } else {
-      int64_t last_duration = cpi->last_end_time_stamp_seen
-                                - cpi->last_time_stamp_seen;
-
-      this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
-
-      // do a step update if the duration changes by 10%
-      if (last_duration)
-        step = (int)((this_duration - last_duration) * 10 / last_duration);
-    }
-
-    if (this_duration) {
-      if (step) {
-        vp9_new_framerate(cpi, 10000000.0 / this_duration);
-      } else {
-        // Average this frame's rate into the last second's average
-        // frame rate. If we haven't seen 1 second yet, then average
-        // over the whole interval seen.
-        const double interval = MIN((double)(cpi->source->ts_end
-                                     - cpi->first_time_stamp_ever), 10000000.0);
-        double avg_duration = 10000000.0 / cpi->oxcf.framerate;
-        avg_duration *= (interval - avg_duration + this_duration);
-        avg_duration /= interval;
-
-        vp9_new_framerate(cpi, 10000000.0 / avg_duration);
-      }
-    }
-
-    cpi->last_time_stamp_seen = cpi->source->ts_start;
-    cpi->last_end_time_stamp_seen = cpi->source->ts_end;
+  if (cm->show_frame) {
+    adjust_frame_rate(cpi);
   }
 
   // start with a 0 size frame
@@ -3604,21 +3547,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
   }
 #endif
 
-#if 0  // CONFIG_MULTIPLE_ARF
-  if (cpi->multi_arf_enabled) {
-    fprintf(fp_out, "      idx(%d, %d, %d, %d) active(%d, %d, %d)",
-        cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx, cm->new_fb_idx,
-        cm->ref_frame_map[cpi->lst_fb_idx],
-        cm->ref_frame_map[cpi->gld_fb_idx],
-        cm->ref_frame_map[cpi->alt_fb_idx]);
-    if (cpi->refresh_alt_ref_frame)
-      fprintf(fp_out, "  type:ARF");
-    if (cpi->rc.is_src_frame_alt_ref)
-      fprintf(fp_out, "  type:OVERLAY[%d]", cpi->alt_fb_idx);
-    fprintf(fp_out, "\n");
-  }
-#endif
-
   cm->frame_flags = *frame_flags;
 
   // Reset the frame pointers to the current frame size
@@ -3669,15 +3597,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
   }
 
   if (*size > 0) {
-    // if its a dropped frame honor the requests on subsequent frames
     cpi->droppable = !frame_is_reference(cpi);
-
-    // return to normal state
-    cm->reset_frame_context = 0;
-    cm->refresh_frame_context = 1;
-    cpi->refresh_alt_ref_frame = 0;
-    cpi->refresh_golden_frame = 0;
-    cpi->refresh_last_frame = 1;
   }
 
   vpx_usec_timer_mark(&cmptimer);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 946424534..a5be0f424 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -232,57 +232,185 @@ typedef enum {
 } LAST_FRAME_PARTITION_METHOD;
 
 typedef struct {
+  // This flag refers to whether or not to perform rd optimization.
   int RD;
+
+  // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
   SEARCH_METHODS search_method;
-  int auto_filter;
+
+  // Recode_loop can be:
+  // 0 means we only encode a frame once
+  // 1 means we can re-encode based on bitrate constraints on any frame
+  // 2 means we can only recode gold, alt, and key frames.
   int recode_loop;
+
+  // Subpel_search_method can only be subpel_tree which does a subpixel
+  // logarithmic search that keeps stepping at 1/2 pixel units until
+  // you stop getting a gain, and then goes on to 1/4 and repeats
+  // the same process. Along the way it skips many diagonals.
   SUBPEL_SEARCH_METHODS subpel_search_method;
+
+  // Maximum number of steps in logarithmic subpel search before giving up.
   int subpel_iters_per_step;
+
+  // Thresh_mult is used to set a threshold for the rd score. A higher value
+  // means that we will accept the best mode so far more often. This number
+  // is used in combination with the current block size, and thresh_freq_fact
+  // to pick a threshold.
   int thresh_mult[MAX_MODES];
   int thresh_mult_sub8x8[MAX_REFS];
+
+  // This parameter controls the number of steps we'll do in a diamond
+  // search.
   int max_step_search_steps;
+
+  // This parameter controls which step in the n-step process we start at.
+  // It's changed adaptively based on circumstances.
   int reduce_first_step_size;
+
+  // If this is set to 1, we limit the motion search range to 2 times the
+  // largest motion vector found in the last frame.
   int auto_mv_step_size;
+
+  // Trellis (dynamic programming) optimization of quantized values (+1, 0).
   int optimize_coefficients;
+
+  // Always set to 0. If on, it enables 0 cost background transmission
+  // (except for the initial transmission of the segmentation). The feature is
+  // disabled because the addition of very large block sizes makes the
+  // backgrounds very cheap to encode, and the segmentation we have
+  // adds overhead.
   int static_segmentation;
+
+  // If 1 we iterate finding a best reference for 2 ref frames together - via
+  // a log search that iterates 4 times (check around mv for last for best
+  // error of combined predictor then check around mv for alt). If 0 we
+  // just use the best motion vector found for each frame by itself.
   int comp_inter_joint_search_thresh;
+
+  // This variable is used to cap the maximum number of times we skip testing a
+  // mode to be evaluated. A high value means we will be faster.
   int adaptive_rd_thresh;
+
+  // Enables skipping the reconstruction step (idct, recon) in the
+  // intermediate steps assuming the last frame didn't have too many intra
+  // blocks and the q is less than a threshold.
   int skip_encode_sb;
   int skip_encode_frame;
+
+  // This variable allows us to reuse the last frame's partition choices
+  // (64x64 v 32x32 etc) for this frame. It can be set to only use the last
+  // frame as a starting point in low motion scenes or always use it. If set
+  // we use last_partitioning_redo_frequency to determine how often to redo
+  // the partitioning from scratch. Adjust_partitioning_from_last_frame
+  // enables us to adjust up or down one partitioning from the last frame's
+  // partitioning.
   LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning;
+
+  // Determine which method we use to determine transform size. We can choose
+  // between options like full rd, largest for prediction size, largest
+  // for intra and model coefs for the rest.
   TX_SIZE_SEARCH_METHOD tx_size_search_method;
+
+  // Low precision 32x32 fdct keeps everything in 16 bits and thus is less
+  // precise but significantly faster than the non lp version.
   int use_lp32x32fdct;
+
+  // TODO(JBB): remove this as it's no longer used.
+
+  // If set partition size will always be always_this_block_size.
   int use_one_partition_size_always;
+
+  // Skip rectangular partition test when partition type none gives better
+  // rd than partition type split.
   int less_rectangular_check;
+
+  // Disable testing non square partitions. (eg 16x32)
   int use_square_partition_only;
+
+  // After looking at the first set of modes (set by index here), skip
+  // checking modes for reference frames that don't match the reference frame
+  // of the best so far.
   int mode_skip_start;
+
+  // TODO(JBB): Remove this.
   int reference_masking;
+
+  // Used in conjunction with use_one_partition_size_always.
   BLOCK_SIZE always_this_block_size;
+
+  // Sets min and max partition sizes for this 64x64 region based on the
+  // same superblock in last encoded frame, and the left and above neighbor
+  // in this block.
   int auto_min_max_partition_size;
+
+  // Min and max partition size we enable (block_size) as per auto
+  // min max, but also used by adjust partitioning, and pick_partitioning.
   BLOCK_SIZE min_partition_size;
   BLOCK_SIZE max_partition_size;
+
+  // Whether or not we allow partitions one smaller or one greater than the last
+  // frame's partitioning. Only used if use_lastframe_partitioning is set.
   int adjust_partitioning_from_last_frame;
+
+  // How frequently we redo the partitioning from scratch. Only used if
+  // use_lastframe_partitioning is set.
   int last_partitioning_redo_frequency;
+
+  // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
+  // it always, to allow it for only Last frame and Intra, disable it for all
+  // inter modes or to enable it always.
   int disable_split_mask;
+
+  // TODO(jbb): Remove this and everything that uses it. It's only valid if
+  // we were doing small to large partition checks. We currently do the
+  // reverse.
   int using_small_partition_info;
+
   // TODO(jingning): combine the related motion search speed features
+  // This allows us to use motion search at other sizes as a starting
+  // point for this motion search and limits the search range around it.
   int adaptive_motion_search;
+
+  // Allows sub 8x8 modes to use the prediction filter that was determined
+  // best for 8x8 mode. If set to 0 we always re check all the filters for
+  // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
+  // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
   int adaptive_pred_filter_type;
 
   // Implements various heuristics to skip searching modes
   // The heuristics selected are based on  flags
   // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
   unsigned int mode_search_skip_flags;
+
   // A source variance threshold below which the split mode is disabled
   unsigned int disable_split_var_thresh;
+
   // A source variance threshold below which filter search is disabled
   // Choose a very large value (UINT_MAX) to use 8-tap always
   unsigned int disable_filter_search_var_thresh;
+
+  // These bit masks allow you to enable or disable intra modes for each
+  // transform size separately.
   int intra_y_mode_mask[TX_SIZES];
   int intra_uv_mode_mask[TX_SIZES];
+
+  // This variable enables an early break out of mode testing if the model for
+  // rd built from the prediction signal indicates a value that's much
+  // higher than the best rd we've seen so far.
   int use_rd_breakout;
+
+  // This enables us to use an estimate for intra rd based on dc mode rather
+  // than choosing an actual uv mode in the stage of encoding before the actual
+  // final encode.
   int use_uv_intra_rd_estimate;
+
+  // This picks a loop filter strength by trying a small portion of the image
+  // with different values.
   int use_fast_lpf_pick;
+
+  // This feature limits the number of coefficient updates we actually do
+  // by only looking at counts from 1/2 the bands.
   int use_fast_coef_updates;  // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced
 } SPEED_FEATURES;
 
@@ -400,6 +528,15 @@ typedef struct VP9_COMP {
   int refresh_last_frame;
   int refresh_golden_frame;
   int refresh_alt_ref_frame;
+
+  int ext_refresh_frame_flags_pending;
+  int ext_refresh_last_frame;
+  int ext_refresh_golden_frame;
+  int ext_refresh_alt_ref_frame;
+
+  int ext_refresh_frame_context_pending;
+  int ext_refresh_frame_context;
+
   YV12_BUFFER_CONFIG last_frame_uf;
 
   TOKENEXTRA *tok;
@@ -596,7 +733,7 @@ typedef struct VP9_COMP {
   int *mb_norm_activity_map;
   int output_partition;
 
-  /* force next frame to intra when kf_auto says so */
+  // Force next frame to intra when kf_auto says so.
   int force_next_frame_intra;
 
   int droppable;
@@ -638,7 +775,7 @@ typedef struct VP9_COMP {
   int64_t mode_test_hits[BLOCK_SIZES];
 #endif
 
-  /* Y,U,V,(A) */
+  // Y,U,V,(A)
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
   ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
 
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 31b0116a4..939a7f998 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -184,8 +184,6 @@ void vp9_setup_key_frame(VP9_COMP *cpi) {
 
   vp9_setup_past_independence(cm);
 
-  // interval before next GF
-  cpi->rc.frames_till_gf_update_due = cpi->rc.baseline_gf_interval;
   /* All buffers are implicitly updated on key frames. */
   cpi->refresh_golden_frame = 1;
   cpi->refresh_alt_ref_frame = 1;
@@ -432,10 +430,32 @@ static void calc_pframe_target_size(VP9_COMP *const cpi) {
   }
 }
 
+static double get_rate_correction_factor(const VP9_COMP *cpi) {
+  if (cpi->common.frame_type == KEY_FRAME) {
+    return cpi->rc.key_frame_rate_correction_factor;
+  } else {
+    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
+      return cpi->rc.gf_rate_correction_factor;
+    else
+      return cpi->rc.rate_correction_factor;
+  }
+}
+
+static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
+  if (cpi->common.frame_type == KEY_FRAME) {
+    cpi->rc.key_frame_rate_correction_factor = factor;
+  } else {
+    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
+      cpi->rc.gf_rate_correction_factor = factor;
+    else
+      cpi->rc.rate_correction_factor = factor;
+  }
+}
+
 void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
   const int q = cpi->common.base_qindex;
   int correction_factor = 100;
-  double rate_correction_factor;
+  double rate_correction_factor = get_rate_correction_factor(cpi);
   double adjustment_limit;
 
   int projected_size_based_on_q = 0;
@@ -443,15 +463,6 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
   // Clear down mmx registers to allow floating point in what follows
   vp9_clear_system_state();  // __asm emms;
 
-  if (cpi->common.frame_type == KEY_FRAME) {
-    rate_correction_factor = cpi->rc.key_frame_rate_correction_factor;
-  } else {
-    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
-      rate_correction_factor = cpi->rc.gf_rate_correction_factor;
-    else
-      rate_correction_factor = cpi->rc.rate_correction_factor;
-  }
-
   // Work out how big we would have expected the frame to be at this Q given
   // the current correction factor.
   // Stay in double to avoid int overflow when values are large
@@ -501,36 +512,16 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
       rate_correction_factor = MIN_BPB_FACTOR;
   }
 
-  if (cpi->common.frame_type == KEY_FRAME) {
-    cpi->rc.key_frame_rate_correction_factor = rate_correction_factor;
-  } else {
-    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
-      cpi->rc.gf_rate_correction_factor = rate_correction_factor;
-    else
-      cpi->rc.rate_correction_factor = rate_correction_factor;
-  }
+  set_rate_correction_factor(cpi, rate_correction_factor);
 }
 
 
 int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
                       int active_best_quality, int active_worst_quality) {
   int q = active_worst_quality;
-
-  int i;
   int last_error = INT_MAX;
-  int target_bits_per_mb;
-  int bits_per_mb_at_this_q;
-  double correction_factor;
-
-  // Select the appropriate correction factor based upon type of frame.
-  if (cpi->common.frame_type == KEY_FRAME) {
-    correction_factor = cpi->rc.key_frame_rate_correction_factor;
-  } else {
-    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
-      correction_factor = cpi->rc.gf_rate_correction_factor;
-    else
-      correction_factor = cpi->rc.rate_correction_factor;
-  }
+  int i, target_bits_per_mb, bits_per_mb_at_this_q;
+  const double correction_factor = get_rate_correction_factor(cpi);
 
   // Calculate required scaling factor based on target frame size and size of
   // frame produced using previous Q.
@@ -855,7 +846,6 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
   // Update the Golden frame usage counts.
   if (cpi->refresh_golden_frame) {
     // this frame refreshes means next frames don't unless specified by user
-    cpi->refresh_golden_frame = 0;
     cpi->rc.frames_since_golden = 0;
 
     if (!cpi->rc.source_alt_ref_pending)
@@ -876,36 +866,35 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
 
 void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
   // Update rate control heuristics
-  cpi->rc.projected_frame_size = (bytes_used << 3);
+  rc->projected_frame_size = (bytes_used << 3);
 
   // Post encode loop adjustment of Q prediction.
-  vp9_rc_update_rate_correction_factors(
-      cpi, (cpi->sf.recode_loop ||
+  vp9_rc_update_rate_correction_factors(cpi, (cpi->sf.recode_loop ||
             cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0);
 
   // Keep a record of last Q and ambient average Q.
   if (cm->frame_type == KEY_FRAME) {
-    cpi->rc.last_q[KEY_FRAME] = cm->base_qindex;
-    cpi->rc.avg_frame_qindex[KEY_FRAME] =
-        (2 + 3 * cpi->rc.avg_frame_qindex[KEY_FRAME] + cm->base_qindex) >> 2;
-  } else if (!cpi->rc.is_src_frame_alt_ref &&
+    rc->last_q[KEY_FRAME] = cm->base_qindex;
+    rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO(
+        3 * rc->avg_frame_qindex[KEY_FRAME] + cm->base_qindex, 2);
+  } else if (!rc->is_src_frame_alt_ref &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
-    cpi->rc.last_q[2] = cm->base_qindex;
-    cpi->rc.avg_frame_qindex[2] =
-        (2 + 3 * cpi->rc.avg_frame_qindex[2] + cm->base_qindex) >> 2;
+    rc->last_q[2] = cm->base_qindex;
+    rc->avg_frame_qindex[2] = ROUND_POWER_OF_TWO(
+        3 * rc->avg_frame_qindex[2] + cm->base_qindex, 2);
   } else {
-    cpi->rc.last_q[INTER_FRAME] = cm->base_qindex;
-    cpi->rc.avg_frame_qindex[INTER_FRAME] =
-        (2 + 3 * cpi->rc.avg_frame_qindex[INTER_FRAME] +
-         cm->base_qindex) >> 2;
-    cpi->rc.ni_frames++;
-    cpi->rc.tot_q += vp9_convert_qindex_to_q(cm->base_qindex);
-    cpi->rc.avg_q = cpi->rc.tot_q / (double)cpi->rc.ni_frames;
+    rc->last_q[INTER_FRAME] = cm->base_qindex;
+    rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO(
+        3 * rc->avg_frame_qindex[INTER_FRAME] + cm->base_qindex, 2);
+    rc->ni_frames++;
+    rc->tot_q += vp9_convert_qindex_to_q(cm->base_qindex);
+    rc->avg_q = rc->tot_q / (double)rc->ni_frames;
 
     // Calculate the average Q for normal inter frames (not key or GFU frames).
-    cpi->rc.ni_tot_qi += cm->base_qindex;
-    cpi->rc.ni_av_qi = cpi->rc.ni_tot_qi / cpi->rc.ni_frames;
+    rc->ni_tot_qi += cm->base_qindex;
+    rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
   }
 
   // Keep record of last boosted (KF/KF/ARF) Q value.
@@ -913,38 +902,34 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   // If all mbs in this group are skipped only update if the Q value is
   // better than that already stored.
   // This is used to help set quality in forced key frames to reduce popping
-  if ((cm->base_qindex < cpi->rc.last_boosted_qindex) ||
+  if ((cm->base_qindex < rc->last_boosted_qindex) ||
       ((cpi->static_mb_pct < 100) &&
        ((cm->frame_type == KEY_FRAME) || cpi->refresh_alt_ref_frame ||
-        (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)))) {
-    cpi->rc.last_boosted_qindex = cm->base_qindex;
+        (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
+    rc->last_boosted_qindex = cm->base_qindex;
   }
 
-  vp9_update_buffer_level(cpi, cpi->rc.projected_frame_size);
+  vp9_update_buffer_level(cpi, rc->projected_frame_size);
 
   // Rolling monitors of whether we are over or underspending used to help
   // regulate min and Max Q in two pass.
   if (cm->frame_type != KEY_FRAME) {
-    cpi->rc.rolling_target_bits =
-        ((cpi->rc.rolling_target_bits * 3) +
-         cpi->rc.this_frame_target + 2) / 4;
-    cpi->rc.rolling_actual_bits =
-        ((cpi->rc.rolling_actual_bits * 3) +
-         cpi->rc.projected_frame_size + 2) / 4;
-    cpi->rc.long_rolling_target_bits =
-        ((cpi->rc.long_rolling_target_bits * 31) +
-         cpi->rc.this_frame_target + 16) / 32;
-    cpi->rc.long_rolling_actual_bits =
-        ((cpi->rc.long_rolling_actual_bits * 31) +
-         cpi->rc.projected_frame_size + 16) / 32;
+    rc->rolling_target_bits = ROUND_POWER_OF_TWO(
+        rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+    rc->rolling_actual_bits = ROUND_POWER_OF_TWO(
+        rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+    rc->long_rolling_target_bits = ROUND_POWER_OF_TWO(
+        rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);
+    rc->long_rolling_actual_bits = ROUND_POWER_OF_TWO(
+        rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5);
   }
 
   // Actual bits spent
-  cpi->rc.total_actual_bits += cpi->rc.projected_frame_size;
+  rc->total_actual_bits += rc->projected_frame_size;
 
   // Debug stats
-  cpi->rc.total_target_vs_actual += (cpi->rc.this_frame_target -
-                                     cpi->rc.projected_frame_size);
+  rc->total_target_vs_actual += (rc->this_frame_target -
+                                 rc->projected_frame_size);
 
 #ifndef DISABLE_RC_LONG_TERM_MEM
   // Update bits left to the kf and gf groups to account for overshoot or
@@ -962,8 +947,8 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   }
 #endif
 
-  if (cpi->oxcf.play_alternate && cpi->refresh_alt_ref_frame
-      && (cm->frame_type != KEY_FRAME))
+  if (cpi->oxcf.play_alternate && cpi->refresh_alt_ref_frame &&
+      (cm->frame_type != KEY_FRAME))
     // Update the alternate reference frame stats as appropriate.
     update_alt_ref_frame_stats(cpi);
   else
@@ -971,10 +956,10 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
     update_golden_frame_stats(cpi);
 
   if (cm->frame_type == KEY_FRAME)
-    cpi->rc.frames_since_key = 0;
+    rc->frames_since_key = 0;
   if (cm->show_frame) {
-    cpi->rc.frames_since_key++;
-    cpi->rc.frames_to_key--;
+    rc->frames_since_key++;
+    rc->frames_to_key--;
   }
 }
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 97dc1e0ff..81d47de92 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2124,10 +2124,10 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
                      cpi->common.show_frame &&
                      block_size < cpi->sf.max_partition_size);
 
-  int_mv pred_mv[3] = {
-      mbmi->ref_mvs[ref_frame][0], mbmi->ref_mvs[ref_frame][1],
-      x->pred_mv[ref_frame]
-  };
+  int_mv pred_mv[3];
+  pred_mv[0] = mbmi->ref_mvs[ref_frame][0];
+  pred_mv[1] = mbmi->ref_mvs[ref_frame][1];
+  pred_mv[2] = x->pred_mv[ref_frame];
 
   // Get the sad for each candidate reference mv
   for (i = 0; i < num_mv_refs; i++) {
@@ -2276,14 +2276,14 @@ static void setup_pred_block(const MACROBLOCKD *xd,
   }
 }
 
-static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
-                               const TileInfo *const tile,
-                               int idx, MV_REFERENCE_FRAME frame_type,
-                               BLOCK_SIZE block_size,
-                               int mi_row, int mi_col,
-                               int_mv frame_nearest_mv[MAX_REF_FRAMES],
-                               int_mv frame_near_mv[MAX_REF_FRAMES],
-                               struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
+void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+                            const TileInfo *const tile,
+                            int idx, MV_REFERENCE_FRAME frame_type,
+                            BLOCK_SIZE block_size,
+                            int mi_row, int mi_col,
+                            int_mv frame_nearest_mv[MAX_REF_FRAMES],
+                            int_mv frame_near_mv[MAX_REF_FRAMES],
+                            struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
   VP9_COMMON *cm = &cpi->common;
   YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -2355,9 +2355,10 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
 
   YV12_BUFFER_CONFIG *scaled_ref_frame = get_scaled_ref_frame(cpi, ref);
 
-  int_mv pred_mv[3] = {
-      mbmi->ref_mvs[ref][0], mbmi->ref_mvs[ref][1], x->pred_mv[ref]
-  };
+  int_mv pred_mv[3];
+  pred_mv[0] = mbmi->ref_mvs[ref][0];
+  pred_mv[1] = mbmi->ref_mvs[ref][1];
+  pred_mv[2] = x->pred_mv[ref];
 
   if (scaled_ref_frame) {
     int i;
@@ -3174,17 +3175,29 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
   *returnrate = INT_MAX;
 
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
-                         ref_frame, block_size, mi_row, mi_col,
-                         frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+      vp9_setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
+                             ref_frame, block_size, mi_row, mi_col,
+                             frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
+  cpi->ref_frame_mask = 0;
+  for (ref_frame = LAST_FRAME;
+       ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
+    int i;
+    for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+      if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
+        cpi->ref_frame_mask |= (1 << ref_frame);
+        break;
+      }
+    }
+  }
+
   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
@@ -3234,8 +3247,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     // Skip if the current reference frame has been masked off
-    if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
-        (cpi->ref_frame_mask & (1 << ref_frame)))
+    if (cpi->ref_frame_mask & (1 << ref_frame) && this_mode != NEWMV)
       continue;
 
     // Test best rd so far against threshold for trying this mode.
@@ -3640,11 +3652,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  // If we are using reference masking and the set mask flag is set then
-  // create the reference frame mask.
-  if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
-    cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame[0]);
-
   // Flag all modes that have a distortion thats > 2x the best we found at
   // this level.
   for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
@@ -3796,15 +3803,27 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
-                         ref_frame, block_size, mi_row, mi_col,
-                         frame_mv[NEARESTMV], frame_mv[NEARMV],
-                         yv12_mb);
+      vp9_setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
+                             ref_frame, block_size, mi_row, mi_col,
+                             frame_mv[NEARESTMV], frame_mv[NEARMV],
+                             yv12_mb);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
+  cpi->ref_frame_mask = 0;
+  for (ref_frame = LAST_FRAME;
+       ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
+    int i;
+    for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+      if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) {
+        cpi->ref_frame_mask |= (1 << ref_frame);
+        break;
+      }
+    }
+  }
+
   for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
@@ -3852,11 +3871,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         continue;
     }
 
-    // Skip if the current reference frame has been masked off
-    if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
-        (cpi->ref_frame_mask & (1 << ref_frame)))
-      continue;
-
     // Test best rd so far against threshold for trying this mode.
     if ((best_rd <
          ((int64_t)cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] *
@@ -4366,11 +4380,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  // If we are using reference masking and the set mask flag is set then
-  // create the reference frame mask.
-  if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
-    cpi->ref_frame_mask = ~(1 << vp9_ref_order[best_mode_index].ref_frame[0]);
-
   if (best_rd == INT64_MAX && bsize < BLOCK_8X8) {
     *returnrate = INT_MAX;
     *returndistortion = INT_MAX;
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index f0e8849c1..5732c2b2d 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -27,6 +27,15 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi);
 
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
 
+void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+                            const TileInfo *const tile,
+                            int idx, MV_REFERENCE_FRAME frame_type,
+                            BLOCK_SIZE block_size,
+                            int mi_row, int mi_col,
+                            int_mv frame_nearest_mv[MAX_REF_FRAMES],
+                            int_mv frame_near_mv[MAX_REF_FRAMES],
+                            struct buf_2d yv12_mb[4][MAX_MB_PLANE]);
+
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                int *r, int64_t *d, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd);
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index bd1150b98..b1c029cba 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -74,7 +74,6 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index a7d2e1d6a..a03f7befc 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -25,7 +25,7 @@ typedef vpx_codec_stream_info_t  vp9_stream_info_t;
 
 /* Structures for handling memory allocations */
 typedef enum {
-  VP9_SEG_ALG_PRIV     = 256,
+  VP9_SEG_ALG_PRIV = 256,
   VP9_SEG_MAX
 } mem_seg_id_t;
 #define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0])))
@@ -107,12 +107,11 @@ static void vp9_finalize_mmaps(vpx_codec_alg_priv_t *ctx) {
 
 static vpx_codec_err_t vp9_init(vpx_codec_ctx_t *ctx,
                                 vpx_codec_priv_enc_mr_cfg_t *data) {
-  vpx_codec_err_t        res = VPX_CODEC_OK;
+  vpx_codec_err_t res = VPX_CODEC_OK;
 
-  /* This function only allocates space for the vpx_codec_alg_priv_t
-   * structure. More memory may be required at the time the stream
-   * information becomes known.
-   */
+  // This function only allocates space for the vpx_codec_alg_priv_t
+  // structure. More memory may be required at the time the stream
+  // information becomes known.
   if (!ctx->priv) {
     vpx_codec_mmap_t mmap;
 
@@ -122,12 +121,10 @@ static vpx_codec_err_t vp9_init(vpx_codec_ctx_t *ctx,
     mmap.flags = vp9_mem_req_segs[0].flags;
 
     res = vpx_mmap_alloc(&mmap);
-
     if (!res) {
       vp9_init_ctx(ctx, &mmap);
 
       ctx->priv->alg_priv->defer_alloc = 1;
-      /*post processing level initialized to do nothing */
     }
   }
 
@@ -147,8 +144,7 @@ static vpx_codec_err_t vp9_destroy(vpx_codec_alg_priv_t *ctx) {
   return VPX_CODEC_OK;
 }
 
-static vpx_codec_err_t vp9_peek_si(const uint8_t         *data,
-                                   unsigned int           data_sz,
+static vpx_codec_err_t vp9_peek_si(const uint8_t *data, unsigned int data_sz,
                                    vpx_codec_stream_info_t *si) {
   if (data_sz <= 8) return VPX_CODEC_UNSUP_BITSTREAM;
   if (data + data_sz <= data) return VPX_CODEC_INVALID_PARAM;
@@ -213,13 +209,9 @@ static vpx_codec_err_t vp9_peek_si(const uint8_t         *data,
 
 static vpx_codec_err_t vp9_get_si(vpx_codec_alg_priv_t    *ctx,
                                   vpx_codec_stream_info_t *si) {
-  unsigned int sz;
-
-  if (si->sz >= sizeof(vp9_stream_info_t))
-    sz = sizeof(vp9_stream_info_t);
-  else
-    sz = sizeof(vpx_codec_stream_info_t);
-
+  const size_t sz = (si->sz >= sizeof(vp9_stream_info_t))
+                       ? sizeof(vp9_stream_info_t)
+                       : sizeof(vpx_codec_stream_info_t);
   memcpy(si, &ctx->si, sz);
   si->sz = sz;
 
@@ -227,24 +219,17 @@ static vpx_codec_err_t vp9_get_si(vpx_codec_alg_priv_t    *ctx,
 }
 
 
-static vpx_codec_err_t
-update_error_state(vpx_codec_alg_priv_t                 *ctx,
-                   const struct vpx_internal_error_info *error) {
-  vpx_codec_err_t res;
-
-  if ((res = error->error_code))
-    ctx->base.err_detail = error->has_detail
-                           ? error->detail
-                           : NULL;
+static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx,
+                           const struct vpx_internal_error_info *error) {
+  if (error->error_code)
+    ctx->base.err_detail = error->has_detail ? error->detail : NULL;
 
-  return res;
+  return error->error_code;
 }
 
-static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t  *ctx,
-                                  const uint8_t        **data,
-                                  unsigned int           data_sz,
-                                  void                  *user_priv,
-                                  long                   deadline) {
+static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
+                                  const uint8_t **data, unsigned int data_sz,
+                                  void *user_priv, int64_t deadline) {
   vpx_codec_err_t res = VPX_CODEC_OK;
 
   ctx->img_avail = 0;
@@ -304,13 +289,11 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t  *ctx,
       oxcf.inv_tile_order = ctx->invert_tile_order;
       optr = vp9_create_decompressor(&oxcf);
 
-      /* If postprocessing was enabled by the application and a
-       * configuration has not been provided, default it.
-       */
-      if (!ctx->postproc_cfg_set
-          && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) {
-        ctx->postproc_cfg.post_proc_flag =
-          VP8_DEBLOCK | VP8_DEMACROBLOCK;
+      // If postprocessing was enabled by the application but no
+      // configuration has been provided, fall back to the defaults below.
+      if (!ctx->postproc_cfg_set &&
+          (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) {
+        ctx->postproc_cfg.post_proc_flag = VP8_DEBLOCK | VP8_DEMACROBLOCK;
         ctx->postproc_cfg.deblocking_level = 4;
         ctx->postproc_cfg.noise_level = 0;
       }
@@ -354,25 +337,20 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t  *ctx,
     if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) {
       flags.post_proc_flag =
 #if CONFIG_POSTPROC_VISUALIZER
-          ((ctx->dbg_color_ref_frame_flag != 0) ?
-              VP9D_DEBUG_CLR_FRM_REF_BLKS : 0)
-          | ((ctx->dbg_color_mb_modes_flag != 0) ?
-              VP9D_DEBUG_CLR_BLK_MODES : 0)
-          | ((ctx->dbg_color_b_modes_flag != 0) ?
-              VP9D_DEBUG_CLR_BLK_MODES : 0)
-          | ((ctx->dbg_display_mv_flag != 0) ?
-              VP9D_DEBUG_DRAW_MV : 0)
-          |
+          (ctx->dbg_color_ref_frame_flag ? VP9D_DEBUG_CLR_FRM_REF_BLKS : 0) |
+          (ctx->dbg_color_mb_modes_flag ? VP9D_DEBUG_CLR_BLK_MODES : 0) |
+          (ctx->dbg_color_b_modes_flag ? VP9D_DEBUG_CLR_BLK_MODES : 0) |
+          (ctx->dbg_display_mv_flag ? VP9D_DEBUG_DRAW_MV : 0) |
 #endif
           ctx->postproc_cfg.post_proc_flag;
 
-      flags.deblocking_level      = ctx->postproc_cfg.deblocking_level;
-      flags.noise_level           = ctx->postproc_cfg.noise_level;
+      flags.deblocking_level = ctx->postproc_cfg.deblocking_level;
+      flags.noise_level = ctx->postproc_cfg.noise_level;
 #if CONFIG_POSTPROC_VISUALIZER
       flags.display_ref_frame_flag = ctx->dbg_color_ref_frame_flag;
       flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
-      flags.display_b_modes_flag  = ctx->dbg_color_b_modes_flag;
-      flags.display_mv_flag       = ctx->dbg_display_mv_flag;
+      flags.display_b_modes_flag = ctx->dbg_color_b_modes_flag;
+      flags.display_mv_flag = ctx->dbg_display_mv_flag;
 #endif
     }
 
@@ -391,10 +369,8 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t  *ctx,
   return res;
 }
 
-static void parse_superframe_index(const uint8_t *data,
-                                   size_t         data_sz,
-                                   uint32_t       sizes[8],
-                                   int           *count) {
+static void parse_superframe_index(const uint8_t *data, size_t data_sz,
+                                   uint32_t sizes[8], int *count) {
   uint8_t marker;
 
   assert(data_sz);
@@ -527,11 +503,11 @@ static vpx_codec_err_t vp9_set_frame_buffers(
   return VPX_CODEC_ERROR;
 }
 
-static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t      *ctx,
-                                        vpx_codec_mmap_t           *mmap,
-                                        vpx_codec_iter_t           *iter) {
-  vpx_codec_err_t     res;
-  const mem_req_t  *seg_iter = *iter;
+static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t *ctx,
+                                        vpx_codec_mmap_t *mmap,
+                                        vpx_codec_iter_t *iter) {
+  vpx_codec_err_t res;
+  const mem_req_t *seg_iter = *iter;
 
   /* Get address of next segment request */
   do {
@@ -560,7 +536,7 @@ static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t      *ctx,
   return res;
 }
 
-static vpx_codec_err_t vp9_xma_set_mmap(vpx_codec_ctx_t         *ctx,
+static vpx_codec_err_t vp9_xma_set_mmap(vpx_codec_ctx_t *ctx,
                                         const vpx_codec_mmap_t  *mmap) {
   vpx_codec_err_t res = VPX_CODEC_MEM_ERROR;
   int i, done;
@@ -596,8 +572,7 @@ static vpx_codec_err_t vp9_xma_set_mmap(vpx_codec_ctx_t         *ctx,
   return res;
 }
 
-static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx,
-                                     int ctr_id,
+static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx, int ctr_id,
                                      va_list args) {
   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
 
@@ -606,7 +581,6 @@ static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx,
     YV12_BUFFER_CONFIG sd;
 
     image2yuvconfig(&frame->img, &sd);
-
     return vp9_set_reference_dec(ctx->pbi,
                                  (VP9_REFFRAME)frame->frame_type, &sd);
   } else {
@@ -614,8 +588,7 @@ static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx,
   }
 }
 
-static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx,
-                                      int ctr_id,
+static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx, int ctr_id,
                                       va_list args) {
   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
 
@@ -632,8 +605,7 @@ static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx,
   }
 }
 
-static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
-                                     int ctr_id,
+static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx, int ctr_id,
                                      va_list args) {
   vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);
 
@@ -648,8 +620,7 @@ static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
   }
 }
 
-static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx,
-                                    int ctr_id,
+static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx, int ctr_id,
                                     va_list args) {
 #if CONFIG_VP9_POSTPROC
   vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
@@ -666,8 +637,7 @@ static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx,
 #endif
 }
 
-static vpx_codec_err_t set_dbg_options(vpx_codec_alg_priv_t *ctx,
-                                       int ctrl_id,
+static vpx_codec_err_t set_dbg_options(vpx_codec_alg_priv_t *ctx, int ctrl_id,
                                        va_list args) {
 #if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
   int data = va_arg(args, int);
@@ -688,8 +658,7 @@ static vpx_codec_err_t set_dbg_options(vpx_codec_alg_priv_t *ctx,
 }
 
 static vpx_codec_err_t get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
-                                            int ctrl_id,
-                                            va_list args) {
+                                            int ctrl_id, va_list args) {
   int *update_info = va_arg(args, int *);
   VP9D_COMP *pbi = (VP9D_COMP*)ctx->pbi;
 
@@ -704,8 +673,7 @@ static vpx_codec_err_t get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
 
 
 static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
-                                           int ctrl_id,
-                                           va_list args) {
+                                           int ctrl_id, va_list args) {
   int *corrupted = va_arg(args, int *);
 
   if (corrupted) {
@@ -721,8 +689,7 @@ static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
 }
 
 static vpx_codec_err_t get_display_size(vpx_codec_alg_priv_t *ctx,
-                                        int ctrl_id,
-                                        va_list args) {
+                                        int ctrl_id, va_list args) {
   int *const display_size = va_arg(args, int *);
 
   if (display_size) {