diff options
Diffstat (limited to 'vp9')
46 files changed, 1372 insertions, 1114 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 8ca356dd6..9088b0bde 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -228,8 +228,6 @@ typedef struct macroblockd { DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]); int lossless; - /* Inverse transform function pointers. */ - void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob); int corrupted; diff --git a/vp9/common/vp9_pragmas.h b/vp9/common/vp9_pragmas.h deleted file mode 100644 index 0efc713ca..000000000 --- a/vp9/common/vp9_pragmas.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_COMMON_VP9_PRAGMAS_H_ -#define VP9_COMMON_VP9_PRAGMAS_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef __INTEL_COMPILER -#pragma warning(disable:997 1011 170) -#endif - -#ifdef _MSC_VER -#pragma warning(disable:4799) -#endif - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP9_COMMON_VP9_PRAGMAS_H_ diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 182739620..a18ca8ea6 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -356,7 +356,7 @@ specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/; $vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon; add_proto qw/void vp9_idct16x16_256_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct16x16_256_add sse2 neon_asm dspr2/; +specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/; $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon; add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t 
*dest, int dest_stride"; @@ -431,6 +431,12 @@ specialize qw/vp9_variance8x16 mmx/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance8x8 mmx/, "$sse2_x86inc"; +add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; +specialize qw/vp9_get8x8var mmx/, "$sse2_x86inc"; + +add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; +specialize qw/vp9_get16x16var avx2/, "$sse2_x86inc"; + add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance8x4/, "$sse2_x86inc"; @@ -520,82 +526,82 @@ specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; -add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad64x64/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad32x64/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad64x32/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad32x16/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad16x32/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad32x32/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad16x16 mmx/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad16x8 mmx/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, 
unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad8x16 mmx/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad8x8 mmx/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad8x4/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad4x8/, "$sse_x86inc"; -add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad4x4 mmx/, "$sse_x86inc"; -add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad64x64_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t 
*second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad32x64_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad64x32_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad32x16_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad16x32_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad32x32_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto 
qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad16x16_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad16x8_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad8x16_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad8x8_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad8x4_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad4x8_avg/, "const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad4x8_avg/, "$sse_x86inc"; -add_proto qw/unsigned int vp9_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad4x4_avg/, "$sse_x86inc"; add_proto qw/void vp9_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 0231726dc..ff9c43221 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -8,12 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <assert.h> -#include <emmintrin.h> // SSE2 -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_idct.h" +#include "vp9/common/x86/vp9_idct_intrin_sse2.h" #define RECON_AND_STORE4X4(dest, in_x) \ { \ @@ -527,16 +522,6 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, out7 = _mm_subs_epi16(stp1_0, stp2_7); \ } -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ - dest += stride; \ - } - void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -627,36 +612,6 @@ void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, dc_value); } -// perform 8x8 transpose -static 
INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); -} - static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); @@ -1573,23 +1528,6 @@ void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { } } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = 
tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - static void iadst16_8col(__m128i *in) { // perform 16x16 1-D ADST for 8 columns __m128i s[16], x[16], u[32], v[32]; @@ -2416,82 +2354,6 @@ static void iadst16_sse2(__m128i *in0, __m128i *in1) { iadst16_8col(in1); } -static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { - in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); - in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); - in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); - in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); - in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); - in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); - in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); - in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); - - in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); - in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); - in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); - in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); - in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); - in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); - in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); - in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); -} - -static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1<<5); - const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - in[8] = _mm_adds_epi16(in[8], 
final_rounding); - in[9] = _mm_adds_epi16(in[9], final_rounding); - in[10] = _mm_adds_epi16(in[10], final_rounding); - in[11] = _mm_adds_epi16(in[11], final_rounding); - in[12] = _mm_adds_epi16(in[12], final_rounding); - in[13] = _mm_adds_epi16(in[13], final_rounding); - in[14] = _mm_adds_epi16(in[14], final_rounding); - in[15] = _mm_adds_epi16(in[15], final_rounding); - - in[0] = _mm_srai_epi16(in[0], 6); - in[1] = _mm_srai_epi16(in[1], 6); - in[2] = _mm_srai_epi16(in[2], 6); - in[3] = _mm_srai_epi16(in[3], 6); - in[4] = _mm_srai_epi16(in[4], 6); - in[5] = _mm_srai_epi16(in[5], 6); - in[6] = _mm_srai_epi16(in[6], 6); - in[7] = _mm_srai_epi16(in[7], 6); - in[8] = _mm_srai_epi16(in[8], 6); - in[9] = _mm_srai_epi16(in[9], 6); - in[10] = _mm_srai_epi16(in[10], 6); - in[11] = _mm_srai_epi16(in[11], 6); - in[12] = _mm_srai_epi16(in[12], 6); - in[13] = _mm_srai_epi16(in[13], 6); - in[14] = _mm_srai_epi16(in[14], 6); - in[15] = _mm_srai_epi16(in[15], 6); - - RECON_AND_STORE(dest, in[0]); - RECON_AND_STORE(dest, in[1]); - RECON_AND_STORE(dest, in[2]); - RECON_AND_STORE(dest, in[3]); - RECON_AND_STORE(dest, in[4]); - RECON_AND_STORE(dest, in[5]); - RECON_AND_STORE(dest, in[6]); - RECON_AND_STORE(dest, in[7]); - RECON_AND_STORE(dest, in[8]); - RECON_AND_STORE(dest, in[9]); - RECON_AND_STORE(dest, in[10]); - RECON_AND_STORE(dest, in[11]); - RECON_AND_STORE(dest, in[12]); - RECON_AND_STORE(dest, in[13]); - RECON_AND_STORE(dest, in[14]); - RECON_AND_STORE(dest, in[15]); -} - void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in0[16], in1[16]; diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.h b/vp9/common/x86/vp9_idct_intrin_sse2.h new file mode 100644 index 000000000..1c62e3272 --- /dev/null +++ b/vp9/common/x86/vp9_idct_intrin_sse2.h @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_idct.h" + +// perform 8x8 transpose +static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + + res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); + res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); + res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); + res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); + res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); +} + +static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { + __m128i 
tbuf[8]; + array_transpose_8x8(res0, res0); + array_transpose_8x8(res1, tbuf); + array_transpose_8x8(res0 + 8, res1); + array_transpose_8x8(res1 + 8, res1 + 8); + + res0[8] = tbuf[0]; + res0[9] = tbuf[1]; + res0[10] = tbuf[2]; + res0[11] = tbuf[3]; + res0[12] = tbuf[4]; + res0[13] = tbuf[5]; + res0[14] = tbuf[6]; + res0[15] = tbuf[7]; +} + +static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); + + in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); + in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); + in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); + in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); + in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); + in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); + in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); + in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); +} + +#define RECON_AND_STORE(dest, in_x) \ + { \ + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + _mm_storel_epi64((__m128i *)(dest), d0); \ + dest += stride; \ + } + +static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + 
in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); +} diff --git a/vp9/common/x86/vp9_idct_intrin_ssse3.c b/vp9/common/x86/vp9_idct_intrin_ssse3.c new file mode 100644 index 000000000..0930e7805 --- /dev/null +++ 
b/vp9/common/x86/vp9_idct_intrin_ssse3.c @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#if defined(_MSC_VER) && _MSC_VER <= 1500 +// Need to include math.h before calling tmmintrin.h/intrin.h +// in certain versions of MSVS. +#include <math.h> +#endif +#include <tmmintrin.h> // SSSE3 +#include "vp9/common/x86/vp9_idct_intrin_sse2.h" + +static void idct16_8col(__m128i *in, int round) { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, 
cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i k__cospi_p16_p16_x2 = pair_set_epi16(23170, 23170); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i v[16], u[16], s[16], t[16]; + + // stage 1 + s[0] = in[0]; + s[1] = in[8]; + s[2] = in[4]; + s[3] = in[12]; + s[4] = in[2]; + s[5] = in[10]; + s[6] = in[6]; + s[7] = in[14]; + s[8] = in[1]; + s[9] = in[9]; + s[10] = in[5]; + s[11] = in[13]; + s[12] = in[3]; + s[13] = in[11]; + s[14] = in[7]; + s[15] = in[15]; + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[15]); + u[1] = _mm_unpackhi_epi16(s[8], s[15]); + u[2] = _mm_unpacklo_epi16(s[9], s[14]); + u[3] = _mm_unpackhi_epi16(s[9], s[14]); + u[4] = _mm_unpacklo_epi16(s[10], s[13]); + u[5] = _mm_unpackhi_epi16(s[10], s[13]); + u[6] = _mm_unpacklo_epi16(s[11], s[12]); + u[7] = _mm_unpackhi_epi16(s[11], s[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); + v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); + v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); + v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); + v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); + v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); + v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); + v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); + v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); + v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); + v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); + v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); + v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); + v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); + v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); + v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], 
k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[8] = _mm_packs_epi32(u[0], u[1]); + s[15] = _mm_packs_epi32(u[2], u[3]); + s[9] = _mm_packs_epi32(u[4], u[5]); + s[14] = _mm_packs_epi32(u[6], u[7]); + s[10] = _mm_packs_epi32(u[8], u[9]); + s[13] = _mm_packs_epi32(u[10], u[11]); + s[11] = _mm_packs_epi32(u[12], u[13]); + s[12] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + t[0] = s[0]; + t[1] = s[1]; + t[2] = s[2]; + t[3] = s[3]; + u[0] = _mm_unpacklo_epi16(s[4], s[7]); + u[1] = _mm_unpackhi_epi16(s[4], s[7]); + u[2] = _mm_unpacklo_epi16(s[5], 
s[6]); + u[3] = _mm_unpackhi_epi16(s[5], s[6]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[4] = _mm_packs_epi32(u[0], u[1]); + t[7] = _mm_packs_epi32(u[2], u[3]); + t[5] = _mm_packs_epi32(u[4], u[5]); + t[6] = _mm_packs_epi32(u[6], u[7]); + t[8] = _mm_add_epi16(s[8], s[9]); + t[9] = _mm_sub_epi16(s[8], s[9]); + t[10] = _mm_sub_epi16(s[11], s[10]); + t[11] = _mm_add_epi16(s[10], s[11]); + t[12] = _mm_add_epi16(s[12], s[13]); + t[13] = _mm_sub_epi16(s[12], s[13]); + t[14] = _mm_sub_epi16(s[15], s[14]); + t[15] = _mm_add_epi16(s[14], s[15]); + + // stage 4 + u[0] = _mm_add_epi16(t[0], t[1]); + u[1] = _mm_sub_epi16(t[0], t[1]); + u[2] = _mm_unpacklo_epi16(t[2], t[3]); + u[3] = _mm_unpackhi_epi16(t[2], t[3]); + u[4] = _mm_unpacklo_epi16(t[9], t[14]); + u[5] = _mm_unpackhi_epi16(t[9], t[14]); + u[6] = _mm_unpacklo_epi16(t[10], 
t[13]); + u[7] = _mm_unpackhi_epi16(t[10], t[13]); + + s[0] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2); + s[1] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2); + v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); + v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); + v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); + + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[2] = 
_mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_add_epi16(t[4], t[5]); + s[5] = _mm_sub_epi16(t[4], t[5]); + s[6] = _mm_sub_epi16(t[7], t[6]); + s[7] = _mm_add_epi16(t[6], t[7]); + s[8] = t[8]; + s[15] = t[15]; + s[9] = _mm_packs_epi32(u[8], u[9]); + s[14] = _mm_packs_epi32(u[10], u[11]); + s[10] = _mm_packs_epi32(u[12], u[13]); + s[13] = _mm_packs_epi32(u[14], u[15]); + s[11] = t[11]; + s[12] = t[12]; + + // stage 5 + t[0] = _mm_add_epi16(s[0], s[3]); + t[1] = _mm_add_epi16(s[1], s[2]); + t[2] = _mm_sub_epi16(s[1], s[2]); + t[3] = _mm_sub_epi16(s[0], s[3]); + t[4] = s[4]; + t[7] = s[7]; + + u[0] = _mm_sub_epi16(s[6], s[5]); + u[1] = _mm_add_epi16(s[6], s[5]); + t[5] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2); + t[6] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2); + + t[8] = _mm_add_epi16(s[8], s[11]); + t[9] = _mm_add_epi16(s[9], s[10]); + t[10] = _mm_sub_epi16(s[9], s[10]); + t[11] = _mm_sub_epi16(s[8], s[11]); + t[12] = _mm_sub_epi16(s[15], s[12]); + t[13] = _mm_sub_epi16(s[14], s[13]); + t[14] = _mm_add_epi16(s[13], s[14]); + t[15] = _mm_add_epi16(s[12], s[15]); + + // stage 6 + if (round == 1) { + s[0] = _mm_add_epi16(t[0], t[7]); + s[1] = _mm_add_epi16(t[1], t[6]); + s[2] = _mm_add_epi16(t[2], t[5]); + s[3] = _mm_add_epi16(t[3], t[4]); + s[4] = _mm_sub_epi16(t[3], t[4]); + s[5] = _mm_sub_epi16(t[2], t[5]); + s[6] = _mm_sub_epi16(t[1], t[6]); + s[7] = _mm_sub_epi16(t[0], t[7]); + s[8] = t[8]; + s[9] = t[9]; + + u[0] = _mm_unpacklo_epi16(t[10], t[13]); + u[1] = _mm_unpackhi_epi16(t[10], t[13]); + u[2] = _mm_unpacklo_epi16(t[11], t[12]); + u[3] = _mm_unpackhi_epi16(t[11], t[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[7] = 
_mm_madd_epi16(u[3], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + s[10] = _mm_packs_epi32(u[0], u[1]); + s[13] = _mm_packs_epi32(u[2], u[3]); + s[11] = _mm_packs_epi32(u[4], u[5]); + s[12] = _mm_packs_epi32(u[6], u[7]); + s[14] = t[14]; + s[15] = t[15]; + } else { + s[0] = _mm_add_epi16(t[0], t[7]); + s[1] = _mm_add_epi16(t[1], t[6]); + s[2] = _mm_add_epi16(t[2], t[5]); + s[3] = _mm_add_epi16(t[3], t[4]); + s[4] = _mm_sub_epi16(t[3], t[4]); + s[5] = _mm_sub_epi16(t[2], t[5]); + s[6] = _mm_sub_epi16(t[1], t[6]); + s[7] = _mm_sub_epi16(t[0], t[7]); + s[8] = t[8]; + s[9] = t[9]; + + u[0] = _mm_sub_epi16(t[13], t[10]); + u[1] = _mm_add_epi16(t[13], t[10]); + u[2] = _mm_sub_epi16(t[12], t[11]); + u[3] = _mm_add_epi16(t[12], t[11]); + + s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2); + s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2); + s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2); + s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2); + s[14] = t[14]; + s[15] = t[15]; + } + + // stage 7 + in[0] = _mm_add_epi16(s[0], s[15]); + in[1] = _mm_add_epi16(s[1], s[14]); + in[2] = _mm_add_epi16(s[2], s[13]); + in[3] = _mm_add_epi16(s[3], s[12]); + in[4] = _mm_add_epi16(s[4], s[11]); + in[5] = 
_mm_add_epi16(s[5], s[10]); + in[6] = _mm_add_epi16(s[6], s[9]); + in[7] = _mm_add_epi16(s[7], s[8]); + in[8] = _mm_sub_epi16(s[7], s[8]); + in[9] = _mm_sub_epi16(s[6], s[9]); + in[10] = _mm_sub_epi16(s[5], s[10]); + in[11] = _mm_sub_epi16(s[4], s[11]); + in[12] = _mm_sub_epi16(s[3], s[12]); + in[13] = _mm_sub_epi16(s[2], s[13]); + in[14] = _mm_sub_epi16(s[1], s[14]); + in[15] = _mm_sub_epi16(s[0], s[15]); +} + +static void idct16_sse2(__m128i *in0, __m128i *in1, int round) { + array_transpose_16x16(in0, in1); + idct16_8col(in0, round); + idct16_8col(in1, round); +} + +void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest, + int stride) { + __m128i in0[16], in1[16]; + + load_buffer_8x16(input, in0); + input += 8; + load_buffer_8x16(input, in1); + + idct16_sse2(in0, in1, 0); + idct16_sse2(in0, in1, 1); + + write_buffer_8x16(dest, in0, stride); + dest += 8; + write_buffer_8x16(dest, in1, stride); +} diff --git a/vp9/common/x86/vp9_idct_ssse3.asm b/vp9/common/x86/vp9_idct_ssse3_x86_64.asm index 2c1060710..2c1060710 100644 --- a/vp9/common/x86/vp9_idct_ssse3.asm +++ b/vp9/common/x86/vp9_idct_ssse3_x86_64.asm diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c index b84db970e..d109e136a 100644 --- a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c +++ b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c @@ -111,21 +111,21 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, // filter the source buffer srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together 
srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); // filter the source buffer - srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together @@ -146,21 +146,21 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, // filter the source buffer srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); // filter the source buffer - srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together @@ -208,26 +208,26 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = 
_mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt2Reg)); + _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(secondFilters)); + _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); // filter the source buffer srcRegFilt3= _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt4Reg)); + _mm256_castsi256_si128(filt2Reg)); srcRegFilt2= _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, - _mm256_castsi256_si128(forthFilters)); + _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); @@ -247,26 +247,26 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt2Reg)); + _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(secondFilters)); + _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); // filter the source buffer srcRegFilt3 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt4Reg)); + _mm256_castsi256_si128(filt2Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, - 
_mm256_castsi256_si128(forthFilters)); + _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c index cf28d8d2b..c4efa6565 100644 --- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c +++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c @@ -44,7 +44,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, unsigned int output_pitch, unsigned int output_height, int16_t *filter) { - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i firstFilters, secondFilters, shuffle1, shuffle2; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; __m128i addFilterReg64, filtersReg, srcReg, minReg; unsigned int i; @@ -61,20 +61,22 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, // duplicate only the third 16 bit in the filter into the first lane secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); // duplicate only the seconds 16 bits in the filter into the second lane + // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); // duplicate only the forth 16 bits in the filter into the second lane + // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); // loading the local filters - thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8); - forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8); + shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); + shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); for (i = 0; i < output_height; i++) { srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); // filter the source buffer - srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters); - srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters); + srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); + srcRegFilt2= 
_mm_shuffle_epi8(srcReg, shuffle2); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); @@ -164,12 +166,12 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); // add and saturate all the results together - minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3); + srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); // shift by 7 bit each 16 bits @@ -229,21 +231,21 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, // filter the source buffer srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg); + srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, 
thirdFilters); // add and saturate the results together @@ -260,21 +262,21 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, // filter the source buffer srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg); + srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg); srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); // add and saturate the results together diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm index 634fa7746..fd781d4bc 100644 --- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm +++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm @@ -272,22 +272,23 @@ punpcklbw xmm2, xmm3 ;C D punpcklbw xmm4, xmm5 ;E F - movq xmm6, [rsi + rbx + 8] ;G movq xmm7, [rax + rbx + 8] ;H punpcklbw xmm6, xmm7 ;G H - pmaddubsw xmm0, k0k1 pmaddubsw xmm2, k2k3 pmaddubsw xmm4, k4k5 pmaddubsw xmm6, k6k7 paddsw xmm0, xmm6 - paddsw xmm0, xmm2 + movdqa xmm1, xmm2 + pmaxsw xmm2, xmm4 + pminsw xmm4, xmm1 paddsw xmm0, xmm4 - paddsw xmm0, krd + paddsw xmm0, xmm2 + paddsw xmm0, krd psraw xmm0, 7 packuswb xmm0, xmm0 diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index de58939fc..fc70035f2 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ 
b/vp9/decoder/vp9_decodeframe.c @@ -195,30 +195,32 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, struct macroblockd_plane *const pd = &xd->plane[plane]; if (eob > 0) { TX_TYPE tx_type; - const PLANE_TYPE plane_type = pd->plane_type; int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - switch (tx_size) { - case TX_4X4: - tx_type = get_tx_type_4x4(plane_type, xd, block); - if (tx_type == DCT_DCT) - xd->itxm_add(dqcoeff, dst, stride, eob); - else - vp9_iht4x4_16_add(dqcoeff, dst, stride, tx_type); - break; - case TX_8X8: - tx_type = get_tx_type(plane_type, xd); - vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob); - break; - case TX_16X16: - tx_type = get_tx_type(plane_type, xd); - vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob); - break; - case TX_32X32: - tx_type = DCT_DCT; - vp9_idct32x32_add(dqcoeff, dst, stride, eob); - break; - default: - assert(0 && "Invalid transform size"); + if (xd->lossless) { + tx_type = DCT_DCT; + vp9_iwht4x4_add(dqcoeff, dst, stride, eob); + } else { + const PLANE_TYPE plane_type = pd->plane_type; + switch (tx_size) { + case TX_4X4: + tx_type = get_tx_type_4x4(plane_type, xd, block); + vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_8X8: + tx_type = get_tx_type(plane_type, xd); + vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_16X16: + tx_type = get_tx_type(plane_type, xd); + vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_32X32: + tx_type = DCT_DCT; + vp9_idct32x32_add(dqcoeff, dst, stride, eob); + break; + default: + assert(0 && "Invalid transform size"); + } } if (eob == 1) { @@ -588,8 +590,6 @@ static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd, cm->y_dc_delta_q == 0 && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; - - xd->itxm_add = xd->lossless ? 
vp9_iwht4x4_add : vp9_idct4x4_add; } static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer *rb) { @@ -802,7 +802,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, CHECK_MEM_ERROR( cm, pbi->tile_data, - vpx_malloc(tile_cols * tile_rows * (sizeof(*pbi->tile_data)))); + vpx_memalign(32, tile_cols * tile_rows * (sizeof(*pbi->tile_data)))); pbi->total_tiles = tile_rows * tile_cols; } @@ -1317,16 +1317,15 @@ static struct vp9_read_bit_buffer* init_read_bit_buffer( return rb; } -int vp9_decode_frame(VP9Decoder *pbi, - const uint8_t *data, const uint8_t *data_end, - const uint8_t **p_data_end) { +void vp9_decode_frame(VP9Decoder *pbi, + const uint8_t *data, const uint8_t *data_end, + const uint8_t **p_data_end) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; struct vp9_read_bit_buffer rb = { 0 }; uint8_t clear_data[MAX_VP9_HEADER_SIZE]; const size_t first_partition_size = read_uncompressed_header(pbi, init_read_bit_buffer(pbi, &rb, data, data_end, clear_data)); - const int keyframe = cm->frame_type == KEY_FRAME; const int tile_rows = 1 << cm->log2_tile_rows; const int tile_cols = 1 << cm->log2_tile_cols; YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); @@ -1335,12 +1334,9 @@ int vp9_decode_frame(VP9Decoder *pbi, if (!first_partition_size) { // showing a frame directly *p_data_end = data + 1; - return 0; + return; } - if (!pbi->decoded_key_frame && !keyframe) - return -1; - data += vp9_rb_bytes_read(&rb); if (!read_is_valid(data, first_partition_size, data_end)) vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, @@ -1377,14 +1373,6 @@ int vp9_decode_frame(VP9Decoder *pbi, new_fb->corrupted |= xd->corrupted; - if (!pbi->decoded_key_frame) { - if (keyframe && !new_fb->corrupted) - pbi->decoded_key_frame = 1; - else - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "A stream must start with a complete key frame"); - } - if (!new_fb->corrupted) { if (!cm->error_resilient_mode && 
!cm->frame_parallel_decoding_mode) { vp9_adapt_coef_probs(cm); @@ -1400,6 +1388,4 @@ int vp9_decode_frame(VP9Decoder *pbi, if (cm->refresh_frame_context) cm->frame_contexts[cm->frame_context_idx] = cm->fc; - - return 0; } diff --git a/vp9/decoder/vp9_decodeframe.h b/vp9/decoder/vp9_decodeframe.h index 8a19dafc5..fb15645a9 100644 --- a/vp9/decoder/vp9_decodeframe.h +++ b/vp9/decoder/vp9_decodeframe.h @@ -21,9 +21,9 @@ struct VP9Decoder; void vp9_init_dequantizer(struct VP9Common *cm); -int vp9_decode_frame(struct VP9Decoder *pbi, - const uint8_t *data, const uint8_t *data_end, - const uint8_t **p_data_end); +void vp9_decode_frame(struct VP9Decoder *pbi, + const uint8_t *data, const uint8_t *data_end, + const uint8_t **p_data_end); #ifdef __cplusplus } // extern "C" diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 8902f179d..13d79ff44 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -67,7 +67,6 @@ VP9Decoder *vp9_decoder_create() { cm->current_video_frame = 0; pbi->ready_for_new_data = 1; - pbi->decoded_key_frame = 0; // vp9_init_dequantizer() is first called here. 
Add check in // frame_init_dequantizer() to avoid unnecessary calling of @@ -267,15 +266,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, cm->error.setjmp = 1; - retcode = vp9_decode_frame(pbi, source, source + size, psource); - - if (retcode < 0) { - cm->error.error_code = VPX_CODEC_ERROR; - cm->error.setjmp = 0; - if (cm->frame_bufs[cm->new_fb_idx].ref_count > 0) - cm->frame_bufs[cm->new_fb_idx].ref_count--; - return retcode; - } + vp9_decode_frame(pbi, source, source + size, psource); swap_frame_buffers(pbi); diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index 1a5576e5a..a6edf0cbd 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -43,8 +43,6 @@ typedef struct VP9Decoder { int refresh_frame_flags; - int decoded_key_frame; - VP9Worker lf_worker; VP9Worker *tile_workers; int num_tile_workers; diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c index 47ad8d8cc..0d6b41d15 100644 --- a/vp9/encoder/vp9_aq_complexity.c +++ b/vp9/encoder/vp9_aq_complexity.c @@ -47,11 +47,21 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { // Use some of the segments for in frame Q adjustment. for (segment = 1; segment < 2; segment++) { - const int qindex_delta = + int qindex_delta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex, in_frame_q_adj_ratio[segment]); - vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q); - vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta); + + // For AQ mode 2, we dont allow Q0 in a segment if the base Q is not 0. + // Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment Q delta + // is sometimes applied without going back around the rd loop. + // This could lead to an illegal combination of partition size and q. 
+ if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) { + qindex_delta = -cm->base_qindex + 1; + } + if ((cm->base_qindex + qindex_delta) > 0) { + vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q); + vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta); + } } } } diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 8ef2b2eed..76f5e7bbe 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -20,7 +20,6 @@ #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_mvref_common.h" -#include "vp9/common/vp9_pragmas.h" #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_systemdependent.h" diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index c406860a0..c3cd93b78 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -109,6 +109,7 @@ struct macroblock { MV pred_mv[MAX_REF_FRAMES]; void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride); + void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob); }; #ifdef __cplusplus diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index ef33fcaf1..37fb0f3c6 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -76,18 +76,6 @@ typedef struct { unsigned int var; } diff; -static void get_sse_sum_8x8(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { - variance(src, src_stride, ref, ref_stride, 8, 8, sse, sum); -} - -static void get_sse_sum_16x16(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { - variance(src, src_stride, ref, ref_stride, 16, 16, sse, sum); -} - static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs) { @@ -490,8 +478,8 @@ static void choose_partitioning(VP9_COMP *cpi, unsigned int sse = 
0; int sum = 0; if (x_idx < pixels_wide && y_idx < pixels_high) - get_sse_sum_8x8(s + y_idx * sp + x_idx, sp, - d + y_idx * dp + x_idx, dp, &sse, &sum); + vp9_get8x8var(s + y_idx * sp + x_idx, sp, + d + y_idx * dp + x_idx, dp, &sse, &sum); fill_variance(sse, sum, 64, &vst->split[k].part_variances.none); } } @@ -1226,9 +1214,9 @@ static void set_source_var_based_partition(VP9_COMP *cpi, int b_offset = b_mi_row * MI_SIZE * src_stride + b_mi_col * MI_SIZE; - get_sse_sum_16x16(src + b_offset, src_stride, - pre_src + b_offset, pre_stride, - &d16[j].sse, &d16[j].sum); + vp9_get16x16var(src + b_offset, src_stride, + pre_src + b_offset, pre_stride, + &d16[j].sse, &d16[j].sum); d16[j].var = d16[j].sse - (((uint32_t)d16[j].sum * d16[j].sum) >> 8); @@ -1303,14 +1291,14 @@ static int is_background(VP9_COMP *cpi, const TileInfo *const tile, if (row8x8_remaining >= MI_BLOCK_SIZE && col8x8_remaining >= MI_BLOCK_SIZE) { this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride, - pre, pre_stride, 0x7fffffff); + pre, pre_stride); threshold = (1 << 12); } else { int r, c; for (r = 0; r < row8x8_remaining; r += 2) for (c = 0; c < col8x8_remaining; c += 2) - this_sad += cpi->fn_ptr[BLOCK_16X16].sdf(src, src_stride, pre, - pre_stride, 0x7fffffff); + this_sad += cpi->fn_ptr[BLOCK_16X16].sdf(src, src_stride, + pre, pre_stride); threshold = (row8x8_remaining * col8x8_remaining) << 6; } @@ -1318,7 +1306,8 @@ static int is_background(VP9_COMP *cpi, const TileInfo *const tile, return x->in_static_area; } -static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8) { +static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8, + const int motion_thresh) { const int mis = cm->mi_stride; int block_row, block_col; @@ -1327,8 +1316,8 @@ static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8) { for (block_col = 0; block_col < 8; ++block_col) { const MODE_INFO *prev_mi = prev_mi_8x8[block_row * mis + block_col]; if (prev_mi) { - if 
(abs(prev_mi->mbmi.mv[0].as_mv.row) >= 8 || - abs(prev_mi->mbmi.mv[0].as_mv.col) >= 8) + if (abs(prev_mi->mbmi.mv[0].as_mv.row) > motion_thresh || + abs(prev_mi->mbmi.mv[0].as_mv.col) > motion_thresh) return 1; } } @@ -2324,7 +2313,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, || cpi->rc.is_src_frame_alt_ref || ((sf->use_lastframe_partitioning == LAST_FRAME_PARTITION_LOW_MOTION) && - sb_has_motion(cm, prev_mi))) { + sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))) { // If required set upper and lower partition size limits if (sf->auto_min_max_partition_size) { set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); @@ -2337,7 +2326,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, cpi->pc_root); } else { if (sf->constrain_copy_partition && - sb_has_motion(cm, prev_mi)) + sb_has_motion(cm, prev_mi, sf->lf_motion_threshold)) constrain_copy_partitioning(cpi, tile, mi, prev_mi, mi_row, mi_col, BLOCK_16X16); else @@ -2384,7 +2373,7 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { if (lossless) { // printf("Switching to lossless\n"); cpi->mb.fwd_txm4x4 = vp9_fwht4x4; - cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add; + cpi->mb.itxm_add = vp9_iwht4x4_add; cpi->mb.optimize = 0; cpi->common.lf.filter_level = 0; cpi->zbin_mode_boost_enabled = 0; @@ -2392,7 +2381,7 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { } else { // printf("Not lossless\n"); cpi->mb.fwd_txm4x4 = vp9_fdct4x4; - cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add; + cpi->mb.itxm_add = vp9_idct4x4_add; } } @@ -2869,7 +2858,7 @@ static void nonrd_use_partition(VP9_COMP *cpi, if (mi_row + hbs < cm->mi_rows) { nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col, &rate, &dist, subsize); - pc_tree->horizontal[1].mic.mbmi = mi[0]->mbmi; + pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi; if (rate != INT_MAX && dist != INT64_MAX && *totrate != INT_MAX && *totdist != INT64_MAX) { *totrate += rate; @@ -3368,7 +3357,8 @@ static void 
encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); } else { mbmi->skip = 1; - if (output_enabled) + if (output_enabled && + !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) cm->counts.skip[vp9_get_skip_context(xd)][1]++; reset_skip_context(xd, MAX(bsize, BLOCK_8X8)); } diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 3b231b7f2..8581e6117 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -406,7 +406,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; default: assert(0 && "Invalid transform size"); @@ -428,7 +428,7 @@ static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize, vp9_xform_quant(x, plane, block, plane_bsize, tx_size); if (p->eobs[block] > 0) - xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); } void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) { @@ -574,7 +574,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - xd->itxm_add(dqcoeff, dst, dst_stride, *eob); + x->itxm_add(dqcoeff, dst, dst_stride, *eob); else vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 645503103..c955d27c0 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -602,9 +602,9 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { // is set. 
cpi->oxcf.worst_allowed_q = 0; cpi->oxcf.best_allowed_q = 0; - cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add; + cpi->mb.itxm_add = vp9_iwht4x4_add; } else { - cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add; + cpi->mb.itxm_add = vp9_idct4x4_add; } rc->baseline_gf_interval = DEFAULT_GF_INTERVAL; cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; @@ -632,9 +632,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cpi->oxcf.maximum_buffer_size = 240000; } - // Convert target bandwidth from Kbit/s to Bit/s - cpi->oxcf.target_bandwidth *= 1000; - cpi->oxcf.starting_buffer_level = vp9_rescale(cpi->oxcf.starting_buffer_level, cpi->oxcf.target_bandwidth, 1000); @@ -2103,8 +2100,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_clear_system_state(); - vp9_zero(cpi->rd.tx_select_threshes); - #if CONFIG_VP9_POSTPROC if (cpi->oxcf.noise_sensitivity > 0) { int l = 0; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index fce5b28f3..dc3832b16 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -64,8 +64,7 @@ #define MIN_GF_INTERVAL 4 #endif - -// #define LONG_TERM_VBR_CORRECTION +#define LONG_TERM_VBR_CORRECTION static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) { YV12_BUFFER_CONFIG temp = *a; @@ -170,7 +169,6 @@ static void zero_stats(FIRSTPASS_STATS *section) { section->intra_error = 0.0; section->coded_error = 0.0; section->sr_coded_error = 0.0; - section->ssim_weighted_pred_err = 0.0; section->pcnt_inter = 0.0; section->pcnt_motion = 0.0; section->pcnt_second_ref = 0.0; @@ -195,7 +193,6 @@ static void accumulate_stats(FIRSTPASS_STATS *section, section->intra_error += frame->intra_error; section->coded_error += frame->coded_error; section->sr_coded_error += frame->sr_coded_error; - section->ssim_weighted_pred_err += frame->ssim_weighted_pred_err; section->pcnt_inter += frame->pcnt_inter; section->pcnt_motion += frame->pcnt_motion; section->pcnt_second_ref += 
frame->pcnt_second_ref; @@ -218,7 +215,6 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->intra_error -= frame->intra_error; section->coded_error -= frame->coded_error; section->sr_coded_error -= frame->sr_coded_error; - section->ssim_weighted_pred_err -= frame->ssim_weighted_pred_err; section->pcnt_inter -= frame->pcnt_inter; section->pcnt_motion -= frame->pcnt_motion; section->pcnt_second_ref -= frame->pcnt_second_ref; @@ -242,7 +238,6 @@ static void avg_stats(FIRSTPASS_STATS *section) { section->intra_error /= section->count; section->coded_error /= section->count; section->sr_coded_error /= section->count; - section->ssim_weighted_pred_err /= section->count; section->pcnt_inter /= section->count; section->pcnt_second_ref /= section->count; section->pcnt_neutral /= section->count; @@ -259,86 +254,18 @@ static void avg_stats(FIRSTPASS_STATS *section) { // Calculate a modified Error used in distributing bits between easier and // harder frames. -static double calculate_modified_err(const VP9_COMP *cpi, +static double calculate_modified_err(const TWO_PASS *twopass, + const VP9EncoderConfig *oxcf, const FIRSTPASS_STATS *this_frame) { - const TWO_PASS *twopass = &cpi->twopass; - const SVC *const svc = &cpi->svc; - const FIRSTPASS_STATS *stats; - double av_err; - double modified_error; - - if (svc->number_spatial_layers > 1 && - svc->number_temporal_layers == 1) { - twopass = &svc->layer_context[svc->spatial_layer_id].twopass; - } - - stats = &twopass->total_stats; - av_err = stats->ssim_weighted_pred_err / stats->count; - modified_error = av_err * pow(this_frame->ssim_weighted_pred_err / - DOUBLE_DIVIDE_CHECK(av_err), - cpi->oxcf.two_pass_vbrbias / 100.0); - + const FIRSTPASS_STATS *const stats = &twopass->total_stats; + const double av_err = stats->coded_error / stats->count; + const double modified_error = av_err * + pow(this_frame->coded_error / DOUBLE_DIVIDE_CHECK(av_err), + oxcf->two_pass_vbrbias / 100.0); return fclamp(modified_error, 
twopass->modified_error_min, twopass->modified_error_max); } -static const double weight_table[256] = { - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.031250, 0.062500, - 0.093750, 0.125000, 0.156250, 0.187500, 0.218750, 0.250000, 0.281250, - 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750, 0.500000, - 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750, - 0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, - 0.968750, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000 -}; - -static double simple_weight(const YV12_BUFFER_CONFIG *buf) { - int i, j; - double sum = 0.0; - const int w = buf->y_crop_width; - const int h = buf->y_crop_height; - const uint8_t *row = buf->y_buffer; - - for (i = 0; i < h; ++i) { - const uint8_t *pixel = row; - for (j = 0; j < w; ++j) - sum += weight_table[*pixel++]; - row += buf->y_stride; - } - - return MAX(0.1, sum / (w * h)); -} - // This function returns the maximum target rate per frame. 
static int frame_max_bits(const RATE_CONTROL *rc, const VP9EncoderConfig *oxcf) { @@ -533,6 +460,7 @@ void vp9_first_pass(VP9_COMP *cpi) { TWO_PASS *twopass = &cpi->twopass; const MV zero_mv = {0, 0}; const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; + FIRSTPASS_STATS fps; vp9_clear_system_state(); @@ -830,14 +758,11 @@ void vp9_first_pass(VP9_COMP *cpi) { vp9_clear_system_state(); { - FIRSTPASS_STATS fps; - fps.frame = cm->current_video_frame; fps.spatial_layer_id = cpi->svc.spatial_layer_id; fps.intra_error = (double)(intra_error >> 8); fps.coded_error = (double)(coded_error >> 8); fps.sr_coded_error = (double)(sr_coded_error >> 8); - fps.ssim_weighted_pred_err = fps.coded_error * simple_weight(cpi->Source); fps.count = 1.0; fps.pcnt_inter = (double)intercount / cm->MBs; fps.pcnt_second_ref = (double)second_ref_count / cm->MBs; @@ -871,8 +796,7 @@ void vp9_first_pass(VP9_COMP *cpi) { fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start); // Don't want to do output stats with a stack variable! - twopass->this_frame_stats = fps; - output_stats(&twopass->this_frame_stats, cpi->output_pkt_list); + output_stats(&fps, cpi->output_pkt_list); accumulate_stats(&twopass->total_stats, &fps); } @@ -880,9 +804,9 @@ void vp9_first_pass(VP9_COMP *cpi) { // the prediction is good enough... but also don't allow it to lag too far. if ((twopass->sr_update_lag > 3) || ((cm->current_video_frame > 0) && - (twopass->this_frame_stats.pcnt_inter > 0.20) && - ((twopass->this_frame_stats.intra_error / - DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) { + (fps.pcnt_inter > 0.20) && + ((fps.intra_error / + DOUBLE_DIVIDE_CHECK(fps.coded_error)) > 2.0))) { if (gld_yv12 != NULL) { vp8_yv12_copy_frame(lst_yv12, gld_yv12); } @@ -1043,25 +967,19 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // Scan the first pass file and calculate a modified total error based upon // the bias/power function used to allocate bits. 
{ - const FIRSTPASS_STATS *const start_pos = twopass->stats_in; - FIRSTPASS_STATS this_frame; - const double av_error = stats->ssim_weighted_pred_err / - DOUBLE_DIVIDE_CHECK(stats->count); - - - twopass->modified_error_total = 0.0; - twopass->modified_error_min = - (av_error * oxcf->two_pass_vbrmin_section) / 100; - twopass->modified_error_max = - (av_error * oxcf->two_pass_vbrmax_section) / 100; - - while (input_stats(twopass, &this_frame) != EOF) { - twopass->modified_error_total += - calculate_modified_err(cpi, &this_frame); + const double avg_error = stats->coded_error / + DOUBLE_DIVIDE_CHECK(stats->count); + const FIRSTPASS_STATS *s = twopass->stats_in; + double modified_error_total = 0.0; + twopass->modified_error_min = (avg_error * + oxcf->two_pass_vbrmin_section) / 100; + twopass->modified_error_max = (avg_error * + oxcf->two_pass_vbrmax_section) / 100; + while (s < twopass->stats_in_end) { + modified_error_total += calculate_modified_err(twopass, oxcf, s); + ++s; } - twopass->modified_error_left = twopass->modified_error_total; - - reset_fpf_position(twopass, start_pos); + twopass->modified_error_left = modified_error_total; } // Reset the vbr bits off target counter @@ -1182,16 +1100,17 @@ static void accumulate_frame_motion_stats( } // Calculate a baseline boost number for the current frame. -static double calc_frame_boost(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame, +static double calc_frame_boost(const TWO_PASS *twopass, + const FIRSTPASS_STATS *this_frame, double this_frame_mv_in_out) { double frame_boost; // Underlying boost factor is based on inter intra error ratio. 
- if (this_frame->intra_error > cpi->twopass.gf_intra_err_min) + if (this_frame->intra_error > twopass->gf_intra_err_min) frame_boost = (IIFACTOR * this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)); else - frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min / + frame_boost = (IIFACTOR * twopass->gf_intra_err_min / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)); // Increase boost for frames where new data coming into frame (e.g. zoom out). @@ -1244,8 +1163,8 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, ? MIN_DECAY_FACTOR : decay_accumulator; } - boost_score += (decay_accumulator * - calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out)); + boost_score += decay_accumulator * calc_frame_boost(twopass, &this_frame, + this_frame_mv_in_out); } *f_boost = (int)boost_score; @@ -1281,8 +1200,8 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, ? MIN_DECAY_FACTOR : decay_accumulator; } - boost_score += (decay_accumulator * - calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out)); + boost_score += decay_accumulator * calc_frame_boost(twopass, &this_frame, + this_frame_mv_in_out); } *b_boost = (int)boost_score; @@ -1500,6 +1419,66 @@ static int calculate_boost_bits(int frame_count, return MAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), 0); } +static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, + double group_error, int gf_arf_bits) { + RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *twopass = &cpi->twopass; + FIRSTPASS_STATS frame_stats; + int i; + int group_frame_index = 1; + int target_frame_size; + int key_frame; + const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf); + int64_t total_group_bits = gf_group_bits; + double modified_err = 0.0; + double err_fraction; + + key_frame = cpi->common.frame_type == KEY_FRAME || + vp9_is_upper_layer_key_frame(cpi); + + // For key frames the frame target rate is already set and it + // is also the 
golden frame. + // NOTE: We dont bother to check for the special case of ARF overlay + // frames here, as there is clamping code for this in the function + // vp9_rc_clamp_pframe_target_size(), which applies to one and two pass + // encodes. + if (!key_frame) { + twopass->gf_group_bit_allocation[0] = gf_arf_bits; + + // Step over the golden frame / overlay frame + if (EOF == input_stats(twopass, &frame_stats)) + return; + } + + // Store the bits to spend on the ARF if there is one. + if (rc->source_alt_ref_pending) { + twopass->gf_group_bit_allocation[group_frame_index++] = gf_arf_bits; + } + + // Deduct the boost bits for arf or gf if it is not a key frame. + if (rc->source_alt_ref_pending || !key_frame) + total_group_bits -= gf_arf_bits; + + // Allocate bits to the other frames in the group. + for (i = 0; i < rc->baseline_gf_interval - 1; ++i) { + if (EOF == input_stats(twopass, &frame_stats)) + break; + + modified_err = calculate_modified_err(twopass, oxcf, &frame_stats); + + if (group_error > 0) + err_fraction = modified_err / DOUBLE_DIVIDE_CHECK(group_error); + else + err_fraction = 0.0; + + target_frame_size = (int)((double)total_group_bits * err_fraction); + target_frame_size = clamp(target_frame_size, 0, + MIN(max_bits, (int)total_group_bits)); + + twopass->gf_group_bit_allocation[group_frame_index++] = target_frame_size; + } +} // Analyse and define a gf/arf group. 
static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { @@ -1509,6 +1488,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { FIRSTPASS_STATS next_frame; const FIRSTPASS_STATS *const start_pos = twopass->stats_in; int i; + double boost_score = 0.0; double old_boost_score = 0.0; double gf_group_err = 0.0; @@ -1532,14 +1512,24 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int b_boost = 0; int flash_detected; int active_max_gf_interval; + int64_t gf_group_bits; + double gf_group_error_left; + int gf_arf_bits; + + // Reset the GF group data structures unless this is a key + // frame in which case it will already have been done. + if (cpi->common.frame_type != KEY_FRAME) { + twopass->gf_group_index = 0; + vp9_zero(twopass->gf_group_bit_allocation); + } vp9_clear_system_state(); vp9_zero(next_frame); - twopass->gf_group_bits = 0; + gf_group_bits = 0; // Load stats for the current frame. - mod_frame_err = calculate_modified_err(cpi, this_frame); + mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame); // Note the error of the frame at the start of the group. This will be // the GF frame error if we code a normal gf. @@ -1571,7 +1561,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { ++i; // Accumulate error score of frames in this gf group. - mod_frame_err = calculate_modified_err(cpi, this_frame); + mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame); gf_group_err += mod_frame_err; if (EOF == input_stats(twopass, &next_frame)) @@ -1610,12 +1600,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Calculate a boost number for this frame. - boost_score += (decay_accumulator * - calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out)); + boost_score += decay_accumulator * calc_frame_boost(twopass, &next_frame, + this_frame_mv_in_out); // Break out conditions. 
if ( - // Break at cpi->max_gf_interval unless almost totally static. + // Break at active_max_gf_interval unless almost totally static. (i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) || ( // Don't break out with a very short interval. @@ -1646,7 +1636,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { break; if (i < rc->frames_to_key) { - mod_frame_err = calculate_modified_err(cpi, this_frame); + mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame); gf_group_err += mod_frame_err; } } @@ -1742,7 +1732,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { reset_fpf_position(twopass, start_pos); // Calculate the bits to be allocated to the gf/arf group as a whole - twopass->gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err); + gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err); // Calculate the extra bits to be used for boosted frame(s) { @@ -1753,19 +1743,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { boost = clamp(boost, 125, (rc->baseline_gf_interval + 1) * 200); // Calculate the extra bits to be used for boosted frame(s) - twopass->gf_bits = calculate_boost_bits(rc->baseline_gf_interval, - boost, twopass->gf_group_bits); - - - // For key frames the frame target rate is set already. - // NOTE: We dont bother to check for the special case of ARF overlay - // frames here, as there is clamping code for this in the function - // vp9_rc_clamp_pframe_target_size(), which applies to one and two pass - // encodes. - if (cpi->common.frame_type != KEY_FRAME && - !vp9_is_upper_layer_key_frame(cpi)) { - vp9_rc_set_frame_target(cpi, twopass->gf_bits); - } + gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, + boost, gf_group_bits); } // Adjust KF group bits and error remaining. 
@@ -1778,14 +1757,19 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // For normal GFs remove the score for the GF itself unless this is // also a key frame in which case it has already been accounted for. if (rc->source_alt_ref_pending) { - twopass->gf_group_error_left = (int64_t)(gf_group_err - mod_frame_err); + gf_group_error_left = gf_group_err - mod_frame_err; } else if (cpi->common.frame_type != KEY_FRAME) { - twopass->gf_group_error_left = (int64_t)(gf_group_err - - gf_first_frame_err); + gf_group_error_left = gf_group_err - gf_first_frame_err; } else { - twopass->gf_group_error_left = (int64_t)gf_group_err; + gf_group_error_left = gf_group_err; } + // Allocate bits to each of the frames in the GF group. + allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits); + + // Reset the file position. + reset_fpf_position(twopass, start_pos); + // Calculate a section intra ratio used in setting max loop filter. if (cpi->common.frame_type != KEY_FRAME) { twopass->section_intra_rating = @@ -1794,37 +1778,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } } -// Allocate bits to a normal frame that is neither a gf an arf or a key frame. -static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { - TWO_PASS *twopass = &cpi->twopass; - // For a single frame. - const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf); - // Calculate modified prediction error used in bit allocation. - const double modified_err = calculate_modified_err(cpi, this_frame); - int target_frame_size; - double err_fraction; - - if (twopass->gf_group_error_left > 0) - // What portion of the remaining GF group error is used by this frame. - err_fraction = modified_err / twopass->gf_group_error_left; - else - err_fraction = 0.0; - - // How many of those bits available for allocation should we give it? 
- target_frame_size = (int)((double)twopass->gf_group_bits * err_fraction); - - // Clip target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at - // the top end. - target_frame_size = clamp(target_frame_size, 0, - MIN(max_bits, (int)twopass->gf_group_bits)); - - // Adjust error and bits remaining. - twopass->gf_group_error_left -= (int64_t)modified_err; - - // Per frame bit target for this frame. - vp9_rc_set_frame_target(cpi, target_frame_size); -} - static int test_candidate_kf(TWO_PASS *twopass, const FIRSTPASS_STATS *last_frame, const FIRSTPASS_STATS *this_frame, @@ -1906,10 +1859,12 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int i, j; RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->twopass; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; const FIRSTPASS_STATS first_frame = *this_frame; const FIRSTPASS_STATS *const start_position = twopass->stats_in; FIRSTPASS_STATS next_frame; FIRSTPASS_STATS last_frame; + int kf_bits = 0; double decay_accumulator = 1.0; double zero_motion_accumulator = 1.0; double boost_score = 0.0; @@ -1921,6 +1876,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { cpi->common.frame_type = KEY_FRAME; + // Reset the GF group data structures. + twopass->gf_group_index = 0; + vp9_zero(twopass->gf_group_bit_allocation); + // Is this a forced key frame by interval. rc->this_key_frame_forced = rc->next_key_frame_forced; @@ -1935,14 +1894,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->kf_group_bits = 0; // Total bits available to kf group twopass->kf_group_error_left = 0; // Group modified error score. - kf_mod_err = calculate_modified_err(cpi, this_frame); + kf_mod_err = calculate_modified_err(twopass, oxcf, this_frame); // Find the next keyframe. i = 0; while (twopass->stats_in < twopass->stats_in_end && rc->frames_to_key < cpi->oxcf.key_freq) { // Accumulate kf group error. 
- kf_group_err += calculate_modified_err(cpi, this_frame); + kf_group_err += calculate_modified_err(twopass, oxcf, this_frame); // Load the next frame's stats. last_frame = *this_frame; @@ -2004,7 +1963,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Rescan to get the correct error data for the forced kf group. for (i = 0; i < rc->frames_to_key; ++i) { - kf_group_err += calculate_modified_err(cpi, &tmp_frame); + kf_group_err += calculate_modified_err(twopass, oxcf, &tmp_frame); input_stats(twopass, &tmp_frame); } rc->next_key_frame_forced = 1; @@ -2018,7 +1977,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Special case for the last key frame of the file. if (twopass->stats_in >= twopass->stats_in_end) { // Accumulate kf group error. - kf_group_err += calculate_modified_err(cpi, this_frame); + kf_group_err += calculate_modified_err(twopass, oxcf, this_frame); } // Calculate the number of bits that should be assigned to the kf group. @@ -2086,6 +2045,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } } + reset_fpf_position(twopass, start_position); + // Store the zero motion percentage twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); @@ -2102,13 +2063,13 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (rc->kf_boost < MIN_KF_BOOST) rc->kf_boost = MIN_KF_BOOST; - twopass->kf_bits = calculate_boost_bits((rc->frames_to_key - 1), - rc->kf_boost, twopass->kf_group_bits); + kf_bits = calculate_boost_bits((rc->frames_to_key - 1), + rc->kf_boost, twopass->kf_group_bits); - twopass->kf_group_bits -= twopass->kf_bits; + twopass->kf_group_bits -= kf_bits; - // Per frame bit target for this frame. - vp9_rc_set_frame_target(cpi, twopass->kf_bits); + // Save the bits to spend on the key frame. + twopass->gf_group_bit_allocation[0] = kf_bits; // Note the total error score of the kf group minus the key frame itself. 
twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err); @@ -2144,7 +2105,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { FIRSTPASS_STATS this_frame; FIRSTPASS_STATS this_frame_copy; - int target; + int target_rate; LAYER_CONTEXT *lc = NULL; const int is_spatial_svc = (cpi->use_svc && cpi->svc.number_temporal_layers == 1); @@ -2160,16 +2121,23 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { if (!twopass->stats_in) return; + // Increment the gf group index. + ++twopass->gf_group_index; + + // If this is an arf frame then we dont want to read the stats file or + // advance the input pointer as we already have what we need. if (cpi->refresh_alt_ref_frame) { - int modified_target = twopass->gf_bits; - rc->base_frame_target = twopass->gf_bits; - cm->frame_type = INTER_FRAME; + int target_rate; + target_rate = twopass->gf_group_bit_allocation[twopass->gf_group_index]; + target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); + rc->base_frame_target = target_rate; #ifdef LONG_TERM_VBR_CORRECTION // Correction to rate target based on prior over or under shoot. if (cpi->oxcf.rc_mode == RC_MODE_VBR) - vbr_rate_correction(&modified_target, rc->vbr_bits_off_target); + vbr_rate_correction(&target_rate, rc->vbr_bits_off_target); #endif - vp9_rc_set_frame_target(cpi, modified_target); + vp9_rc_set_frame_target(cpi, target_rate); + cm->frame_type = INTER_FRAME; return; } @@ -2197,11 +2165,13 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { if (EOF == input_stats(twopass, &this_frame)) return; + // Local copy of the current frame's first pass stats. + this_frame_copy = this_frame; + // Keyframe and section processing. if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) { // Define next KF group and assign bits to it. 
- this_frame_copy = this_frame; find_next_key_frame(cpi, &this_frame_copy); } else { cm->frame_type = INTER_FRAME; @@ -2220,11 +2190,8 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { } } - // Is this frame a GF / ARF? (Note: a key frame is always also a GF). + // Define a new GF/ARF group. (Should always enter here for key frames). if (rc->frames_till_gf_update_due == 0) { - // Define next gf group and assign bits to it. - this_frame_copy = this_frame; - #if CONFIG_MULTIPLE_ARF if (cpi->multi_arf_enabled) { define_fixed_arf_period(cpi); @@ -2247,11 +2214,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { rc->frames_till_gf_update_due = rc->baseline_gf_interval; cpi->refresh_golden_frame = 1; - } else { - // Otherwise this is an ordinary frame. - // Assign bits from those allocated to the GF group. - this_frame_copy = this_frame; - assign_std_frame_bits(cpi, &this_frame_copy); } { @@ -2262,18 +2224,19 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { } } + target_rate = twopass->gf_group_bit_allocation[twopass->gf_group_index]; if (cpi->common.frame_type == KEY_FRAME) - target = vp9_rc_clamp_iframe_target_size(cpi, rc->this_frame_target); + target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate); else - target = vp9_rc_clamp_pframe_target_size(cpi, rc->this_frame_target); + target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); - rc->base_frame_target = target; + rc->base_frame_target = target_rate; #ifdef LONG_TERM_VBR_CORRECTION // Correction to rate target based on prior over or under shoot. if (cpi->oxcf.rc_mode == RC_MODE_VBR) - vbr_rate_correction(&target, rc->vbr_bits_off_target); + vbr_rate_correction(&target_rate, rc->vbr_bits_off_target); #endif - vp9_rc_set_frame_target(cpi, target); + vp9_rc_set_frame_target(cpi, target_rate); // Update the total stats remaining structure. 
subtract_stats(&twopass->total_left_stats, &this_frame); @@ -2322,8 +2285,6 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) { } else { #endif twopass->kf_group_bits -= bits_used; - twopass->gf_group_bits -= bits_used; - twopass->gf_group_bits = MAX(twopass->gf_group_bits, 0); } twopass->kf_group_bits = MAX(twopass->kf_group_bits, 0); } diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index d84793e28..c89cfaf8d 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -11,6 +11,8 @@ #ifndef VP9_ENCODER_VP9_FIRSTPASS_H_ #define VP9_ENCODER_VP9_FIRSTPASS_H_ +#include "vp9/encoder/vp9_lookahead.h" + #ifdef __cplusplus extern "C" { #endif @@ -42,7 +44,6 @@ typedef struct { unsigned int section_intra_rating; unsigned int next_iiratio; FIRSTPASS_STATS total_stats; - FIRSTPASS_STATS this_frame_stats; const FIRSTPASS_STATS *stats_in; const FIRSTPASS_STATS *stats_in_start; const FIRSTPASS_STATS *stats_in_end; @@ -51,31 +52,24 @@ typedef struct { int64_t bits_left; double modified_error_min; double modified_error_max; - double modified_error_total; double modified_error_left; double kf_intra_err_min; double gf_intra_err_min; - int kf_bits; - // Remaining error from uncoded frames in a gf group. 
Two pass use only - int64_t gf_group_error_left; // Projected total bits available for a key frame group of frames int64_t kf_group_bits; // Error score of frames still to be coded in kf group int64_t kf_group_error_left; - - // Projected Bits available for a group of frames including 1 GF or ARF - int64_t gf_group_bits; - // Bits for the golden frame or ARF - 2 pass only - int gf_bits; - int sr_update_lag; int kf_zeromotion_pct; int gf_zeromotion_pct; int active_worst_quality; + + int gf_group_index; + int gf_group_bit_allocation[MAX_LAG_BUFFERS * 2]; } TWO_PASS; struct VP9_COMP; diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 041e583fd..842bc5b9d 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -72,8 +72,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, x->mv_row_max = tmp_row_max; return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride, - INT_MAX); + xd->plane[0].dst.buf, xd->plane[0].dst.stride); } static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv, @@ -86,8 +85,7 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv, // Try zero MV first // FIXME should really use something like near/nearest MV and/or MV prediction err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride, - INT_MAX); + xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); dst_mv->as_int = 0; // Test last reference frame using the previous best mv as the @@ -123,8 +121,7 @@ static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) { // Try zero MV first // FIXME should really use something like near/nearest MV and/or MV prediction err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride, - INT_MAX); + xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); dst_mv->as_int = 0; @@ -147,7 +144,7 @@ static int 
find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) { xd->plane[0].dst.buf, xd->plane[0].dst.stride, 0, 0, 0); err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride, best_err); + xd->plane[0].dst.buf, xd->plane[0].dst.stride); // find best if (err < best_err) { diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 4f7d6f17c..dbd19a2d6 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -524,9 +524,8 @@ static int vp9_pattern_search(const MACROBLOCK *x, // Work out the start point for the search bestsad = vfp->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, ref_mv), in_what->stride, - 0x7fffffff) + mvsad_err_cost(x, ref_mv, &fcenter_mv, - sad_per_bit); + get_buf_from_mv(in_what, ref_mv), in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); // Search all possible scales upto the search param around the center point // pick the scale of the point that is best as the starting scale of @@ -542,7 +541,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, bc + candidates[t][i].col}; thissad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), - in_what->stride, bestsad); + in_what->stride); CHECK_BETTER } } else { @@ -553,7 +552,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, continue; thissad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), - in_what->stride, bestsad); + in_what->stride); CHECK_BETTER } } @@ -585,7 +584,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, bc + candidates[s][i].col}; thissad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), - in_what->stride, bestsad); + in_what->stride); CHECK_BETTER } } else { @@ -596,7 +595,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, continue; thissad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), - in_what->stride, bestsad); + in_what->stride); CHECK_BETTER } } @@ -623,7 
+622,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, bc + candidates[s][next_chkpts_indices[i]].col}; thissad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), - in_what->stride, bestsad); + in_what->stride); CHECK_BETTER } } else { @@ -634,7 +633,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, continue; thissad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), - in_what->stride, bestsad); + in_what->stride); CHECK_BETTER } } @@ -661,7 +660,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, bc + neighbors[i].col}; thissad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), - in_what->stride, bestsad); + in_what->stride); CHECK_BETTER } } else { @@ -672,7 +671,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, continue; thissad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), - in_what->stride, bestsad); + in_what->stride); CHECK_BETTER } } @@ -894,8 +893,7 @@ int vp9_full_range_search_c(const MACROBLOCK *x, *best_mv = *ref_mv; *num00 = 11; best_sad = fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, ref_mv), in_what->stride, - 0x7fffffff) + + get_buf_from_mv(in_what, ref_mv), in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); start_row = MAX(-range, x->mv_row_min - ref_mv->row); start_col = MAX(-range, x->mv_col_min - ref_mv->col); @@ -929,7 +927,7 @@ int vp9_full_range_search_c(const MACROBLOCK *x, for (i = 0; i < end_col - c; ++i) { const MV mv = {ref_mv->row + r, ref_mv->col + c + i}; unsigned int sad = fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, &mv), in_what->stride, best_sad); + get_buf_from_mv(in_what, &mv), in_what->stride); if (sad < best_sad) { sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); if (sad < best_sad) { @@ -975,7 +973,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, // Check the starting position best_sad = fn_ptr->sdf(what->buf, what->stride, - best_address, 
in_what->stride, 0x7fffffff) + + best_address, in_what->stride) + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit); i = 1; @@ -986,8 +984,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, best_mv->col + ss[i].mv.col}; if (is_mv_in(x, &mv)) { int sad = fn_ptr->sdf(what->buf, what->stride, - best_address + ss[i].offset, in_what->stride, - best_sad); + best_address + ss[i].offset, in_what->stride); if (sad < best_sad) { sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); if (sad < best_sad) { @@ -1012,7 +1009,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, if (is_mv_in(x, &this_mv)) { int sad = fn_ptr->sdf(what->buf, what->stride, best_address + ss[best_site].offset, - in_what->stride, best_sad); + in_what->stride); if (sad < best_sad) { sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); if (sad < best_sad) { @@ -1077,7 +1074,7 @@ int vp9_diamond_search_sadx4(const MACROBLOCK *x, best_address = in_what; // Check the starting position - bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit); i = 1; @@ -1129,7 +1126,7 @@ int vp9_diamond_search_sadx4(const MACROBLOCK *x, if (is_mv_in(x, &this_mv)) { const uint8_t *const check_here = ss[i].offset + best_address; unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here, - in_what_stride, bestsad); + in_what_stride); if (thissad < bestsad) { thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); @@ -1154,7 +1151,7 @@ int vp9_diamond_search_sadx4(const MACROBLOCK *x, if (is_mv_in(x, &this_mv)) { const uint8_t *const check_here = ss[best_site].offset + best_address; unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here, - in_what_stride, bestsad); + in_what_stride); if (thissad < bestsad) { thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit); @@ -1253,7 +1250,7 @@ int 
vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, const int col_max = MIN(ref_mv->col + distance, x->mv_col_max); const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; int best_sad = fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) + + get_buf_from_mv(in_what, ref_mv), in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); *best_mv = *ref_mv; @@ -1261,7 +1258,7 @@ int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, for (c = col_min; c < col_max; ++c) { const MV mv = {r, c}; const int sad = fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, &mv), in_what->stride, best_sad) + + get_buf_from_mv(in_what, &mv), in_what->stride) + mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); if (sad < best_sad) { best_sad = sad; @@ -1286,7 +1283,7 @@ int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv, const int col_max = MIN(ref_mv->col + distance, x->mv_col_max); const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) + + get_buf_from_mv(in_what, ref_mv), in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); *best_mv = *ref_mv; @@ -1320,7 +1317,7 @@ int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv, while (c < col_max) { unsigned int sad = fn_ptr->sdf(what->buf, what->stride, - check_here, in_what->stride, best_sad); + check_here, in_what->stride); if (sad < best_sad) { const MV mv = {r, c}; sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); @@ -1351,7 +1348,7 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv, const int col_max = MIN(ref_mv->col + distance, x->mv_col_max); const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) + + 
get_buf_from_mv(in_what, ref_mv), in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); *best_mv = *ref_mv; @@ -1409,7 +1406,7 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv, while (c < col_max) { unsigned int sad = fn_ptr->sdf(what->buf, what->stride, - check_here, in_what->stride, best_sad); + check_here, in_what->stride); if (sad < best_sad) { const MV mv = {r, c}; sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); @@ -1438,7 +1435,7 @@ int vp9_refining_search_sad_c(const MACROBLOCK *x, const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride, 0x7fffffff) + + in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); int i, j; @@ -1450,7 +1447,7 @@ int vp9_refining_search_sad_c(const MACROBLOCK *x, ref_mv->col + neighbors[j].col}; if (is_mv_in(x, &mv)) { unsigned int sad = fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, &mv), in_what->stride, best_sad); + get_buf_from_mv(in_what, &mv), in_what->stride); if (sad < best_sad) { sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); if (sad < best_sad) { @@ -1483,7 +1480,7 @@ int vp9_refining_search_sadx4(const MACROBLOCK *x, const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv); unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride, best_address, - in_what->stride, 0x7fffffff) + + in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); int i, j; @@ -1524,7 +1521,7 @@ int vp9_refining_search_sadx4(const MACROBLOCK *x, if (is_mv_in(x, &mv)) { unsigned int sad = fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), - in_what->stride, best_sad); + in_what->stride); if (sad < best_sad) { sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); if (sad < best_sad) { @@ -1563,8 +1560,7 @@ int 
vp9_refining_search_8p_c(const MACROBLOCK *x, const struct buf_2d *const in_what = &xd->plane[0].pre[0]; const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; unsigned int best_sad = fn_ptr->sdaf(what->buf, what->stride, - get_buf_from_mv(in_what, ref_mv), in_what->stride, - second_pred, 0x7fffffff) + + get_buf_from_mv(in_what, ref_mv), in_what->stride, second_pred) + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); int i, j; @@ -1577,8 +1573,7 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, if (is_mv_in(x, &mv)) { unsigned int sad = fn_ptr->sdaf(what->buf, what->stride, - get_buf_from_mv(in_what, &mv), in_what->stride, - second_pred, best_sad); + get_buf_from_mv(in_what, &mv), in_what->stride, second_pred); if (sad < best_sad) { sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); if (sad < best_sad) { diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 437b68078..913b8ead4 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -280,13 +280,11 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { int rate_mv = 0; - if (cpi->sf.disable_inter_mode_mask[bsize] & - (1 << INTER_OFFSET(this_mode))) + if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode))) continue; - if (best_rd < ((int64_t)rd_threshes[mode_idx[this_mode]] * - rd_thresh_freq_fact[this_mode] >> 5) || - rd_threshes[mode_idx[this_mode]] == INT_MAX) + if (rd_less_than_thresh(best_rd, rd_threshes[mode_idx[this_mode]], + rd_thresh_freq_fact[this_mode])) continue; if (this_mode == NEWMV) { diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index a04622c8c..b58eac981 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -200,9 +200,8 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { oxcf->best_allowed_q) / 2; } - rc->last_q[0] = oxcf->best_allowed_q; - rc->last_q[1] = oxcf->best_allowed_q; - 
rc->last_q[2] = oxcf->best_allowed_q; + rc->last_q[KEY_FRAME] = oxcf->best_allowed_q; + rc->last_q[INTER_FRAME] = oxcf->best_allowed_q; rc->buffer_level = oxcf->starting_buffer_level; rc->bits_off_target = oxcf->starting_buffer_level; @@ -213,6 +212,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->long_rolling_actual_bits = rc->avg_frame_bandwidth; rc->total_actual_bits = 0; + rc->total_target_bits = 0; rc->total_target_vs_actual = 0; rc->baseline_gf_interval = DEFAULT_GF_INTERVAL; @@ -607,13 +607,27 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, return q; } +static int get_active_cq_level(const RATE_CONTROL *rc, + const VP9EncoderConfig *const oxcf) { + static const double cq_adjust_threshold = 0.5; + int active_cq_level = oxcf->cq_level; + if (oxcf->rc_mode == RC_MODE_CONSTRAINED_QUALITY && + rc->total_target_bits > 0) { + const double x = (double)rc->total_actual_bits / rc->total_target_bits; + if (x < cq_adjust_threshold) { + active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold); + } + } + return active_cq_level; +} + static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, int *bottom_index, int *top_index) { const VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; - const int cq_level = oxcf->cq_level; + const int cq_level = get_active_cq_level(rc, oxcf); int active_best_quality; int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi); int q; @@ -791,7 +805,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, const VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; - const int cq_level = oxcf->cq_level; + const int cq_level = get_active_cq_level(rc, oxcf); int active_best_quality; int active_worst_quality = cpi->twopass.active_worst_quality; int q; @@ -1075,7 +1089,6 @@ void 
vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } else if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) && !(cpi->use_svc && oxcf->rc_mode == RC_MODE_CBR)) { - rc->last_q[2] = qindex; rc->avg_frame_qindex[2] = ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[2] + qindex, 2); } else { @@ -1375,6 +1388,24 @@ int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, return target_index - qindex; } +void vp9_rc_set_gf_max_interval(const VP9EncoderConfig *const oxcf, + RATE_CONTROL *const rc) { + // Set Maximum gf/arf interval + rc->max_gf_interval = 16; + + // Extended interval for genuinely static scenes + rc->static_scene_max_gf_interval = oxcf->key_freq >> 1; + + // Special conditions when alt ref frame enabled + if (oxcf->play_alternate && oxcf->lag_in_frames) { + if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) + rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; + } + + if (rc->max_gf_interval > rc->static_scene_max_gf_interval) + rc->max_gf_interval = rc->static_scene_max_gf_interval; +} + void vp9_rc_update_framerate(VP9_COMP *cpi) { const VP9_COMMON *const cm = &cpi->common; const VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -1399,21 +1430,5 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) { rc->max_frame_bandwidth = MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits); - // Set Maximum gf/arf interval - rc->max_gf_interval = 16; - - // Extended interval for genuinely static scenes - rc->static_scene_max_gf_interval = cpi->oxcf.key_freq >> 1; - - // Special conditions when alt ref frame enabled in lagged compress mode - if (oxcf->play_alternate && oxcf->lag_in_frames) { - if (rc->max_gf_interval > oxcf->lag_in_frames - 1) - rc->max_gf_interval = oxcf->lag_in_frames - 1; - - if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) - rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; - } - - if (rc->max_gf_interval > 
rc->static_scene_max_gf_interval) - rc->max_gf_interval = rc->static_scene_max_gf_interval; + vp9_rc_set_gf_max_interval(oxcf, rc); } diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index b1cc67609..d6a0151b6 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -30,7 +30,7 @@ typedef struct { int this_frame_target; // Actual frame target after rc adjustment. int projected_frame_size; int sb64_target_rate; - int last_q[3]; // Separate values for Intra/Inter/ARF-GF + int last_q[FRAME_TYPES]; // Separate values for Intra/Inter int last_boosted_qindex; // Last boosted GF/KF/ARF q int gfu_boost; @@ -178,6 +178,9 @@ int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, void vp9_rc_update_framerate(struct VP9_COMP *cpi); +void vp9_rc_set_gf_max_interval(const struct VP9EncoderConfig *const oxcf, + RATE_CONTROL *const rc); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 601e64d39..f68aa2738 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -9,7 +9,6 @@ */ #include <assert.h> -#include <limits.h> #include <math.h> #include <stdio.h> @@ -22,7 +21,6 @@ #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_mvref_common.h" -#include "vp9/common/vp9_pragmas.h" #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_reconinter.h" @@ -1677,9 +1675,9 @@ static INLINE int mv_has_subpel(const MV *mv) { static int check_best_zero_mv( const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES], int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], - int disable_inter_mode_mask, int this_mode, + int inter_mode_mask, int this_mode, const MV_REFERENCE_FRAME ref_frames[2]) { - if (!(disable_inter_mode_mask & (1 << INTER_OFFSET(ZEROMV))) && + if ((inter_mode_mask & (1 << ZEROMV)) && (this_mode == NEARMV || this_mode == NEARESTMV || this_mode == 
ZEROMV) && frame_mv[this_mode][ref_frames[0]].as_int == 0 && (ref_frames[1] == NONE || @@ -1745,7 +1743,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, ENTROPY_CONTEXT t_above[2], t_left[2]; int subpelmv = 1, have_ref = 0; const int has_second_rf = has_second_ref(mbmi); - const int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize]; + const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize]; vp9_zero(*bsi); @@ -1794,11 +1792,11 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, mode_idx = INTER_OFFSET(this_mode); bsi->rdstat[i][mode_idx].brdcost = INT64_MAX; - if (disable_inter_mode_mask & (1 << mode_idx)) + if (!(inter_mode_mask & (1 << this_mode))) continue; if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv, - disable_inter_mode_mask, + inter_mode_mask, this_mode, mbmi->ref_frame)) continue; @@ -2131,8 +2129,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, // Find sad for current vector. this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride, - ref_y_ptr, ref_y_stride, - 0x7fffffff); + ref_y_ptr, ref_y_stride); // Note if it is the best so far. if (this_sad < best_sad) { @@ -2210,8 +2207,6 @@ static void estimate_ref_frame_costs(const VP9_COMMON *cm, static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index, - int_mv *ref_mv, - int_mv *second_ref_mv, int64_t comp_pred_diff[REFERENCE_MODES], const int64_t tx_size_diff[TX_MODES], int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) { @@ -2991,11 +2986,6 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, ctx->mic = *xd->mi[0]; } -static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, - int thresh_fact) { - return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX; -} - // Updating rd_thresh_freq_fact[] here means that the different // partition/block sizes are handled independently based on the best // choice for the current partition. 
It may well be better to keep a scaled @@ -3073,7 +3063,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags; const int intra_y_mode_mask = cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]]; - int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize]; + int inter_mode_mask = cpi->sf.inter_mode_mask[bsize]; vp9_zero(best_mbmode); x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; @@ -3139,6 +3129,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { const int inter_non_zero_mode_mask = 0x1F7F7; mode_skip_mask |= inter_non_zero_mode_mask; + mode_skip_mask &= ~(1 << THR_ZEROMV); + inter_mode_mask = (1 << ZEROMV); } // Disable this drop out case if the ref frame @@ -3172,7 +3164,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } if (bsize > cpi->sf.max_intra_bsize) { - mode_skip_mask |= 0xFF30808; + const int all_intra_modes = (1 << THR_DC) | (1 << THR_TM) | + (1 << THR_H_PRED) | (1 << THR_V_PRED) | (1 << THR_D135_PRED) | + (1 << THR_D207_PRED) | (1 << THR_D153_PRED) | (1 << THR_D63_PRED) | + (1 << THR_D117_PRED) | (1 << THR_D45_PRED); + mode_skip_mask |= all_intra_modes; } if (!x->in_active_map) { @@ -3186,7 +3182,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, mode_index = THR_ZEROMV; mode_skip_mask = ~(1 << mode_index); mode_skip_start = MAX_MODES; - disable_inter_mode_mask = 0; + inter_mode_mask = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | + (1 << NEWMV); } for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { @@ -3228,13 +3225,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Test best rd so far against threshold for trying this mode. 
if (rd_less_than_thresh(best_rd, rd_threshes[mode_index], - rd_thresh_freq_fact[mode_index])) + rd_thresh_freq_fact[mode_index])) continue; this_mode = vp9_mode_order[mode_index].mode; ref_frame = vp9_mode_order[mode_index].ref_frame[0]; - if (ref_frame != INTRA_FRAME && - disable_inter_mode_mask & (1 << INTER_OFFSET(this_mode))) + if (ref_frame != INTRA_FRAME && !(inter_mode_mask & (1 << this_mode))) continue; second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; @@ -3283,7 +3279,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame}; if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv, - disable_inter_mode_mask, this_mode, ref_frames)) + inter_mode_mask, this_mode, ref_frames)) continue; } } @@ -3621,9 +3617,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); store_coding_context(x, ctx, best_mode_index, - &mbmi->ref_mvs[mbmi->ref_frame[0]][0], - &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 
0 : - mbmi->ref_frame[1]][0], best_pred_diff, best_tx_diff, best_filter_diff); return best_rd; @@ -3672,7 +3665,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int_mv seg_mvs[4][MAX_REF_FRAMES]; b_mode_info best_bmodes[4]; int best_skip2 = 0; - int ref_frame_mask = 0; int mode_skip_mask = 0; x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; @@ -3707,17 +3699,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[ZEROMV][ref_frame].as_int = 0; } - for (ref_frame = LAST_FRAME; - ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) { - int i; - for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { - if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) { - ref_frame_mask |= (1 << ref_frame); - break; - } - } - } - for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) { int mode_excluded = 0; int64_t this_rd = INT64_MAX; @@ -3812,11 +3793,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { continue; - // If the segment skip feature is enabled.... - // then do nothing if the current mode is not allowed.. - } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) && - ref_frame != INTRA_FRAME) { - continue; // Disable this drop out case if the ref frame // segment level feature is enabled for this segment. This is to // prevent the possibility that we end up unable to pick any mode. @@ -4041,15 +4017,10 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, } if (!disable_skip) { - // Test for the condition where skip block will be activated - // because there are no non zero coefficients and make any - // necessary adjustment for rate. Ignore if skip is coded at - // segment level as the cost wont have been added in. - // Is Mb level skip allowed (i.e. not coded at segment level). 
- const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id, - SEG_LVL_SKIP); + // Skip is never coded at the segment level for sub8x8 blocks and instead + // always coded in the bitstream at the mode info level. - if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) { + if (ref_frame != INTRA_FRAME && !xd->lossless) { if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < RDCOST(x->rdmult, x->rddiv, 0, total_sse)) { // Add in the cost of the no skip flag. @@ -4064,7 +4035,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, rate_uv = 0; this_skip2 = 1; } - } else if (mb_skip_allowed) { + } else { // Add in the cost of the no skip flag. rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0); } @@ -4254,9 +4225,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); store_coding_context(x, ctx, best_ref_index, - &mbmi->ref_mvs[mbmi->ref_frame[0]][0], - &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 
0 : - mbmi->ref_frame[1]][0], best_pred_diff, best_tx_diff, best_filter_diff); return best_rd; diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index b6b51e553..5ea09a8a7 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -11,6 +11,8 @@ #ifndef VP9_ENCODER_VP9_RDOPT_H_ #define VP9_ENCODER_VP9_RDOPT_H_ +#include <limits.h> + #include "vp9/encoder/vp9_encoder.h" #ifdef __cplusplus @@ -87,6 +89,11 @@ void vp9_set_rd_speed_thresholds(VP9_COMP *cpi); void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi); +static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, + int thresh_fact) { + return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX; +} + static INLINE int full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *mvp_full, int step_param, int error_per_bit, diff --git a/vp9/encoder/vp9_sad.c b/vp9/encoder/vp9_sad.c index 892e90551..d06263676 100644 --- a/vp9/encoder/vp9_sad.c +++ b/vp9/encoder/vp9_sad.c @@ -35,14 +35,12 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, #define sadMxN(m, n) \ unsigned int vp9_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - unsigned int max_sad) { \ + const uint8_t *ref, int ref_stride) { \ return sad(src, src_stride, ref, ref_stride, m, n); \ } \ unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, \ - unsigned int max_sad) { \ + const uint8_t *second_pred) { \ uint8_t comp_pred[m * n]; \ vp9_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \ return sad(src, src_stride, comp_pred, m, m, n); \ @@ -54,8 +52,7 @@ void vp9_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ unsigned int *sads) { \ int i; \ for (i = 0; i < k; ++i) \ - sads[i] = vp9_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride, \ - 0x7fffffff); \ + sads[i] = vp9_sad##m##x##n##_c(src, src_stride, &ref[i], 
ref_stride); \ } #define sadMxNx4D(m, n) \ @@ -64,8 +61,7 @@ void vp9_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ unsigned int *sads) { \ int i; \ for (i = 0; i < 4; ++i) \ - sads[i] = vp9_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride, \ - 0x7fffffff); \ + sads[i] = vp9_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \ } // 64x64 diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 7b2d1e2f0..15b986197 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -14,20 +14,23 @@ #include "vp9/encoder/vp9_speed_features.h" enum { - ALL_INTRA_MODES = (1 << DC_PRED) | + INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) | (1 << D135_PRED) | (1 << D117_PRED) | (1 << D153_PRED) | (1 << D207_PRED) | (1 << D63_PRED) | (1 << TM_PRED), - - INTRA_DC_ONLY = (1 << DC_PRED), - - INTRA_DC_TM = (1 << TM_PRED) | (1 << DC_PRED), - + INTRA_DC = (1 << DC_PRED), + INTRA_DC_TM = (1 << DC_PRED) | (1 << TM_PRED), INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED), + INTRA_DC_TM_H_V = (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) | + (1 << H_PRED) +}; - INTRA_DC_TM_H_V = INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED) +enum { + INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV), + INTER_NEAREST = (1 << NEARESTMV), + INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV) }; enum { @@ -80,12 +83,16 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, } if (speed >= 2) { - if (MIN(cm->width, cm->height) >= 720) + if (MIN(cm->width, cm->height) >= 720) { + sf->lf_motion_threshold = LOW_MOITION_THRESHOLD; + sf->last_partitioning_redo_frequency = 3; sf->disable_split_mask = cm->show_frame ? 
DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; - else + } else { sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; - + sf->last_partitioning_redo_frequency = 2; + sf->lf_motion_threshold = NO_MOITION_THRESHOLD; + } sf->adaptive_pred_interp_filter = 2; sf->reference_masking = 1; sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | @@ -97,7 +104,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION; sf->adjust_partitioning_from_last_frame = 1; - sf->last_partitioning_redo_frequency = 3; } if (speed >= 3) { @@ -108,6 +114,8 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, else sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT; + sf->lf_motion_threshold = LOW_MOITION_THRESHOLD; + sf->last_partitioning_redo_frequency = 3; sf->recode_loop = ALLOW_RECODE_KFMAXBW; sf->adaptive_rd_thresh = 3; sf->mode_skip_start = 6; @@ -135,20 +143,23 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->search_method = HEX; sf->disable_filter_search_var_thresh = 500; for (i = 0; i < TX_SIZES; ++i) { - sf->intra_y_mode_mask[i] = INTRA_DC_ONLY; - sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY; + sf->intra_y_mode_mask[i] = INTRA_DC; + sf->intra_uv_mode_mask[i] = INTRA_DC; } cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; } } -static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf, +static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed) { + VP9_COMMON *const cm = &cpi->common; + const int frames_since_key = + cm->frame_type == KEY_FRAME ? 0 : cpi->rc.frames_since_key; sf->static_segmentation = 0; sf->adaptive_rd_thresh = 1; sf->use_fast_coef_costing = 1; - if (speed == 1) { + if (speed >= 1) { sf->use_square_partition_only = !frame_is_intra_only(cm); sf->less_rectangular_check = 1; sf->tx_size_search_method = frame_is_intra_only(cm) ? 
USE_FULL_RD @@ -171,13 +182,9 @@ static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf, } if (speed >= 2) { - sf->use_square_partition_only = !frame_is_intra_only(cm); - sf->less_rectangular_check = 1; - sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD - : USE_LARGESTALL; if (MIN(cm->width, cm->height) >= 720) - sf->disable_split_mask = cm->show_frame ? - DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; + sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT + : DISABLE_ALL_INTER_SPLIT; else sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; @@ -185,27 +192,18 @@ static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf, FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; - sf->use_rd_breakout = 1; - sf->adaptive_motion_search = 1; sf->adaptive_pred_interp_filter = 2; - sf->auto_mv_step_size = 1; sf->reference_masking = 1; - sf->disable_filter_search_var_thresh = 50; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; - sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION; + sf->lf_motion_threshold = LOW_MOITION_THRESHOLD; sf->adjust_partitioning_from_last_frame = 1; sf->last_partitioning_redo_frequency = 3; - - sf->adaptive_rd_thresh = 2; sf->use_lp32x32fdct = 1; sf->mode_skip_start = 11; - sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; - sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; - sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; } if (speed >= 3) { @@ -233,19 +231,19 @@ static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf, sf->auto_min_max_partition_size = STRICT_NEIGHBORING_MIN_MAX; sf->adjust_partitioning_from_last_frame = cm->last_frame_type != cm->frame_type || (0 == - (cm->current_video_frame + 1) % sf->last_partitioning_redo_frequency); + (frames_since_key + 1) % sf->last_partitioning_redo_frequency); sf->subpel_force_stop = 1; for (i = 0; i < 
TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_DC_H_V; - sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY; + sf->intra_uv_mode_mask[i] = INTRA_DC; } - sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_ONLY; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->frame_parameter_update = 0; sf->search_method = FAST_HEX; - sf->disable_inter_mode_mask[BLOCK_32X32] = 1 << INTER_OFFSET(ZEROMV); - sf->disable_inter_mode_mask[BLOCK_32X64] = ~(1 << INTER_OFFSET(NEARESTMV)); - sf->disable_inter_mode_mask[BLOCK_64X32] = ~(1 << INTER_OFFSET(NEARESTMV)); - sf->disable_inter_mode_mask[BLOCK_64X64] = ~(1 << INTER_OFFSET(NEARESTMV)); + sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW; + sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST; + sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST; + sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST; sf->max_intra_bsize = BLOCK_32X32; sf->allow_skip_recode = 1; } @@ -254,9 +252,9 @@ static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf, sf->max_partition_size = BLOCK_32X32; sf->min_partition_size = BLOCK_8X8; sf->partition_check = - (cm->current_video_frame % sf->last_partitioning_redo_frequency == 1); + (frames_since_key % sf->last_partitioning_redo_frequency == 1); sf->force_frame_boost = cm->frame_type == KEY_FRAME || - (cm->current_video_frame % + (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1); sf->max_delta_qindex = (cm->frame_type == KEY_FRAME) ? 
20 : 15; sf->partition_search_type = REFERENCE_PARTITION; @@ -276,7 +274,7 @@ static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf, if (speed >= 7) { int i; for (i = 0; i < BLOCK_SIZES; ++i) - sf->disable_inter_mode_mask[i] = ~(1 << INTER_OFFSET(NEARESTMV)); + sf->inter_mode_mask[i] = INTER_NEAREST; } } @@ -321,8 +319,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->disable_split_var_thresh = 0; sf->disable_filter_search_var_thresh = 0; for (i = 0; i < TX_SIZES; i++) { - sf->intra_y_mode_mask[i] = ALL_INTRA_MODES; - sf->intra_uv_mode_mask[i] = ALL_INTRA_MODES; + sf->intra_y_mode_mask[i] = INTRA_ALL; + sf->intra_uv_mode_mask[i] = INTRA_ALL; } sf->use_rd_breakout = 0; sf->skip_encode_sb = 0; @@ -334,7 +332,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set sf->use_nonrd_pick_mode = 0; for (i = 0; i < BLOCK_SIZES; ++i) - sf->disable_inter_mode_mask[i] = 0; + sf->inter_mode_mask[i] = INTER_ALL; sf->max_intra_bsize = BLOCK_64X64; // This setting only takes effect when partition_search_type is set // to FIXED_PARTITION. @@ -356,7 +354,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { set_good_speed_feature(cpi, cm, sf, oxcf->speed); break; case REALTIME: - set_rt_speed_feature(cm, sf, oxcf->speed); + set_rt_speed_feature(cpi, sf, oxcf->speed); break; } diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index d8c1a8be2..3e7cd27d8 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -44,6 +44,11 @@ typedef enum { } SUBPEL_SEARCH_METHODS; typedef enum { + NO_MOITION_THRESHOLD = 0, + LOW_MOITION_THRESHOLD = 7 +} MOTION_THRESHOLD; + +typedef enum { LAST_FRAME_PARTITION_OFF = 0, LAST_FRAME_PARTITION_LOW_MOTION = 1, LAST_FRAME_PARTITION_ALL = 2 @@ -200,6 +205,10 @@ typedef struct SPEED_FEATURES { // partitioning. 
LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning; + // The threshold is to determine how slow the motion is, it is used when + // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION + MOTION_THRESHOLD lf_motion_threshold; + // Determine which method we use to determine transform size. We can choose // between options like full rd, largest for prediction size, largest // for intra and model coefs for the rest. @@ -322,8 +331,8 @@ typedef struct SPEED_FEATURES { int use_nonrd_pick_mode; // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV - // modes are disabled in order from LSB to MSB for each BLOCK_SIZE. - int disable_inter_mode_mask[BLOCK_SIZES]; + // modes are used in order from LSB to MSB for each BLOCK_SIZE. + int inter_mode_mask[BLOCK_SIZES]; // This feature controls whether we do the expensive context update and // calculation in the rd coefficient costing loop. diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index dd28496be..c25314b42 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -46,13 +46,12 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { lrc->key_frame_rate_correction_factor = 1.0; if (svc->number_temporal_layers > 1) { - lc->target_bandwidth = oxcf->ts_target_bitrate[layer] * 1000; + lc->target_bandwidth = oxcf->ts_target_bitrate[layer]; lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q; } else { - lc->target_bandwidth = oxcf->ss_target_bitrate[layer] * 1000; - lrc->last_q[0] = oxcf->best_allowed_q; - lrc->last_q[1] = oxcf->best_allowed_q; - lrc->last_q[2] = oxcf->best_allowed_q; + lc->target_bandwidth = oxcf->ss_target_bitrate[layer]; + lrc->last_q[KEY_FRAME] = oxcf->best_allowed_q; + lrc->last_q[INTER_FRAME] = oxcf->best_allowed_q; } lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level), @@ -82,9 +81,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, RATE_CONTROL *const lrc = &lc->rc; if
(svc->number_temporal_layers > 1) { - lc->target_bandwidth = oxcf->ts_target_bitrate[layer] * 1000; + lc->target_bandwidth = oxcf->ts_target_bitrate[layer]; } else { - lc->target_bandwidth = oxcf->ss_target_bitrate[layer] * 1000; + lc->target_bandwidth = oxcf->ss_target_bitrate[layer]; } bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; // Update buffer-related quantities. @@ -132,8 +131,7 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { } else { const double prev_layer_framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer - 1]; - const int prev_layer_target_bandwidth = - oxcf->ts_target_bitrate[layer - 1] * 1000; + const int prev_layer_target_bandwidth = oxcf->ts_target_bitrate[layer - 1]; lc->avg_frame_size = (int)((lc->target_bandwidth - prev_layer_target_bandwidth) / (lc->framerate - prev_layer_framerate)); @@ -151,20 +149,7 @@ void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) { oxcf->two_pass_vbrmin_section / 100); lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) / 100); - lrc->max_gf_interval = 16; - - lrc->static_scene_max_gf_interval = cpi->oxcf.key_freq >> 1; - - if (oxcf->play_alternate && oxcf->lag_in_frames) { - if (lrc->max_gf_interval > oxcf->lag_in_frames - 1) - lrc->max_gf_interval = oxcf->lag_in_frames - 1; - - if (lrc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) - lrc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; - } - - if (lrc->max_gf_interval > lrc->static_scene_max_gf_interval) - lrc->max_gf_interval = lrc->static_scene_max_gf_interval; + vp9_rc_set_gf_max_interval(oxcf, lrc); } void vp9_restore_layer_context(VP9_COMP *const cpi) { diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c index 91d8ea4dc..eb5ae2e41 100644 --- a/vp9/encoder/vp9_variance.c +++ b/vp9/encoder/vp9_variance.c @@ -156,6 +156,18 @@ unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \ return 
vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \ } +void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum); +} + +void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum); +} + unsigned int vp9_mse16x16_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h index c47fe1335..4a194b72c 100644 --- a/vp9/encoder/vp9_variance.h +++ b/vp9/encoder/vp9_variance.h @@ -25,15 +25,13 @@ void variance(const uint8_t *a, int a_stride, typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad); + int ref_stride); typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - unsigned int max_sad); + const uint8_t *second_pred); typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr, int source_stride, diff --git a/vp9/encoder/x86/vp9_dct_ssse3.asm b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm index 8723a7114..8723a7114 100644 --- a/vp9/encoder/x86/vp9_dct_ssse3.asm +++ b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index 48ccef8cc..48ccef8cc 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm diff --git a/vp9/encoder/x86/vp9_ssim_opt.asm b/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm index 455d10d2c..455d10d2c 100644 --- a/vp9/encoder/x86/vp9_ssim_opt.asm +++ b/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm diff --git 
a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm index 673e0b3a6..21aaa9383 100644 --- a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm +++ b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm @@ -43,9 +43,9 @@ sym(vp9_temporal_filter_apply_sse2): mov [rsp + rbp_backup], rbp ; end prolog - mov rdx, arg(3) + mov edx, arg(3) mov [rsp + block_width], rdx - mov rdx, arg(4) + mov edx, arg(4) mov [rsp + block_height], rdx movd xmm6, arg(5) movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c index 835c51957..7f81f46b8 100644 --- a/vp9/encoder/x86/vp9_variance_avx2.c +++ b/vp9/encoder/x86/vp9_variance_avx2.c @@ -10,7 +10,6 @@ #include "./vpx_config.h" #include "vp9/encoder/vp9_variance.h" -#include "vp9/common/vp9_pragmas.h" #include "vpx_ports/mem.h" typedef void (*get_var_avx2) ( diff --git a/vp9/encoder/x86/vp9_variance_mmx.c b/vp9/encoder/x86/vp9_variance_mmx.c index c4d17fc0f..ae2f976af 100644 --- a/vp9/encoder/x86/vp9_variance_mmx.c +++ b/vp9/encoder/x86/vp9_variance_mmx.c @@ -10,7 +10,6 @@ #include "./vpx_config.h" #include "vp9/encoder/vp9_variance.h" -#include "vp9/common/vp9_pragmas.h" #include "vpx_ports/mem.h" extern unsigned int vp9_get8x8var_mmx diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c index 41f225922..e935a233a 100644 --- a/vp9/encoder/x86/vp9_variance_sse2.c +++ b/vp9/encoder/x86/vp9_variance_sse2.c @@ -11,53 +11,29 @@ #include "./vpx_config.h" #include "vp9/encoder/vp9_variance.h" -#include "vp9/common/vp9_pragmas.h" #include "vpx_ports/mem.h" -extern unsigned int vp9_get4x4var_mmx -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); - -unsigned int vp9_get16x16var_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char 
*ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); -unsigned int vp9_get8x8var_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); - -typedef unsigned int (*get_var_sse2) ( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); - -static void variance_sse2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - int w, int h, unsigned int *sse, int *sum, - get_var_sse2 var_fn, int block_size) { - unsigned int sse0; - int sum0; +typedef unsigned int (*variance_fn_t) (const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse, int *sum); + +unsigned int vp9_get4x4var_mmx(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse, int *sum); + + +unsigned int vp9_get8x8var_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse, int *sum); + +unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse, int *sum); + +static void variance_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + variance_fn_t var_fn, int block_size) { int i, j; *sse = 0; @@ -65,217 +41,139 @@ static void variance_sse2(const unsigned char *src_ptr, int source_stride, for (i = 0; i < h; i += block_size) { for (j = 0; j < w; j += block_size) { - var_fn(src_ptr + source_stride * i + j, source_stride, - ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0); + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); *sse += sse0; *sum += sum0; } } } -unsigned int 
vp9_variance4x4_sse2( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, - &var, &avg, vp9_get4x4var_mmx, 4); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); +unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 4, 4, + sse, &sum, vp9_get4x4var_mmx, 4); + return *sse - (((unsigned int)sum * sum) >> 4); } -unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4, - &var, &avg, vp9_get4x4var_mmx, 4); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 5)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 4, + sse, &sum, vp9_get4x4var_mmx, 4); + return *sse - (((unsigned int)sum * sum) >> 5); } -unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8, - &var, &avg, vp9_get4x4var_mmx, 4); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 5)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 4, 8, + sse, &sum, vp9_get4x4var_mmx, 4); + return *sse - (((unsigned int)sum * sum) >> 5); } -unsigned int vp9_variance8x8_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char 
*ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, - &var, &avg, vp9_get8x8var_sse2, 8); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 6)); +unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vp9_get8x8var_sse2, 8); + return *sse - (((unsigned int)sum * sum) >> 6); } -unsigned int vp9_variance16x8_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, - &var, &avg, vp9_get8x8var_sse2, 8); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); +unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 8, + sse, &sum, vp9_get8x8var_sse2, 8); + return *sse - (((unsigned int)sum * sum) >> 7); } -unsigned int vp9_variance8x16_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, - &var, &avg, vp9_get8x8var_sse2, 8); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); +unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 16, + sse, &sum, vp9_get8x8var_sse2, 8); + return *sse - (((unsigned int)sum * sum) >> 7); } -unsigned int vp9_variance16x16_sse2 -( - const unsigned char *src_ptr, - int 
source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 8)); +unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((unsigned int)sum * sum) >> 8); } -unsigned int vp9_mse16x16_sse2( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int sse0; - int sum0; - vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, - &sum0); - *sse = sse0; - return sse0; +unsigned int vp9_mse16x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse; } -unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((int64_t)avg * avg) >> 10)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 32, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 10); } -unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride, + const uint8_t 
*ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((int64_t)avg * avg) >> 9)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 16, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 9); } -unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((int64_t)avg * avg) >> 9)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 32, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 9); } -unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((int64_t)avg * avg) >> 12)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 64, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 12); } -unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, 
recon_stride, 64, 32, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((int64_t)avg * avg) >> 11)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 32, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 11); } -unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((int64_t)avg * avg) >> 11)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 64, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 11); } #define DECL(w, opt) \ diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 3b4d6b901..8c1f34567 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -10,7 +10,6 @@ VP9_COMMON_SRCS-yes += vp9_common.mk VP9_COMMON_SRCS-yes += vp9_iface_common.h -VP9_COMMON_SRCS-yes += common/vp9_pragmas.h VP9_COMMON_SRCS-yes += common/vp9_ppflags.h VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c VP9_COMMON_SRCS-yes += common/vp9_blockd.c @@ -119,9 +118,10 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_d VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c - +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h +VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_intrin_ssse3.c ifeq ($(ARCH_X86_64), yes) -VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3.asm +VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm endif VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve_neon.c diff --git a/vp9/vp9_cx_iface.c 
b/vp9/vp9_cx_iface.c index 5a8a4f4fe..d52424ad6 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -327,7 +327,8 @@ static vpx_codec_err_t set_encoder_config( else if (cfg->rc_end_usage == VPX_CBR) oxcf->rc_mode = RC_MODE_CBR; - oxcf->target_bandwidth = cfg->rc_target_bitrate; + // Convert target bandwidth from Kbit/s to Bit/s + oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate; oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct; oxcf->best_allowed_q = vp9_quantizer_to_qindex(cfg->rc_min_quantizer); @@ -387,7 +388,9 @@ static vpx_codec_err_t set_encoder_config( oxcf->ss_number_layers = cfg->ss_number_layers; if (oxcf->ss_number_layers > 1) { - vp9_copy(oxcf->ss_target_bitrate, cfg->ss_target_bitrate); + int i; + for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) + oxcf->ss_target_bitrate[i] = 1000 * cfg->ss_target_bitrate[i]; } else if (oxcf->ss_number_layers == 1) { oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth; } @@ -395,8 +398,11 @@ static vpx_codec_err_t set_encoder_config( oxcf->ts_number_layers = cfg->ts_number_layers; if (oxcf->ts_number_layers > 1) { - vp9_copy(oxcf->ts_target_bitrate, cfg->ts_target_bitrate); - vp9_copy(oxcf->ts_rate_decimator, cfg->ts_rate_decimator); + int i; + for (i = 0; i < VPX_TS_MAX_LAYERS; ++i) { + oxcf->ts_target_bitrate[i] = 1000 * cfg->ts_target_bitrate[i]; + oxcf->ts_rate_decimator[i] = cfg->ts_rate_decimator[i]; + } } else if (oxcf->ts_number_layers == 1) { oxcf->ts_target_bitrate[0] = (int)oxcf->target_bandwidth; oxcf->ts_rate_decimator[0] = 1; @@ -462,64 +468,154 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, return res; } -static vpx_codec_err_t ctrl_get_param(vpx_codec_alg_priv_t *ctx, int ctrl_id, - va_list args) { - void *const arg = va_arg(args, void *); - -#define MAP(id, var) case id: *(RECAST(id, arg)) = var; break +static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) + 
return VPX_CODEC_INVALID_PARAM; + *arg = vp9_get_quantizer(ctx->cpi); + return VPX_CODEC_OK; +} +static vpx_codec_err_t ctrl_get_quantizer64(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); if (arg == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = vp9_qindex_to_quantizer(vp9_get_quantizer(ctx->cpi)); + return VPX_CODEC_OK; +} - switch (ctrl_id) { - MAP(VP8E_GET_LAST_QUANTIZER, vp9_get_quantizer(ctx->cpi)); - MAP(VP8E_GET_LAST_QUANTIZER_64, - vp9_qindex_to_quantizer(vp9_get_quantizer(ctx->cpi))); +static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx, + const struct vp9_extracfg *extra_cfg) { + const vpx_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg); + if (res == VPX_CODEC_OK) { + ctx->extra_cfg = *extra_cfg; + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + vp9_change_config(ctx->cpi, &ctx->oxcf); } + return res; +} - return VPX_CODEC_OK; -#undef MAP +static vpx_codec_err_t ctrl_set_cpuused(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.cpu_used = CAST(VP8E_SET_CPUUSED, args); + return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_enable_auto_alt_ref(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_auto_alt_ref = CAST(VP8E_SET_ENABLEAUTOALTREF, args); + return update_extra_cfg(ctx, &extra_cfg); +} -static vpx_codec_err_t ctrl_set_param(vpx_codec_alg_priv_t *ctx, int ctrl_id, - va_list args) { - vpx_codec_err_t res = VPX_CODEC_OK; +static vpx_codec_err_t ctrl_set_noise_sensitivity(vpx_codec_alg_priv_t *ctx, + va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.noise_sensitivity = CAST(VP8E_SET_NOISE_SENSITIVITY, args); + return update_extra_cfg(ctx, &extra_cfg); +} -#define MAP(id, var) case id: var = CAST(id, args); break; - - switch (ctrl_id) { - MAP(VP8E_SET_CPUUSED, extra_cfg.cpu_used); - 
MAP(VP8E_SET_ENABLEAUTOALTREF, extra_cfg.enable_auto_alt_ref); - MAP(VP8E_SET_NOISE_SENSITIVITY, extra_cfg.noise_sensitivity); - MAP(VP8E_SET_SHARPNESS, extra_cfg.sharpness); - MAP(VP8E_SET_STATIC_THRESHOLD, extra_cfg.static_thresh); - MAP(VP9E_SET_TILE_COLUMNS, extra_cfg.tile_columns); - MAP(VP9E_SET_TILE_ROWS, extra_cfg.tile_rows); - MAP(VP8E_SET_ARNR_MAXFRAMES, extra_cfg.arnr_max_frames); - MAP(VP8E_SET_ARNR_STRENGTH, extra_cfg.arnr_strength); - MAP(VP8E_SET_ARNR_TYPE, extra_cfg.arnr_type); - MAP(VP8E_SET_TUNING, extra_cfg.tuning); - MAP(VP8E_SET_CQ_LEVEL, extra_cfg.cq_level); - MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, extra_cfg.rc_max_intra_bitrate_pct); - MAP(VP9E_SET_LOSSLESS, extra_cfg.lossless); - MAP(VP9E_SET_FRAME_PARALLEL_DECODING, - extra_cfg.frame_parallel_decoding_mode); - MAP(VP9E_SET_AQ_MODE, extra_cfg.aq_mode); - MAP(VP9E_SET_FRAME_PERIODIC_BOOST, extra_cfg.frame_periodic_boost); - } +static vpx_codec_err_t ctrl_set_sharpness(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.sharpness = CAST(VP8E_SET_SHARPNESS, args); + return update_extra_cfg(ctx, &extra_cfg); +} - res = validate_config(ctx, &ctx->cfg, &extra_cfg); +static vpx_codec_err_t ctrl_set_static_thresh(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.static_thresh = CAST(VP8E_SET_STATIC_THRESHOLD, args); + return update_extra_cfg(ctx, &extra_cfg); +} - if (res == VPX_CODEC_OK) { - ctx->extra_cfg = extra_cfg; - set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); - vp9_change_config(ctx->cpi, &ctx->oxcf); - } +static vpx_codec_err_t ctrl_set_tile_columns(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tile_columns = CAST(VP9E_SET_TILE_COLUMNS, args); + return update_extra_cfg(ctx, &extra_cfg); +} - return res; -#undef MAP +static vpx_codec_err_t ctrl_set_tile_rows(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct 
vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tile_rows = CAST(VP9E_SET_TILE_ROWS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_arnr_max_frames(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.arnr_max_frames = CAST(VP8E_SET_ARNR_MAXFRAMES, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_arnr_strength(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.arnr_strength = CAST(VP8E_SET_ARNR_STRENGTH, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_arnr_type(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.arnr_type = CAST(VP8E_SET_ARNR_TYPE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_tuning(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tuning = CAST(VP8E_SET_TUNING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_cq_level(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.cq_level = CAST(VP8E_SET_CQ_LEVEL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_rc_max_intra_bitrate_pct( + vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.rc_max_intra_bitrate_pct = + CAST(VP8E_SET_MAX_INTRA_BITRATE_PCT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_lossless(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.lossless = CAST(VP9E_SET_LOSSLESS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_frame_parallel_decoding_mode( + vpx_codec_alg_priv_t *ctx, 
va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.frame_parallel_decoding_mode = + CAST(VP9E_SET_FRAME_PARALLEL_DECODING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_aq_mode(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.aq_mode = CAST(VP9E_SET_AQ_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_frame_periodic_boost(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.frame_periodic_boost = CAST(VP9E_SET_FRAME_PERIODIC_BOOST, args); + return update_extra_cfg(ctx, &extra_cfg); } static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, @@ -887,9 +983,8 @@ static const vpx_codec_cx_pkt_t *encoder_get_cxdata(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { vpx_ref_frame_t *const frame = va_arg(args, vpx_ref_frame_t *); - (void)ctrl_id; if (frame != NULL) { YV12_BUFFER_CONFIG sd; @@ -904,9 +999,8 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { vpx_ref_frame_t *const frame = va_arg(args, vpx_ref_frame_t *); - (void)ctrl_id; if (frame != NULL) { YV12_BUFFER_CONFIG sd; @@ -921,9 +1015,8 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { vp9_ref_frame_t *const frame = va_arg(args, vp9_ref_frame_t *); - (void)ctrl_id; if (frame != NULL) { YV12_BUFFER_CONFIG *fb; @@ -937,11 +1030,9 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { 
#if CONFIG_VP9_POSTPROC vp8_postproc_cfg_t *config = va_arg(args, vp8_postproc_cfg_t *); - (void)ctr_id; - if (config != NULL) { ctx->preview_ppcfg = *config; return VPX_CODEC_OK; @@ -950,7 +1041,6 @@ static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, } #else (void)ctx; - (void)ctr_id; (void)args; return VPX_CODEC_INCAPABLE; #endif @@ -977,36 +1067,32 @@ static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) { } static vpx_codec_err_t ctrl_update_entropy(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { const int update = va_arg(args, int); - (void)ctrl_id; vp9_update_entropy(ctx->cpi, update); return VPX_CODEC_OK; } static vpx_codec_err_t ctrl_update_reference(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { const int ref_frame_flags = va_arg(args, int); - (void)ctrl_id; vp9_update_reference(ctx->cpi, ref_frame_flags); return VPX_CODEC_OK; } static vpx_codec_err_t ctrl_use_reference(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { const int reference_flag = va_arg(args, int); - (void)ctrl_id; vp9_use_as_reference(ctx->cpi, reference_flag); return VPX_CODEC_OK; } static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { (void)ctx; - (void)ctrl_id; (void)args; // TODO(yaowu): Need to re-implement and test for VP9. 
@@ -1015,9 +1101,8 @@ static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { vpx_active_map_t *const map = va_arg(args, vpx_active_map_t *); - (void)ctrl_id; if (map) { if (!vp9_set_active_map(ctx->cpi, map->active_map, @@ -1031,9 +1116,8 @@ static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *); - (void)ctrl_id; if (mode) { const int res = vp9_set_internal_size(ctx->cpi, @@ -1045,11 +1129,9 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, } } -static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, int ctrl_id, - va_list args) { +static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) { int data = va_arg(args, int); const vpx_codec_enc_cfg_t *cfg = &ctx->cfg; - (void)ctrl_id; vp9_set_svc(ctx->cpi, data); // CBR or two pass mode for SVC with both temporal and spatial layers @@ -1066,11 +1148,10 @@ static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, int ctrl_id, } static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { vpx_svc_layer_id_t *const data = va_arg(args, vpx_svc_layer_id_t *); VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi; SVC *const svc = &cpi->svc; - (void)ctrl_id; svc->spatial_layer_id = data->spatial_layer_id; svc->temporal_layer_id = data->temporal_layer_id; @@ -1087,10 +1168,9 @@ static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { VP9_COMP *const cpi = ctx->cpi; vpx_svc_parameters_t *const params = va_arg(args, vpx_svc_parameters_t *); - (void)ctrl_id; if 
(params == NULL) return VPX_CODEC_INVALID_PARAM; @@ -1126,30 +1206,30 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { {VP8E_SET_ROI_MAP, ctrl_set_roi_map}, {VP8E_SET_ACTIVEMAP, ctrl_set_active_map}, {VP8E_SET_SCALEMODE, ctrl_set_scale_mode}, - {VP8E_SET_CPUUSED, ctrl_set_param}, - {VP8E_SET_NOISE_SENSITIVITY, ctrl_set_param}, - {VP8E_SET_ENABLEAUTOALTREF, ctrl_set_param}, - {VP8E_SET_SHARPNESS, ctrl_set_param}, - {VP8E_SET_STATIC_THRESHOLD, ctrl_set_param}, - {VP9E_SET_TILE_COLUMNS, ctrl_set_param}, - {VP9E_SET_TILE_ROWS, ctrl_set_param}, - {VP8E_SET_ARNR_MAXFRAMES, ctrl_set_param}, - {VP8E_SET_ARNR_STRENGTH, ctrl_set_param}, - {VP8E_SET_ARNR_TYPE, ctrl_set_param}, - {VP8E_SET_TUNING, ctrl_set_param}, - {VP8E_SET_CQ_LEVEL, ctrl_set_param}, - {VP8E_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_param}, - {VP9E_SET_LOSSLESS, ctrl_set_param}, - {VP9E_SET_FRAME_PARALLEL_DECODING, ctrl_set_param}, - {VP9E_SET_AQ_MODE, ctrl_set_param}, - {VP9E_SET_FRAME_PERIODIC_BOOST, ctrl_set_param}, + {VP8E_SET_CPUUSED, ctrl_set_cpuused}, + {VP8E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity}, + {VP8E_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref}, + {VP8E_SET_SHARPNESS, ctrl_set_sharpness}, + {VP8E_SET_STATIC_THRESHOLD, ctrl_set_static_thresh}, + {VP9E_SET_TILE_COLUMNS, ctrl_set_tile_columns}, + {VP9E_SET_TILE_ROWS, ctrl_set_tile_rows}, + {VP8E_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames}, + {VP8E_SET_ARNR_STRENGTH, ctrl_set_arnr_strength}, + {VP8E_SET_ARNR_TYPE, ctrl_set_arnr_type}, + {VP8E_SET_TUNING, ctrl_set_tuning}, + {VP8E_SET_CQ_LEVEL, ctrl_set_cq_level}, + {VP8E_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_rc_max_intra_bitrate_pct}, + {VP9E_SET_LOSSLESS, ctrl_set_lossless}, + {VP9E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode}, + {VP9E_SET_AQ_MODE, ctrl_set_aq_mode}, + {VP9E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost}, {VP9E_SET_SVC, ctrl_set_svc}, {VP9E_SET_SVC_PARAMETERS, ctrl_set_svc_parameters}, {VP9E_SET_SVC_LAYER_ID, 
ctrl_set_svc_layer_id}, // Getters - {VP8E_GET_LAST_QUANTIZER, ctrl_get_param}, - {VP8E_GET_LAST_QUANTIZER_64, ctrl_get_param}, + {VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer}, + {VP8E_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64}, {VP9_GET_REFERENCE, ctrl_get_reference}, { -1, NULL}, @@ -1182,7 +1262,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { VPX_VBR, // rc_end_usage #if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION) - {0}, // rc_twopass_stats_in + {NULL, 0}, // rc_twopass_stats_in #endif 256, // rc_target_bandwidth 0, // rc_min_quantizer diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 328b98fee..48110b414 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -38,7 +38,6 @@ struct vpx_codec_alg_priv { vpx_decrypt_cb decrypt_cb; void *decrypt_state; vpx_image_t img; - int img_avail; int invert_tile_order; // External frame buffer info to save for VP9 common. @@ -48,10 +47,12 @@ struct vpx_codec_alg_priv { }; static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, - vpx_codec_priv_enc_mr_cfg_t *data) { + vpx_codec_priv_enc_mr_cfg_t *data) { // This function only allocates space for the vpx_codec_alg_priv_t // structure. More memory may be required at the time the stream // information becomes known. + (void)data; + if (!ctx->priv) { vpx_codec_alg_priv_t *alg_priv = vpx_memalign(32, sizeof(*alg_priv)); if (alg_priv == NULL) @@ -243,14 +244,11 @@ static void init_decoder(vpx_codec_alg_priv_t *ctx) { static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, const uint8_t **data, unsigned int data_sz, void *user_priv, int64_t deadline) { - YV12_BUFFER_CONFIG sd = { 0 }; vp9_ppflags_t flags = {0}; VP9_COMMON *cm = NULL; (void)deadline; - ctx->img_avail = 0; - // Determine the stream parameters. Note that we rely on peek_si to // validate that we have a buffer that does not wrap around the top // of the heap. 
@@ -260,6 +258,9 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, ctx->decrypt_state); if (res != VPX_CODEC_OK) return res; + + if (!ctx->si.is_kf) + return VPX_CODEC_ERROR; } // Initialize the decoder instance on the first frame @@ -282,13 +283,6 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) set_ppflags(ctx, &flags); - if (vp9_get_raw_frame(ctx->pbi, &sd, &flags)) - return update_error_state(ctx, &cm->error); - - yuvconfig2image(&ctx->img, &sd, user_priv); - ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; - ctx->img_avail = 1; - return VPX_CODEC_OK; } @@ -417,15 +411,20 @@ static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter) { vpx_image_t *img = NULL; - if (ctx->img_avail) { - // iter acts as a flip flop, so an image is only returned on the first - // call to get_frame. - if (!(*iter)) { + // iter acts as a flip flop, so an image is only returned on the first + // call to get_frame. 
+ if (*iter == NULL && ctx->pbi != NULL) { + YV12_BUFFER_CONFIG sd; + vp9_ppflags_t flags = {0, 0, 0}; + + if (vp9_get_raw_frame(ctx->pbi, &sd, &flags) == 0) { + VP9_COMMON *cm = &ctx->pbi->common; + yuvconfig2image(&ctx->img, &sd, NULL); + ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; img = &ctx->img; *iter = img; } } - ctx->img_avail = 0; return img; } @@ -449,7 +448,7 @@ static vpx_codec_err_t decoder_set_fb_fn( } static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *); if (data) { @@ -465,7 +464,7 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); if (data) { @@ -482,7 +481,7 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); if (data) { @@ -497,7 +496,7 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { #if CONFIG_VP9_POSTPROC vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); @@ -509,17 +508,21 @@ static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_INVALID_PARAM; } #else + (void)ctx; + (void)args; return VPX_CODEC_INCAPABLE; #endif } static vpx_codec_err_t ctrl_set_dbg_options(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { + (void)ctx; + (void)args; return VPX_CODEC_INCAPABLE; } static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { int *const 
update_info = va_arg(args, int *); if (update_info) { @@ -535,7 +538,7 @@ static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { int *corrupted = va_arg(args, int *); if (corrupted) { @@ -550,7 +553,7 @@ static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { int *const display_size = va_arg(args, int *); if (display_size) { @@ -568,13 +571,12 @@ static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_set_invert_tile_order(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { ctx->invert_tile_order = va_arg(args, int); return VPX_CODEC_OK; } static vpx_codec_err_t ctrl_set_decryptor(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { vpx_decrypt_init *init = va_arg(args, vpx_decrypt_init *); ctx->decrypt_cb = init ? 
init->decrypt_cb : NULL; @@ -625,11 +627,12 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = { decoder_set_fb_fn, // vpx_codec_set_fb_fn_t }, { // NOLINT - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED + NOT_IMPLEMENTED, // vpx_codec_enc_cfg_map_t + NOT_IMPLEMENTED, // vpx_codec_encode_fn_t + NOT_IMPLEMENTED, // vpx_codec_get_cx_data_fn_t + NOT_IMPLEMENTED, // vpx_codec_enc_config_set_fn_t + NOT_IMPLEMENTED, // vpx_codec_get_global_headers_fn_t + NOT_IMPLEMENTED, // vpx_codec_get_preview_frame_fn_t + NOT_IMPLEMENTED // vpx_codec_enc_mr_get_mem_loc_fn_t } }; diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 6e5c521ab..a44ffc1ca 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -112,12 +112,12 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm endif ifeq ($(ARCH_X86_64),yes) -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.asm +VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm +VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3_x86_64.asm endif VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm -VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm +VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c |