29 files changed, 810 insertions, 443 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 8ca356dd6..9088b0bde 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -228,8 +228,6 @@ typedef struct macroblockd {
   DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
 
   int lossless;
-  /* Inverse transform function pointers. */
-  void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
 
   int corrupted;
 
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index d86877622..3253bcbf4 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -33,6 +33,9 @@ extern "C" {
 #define pair_set_epi16(a, b) \
   _mm_set_epi16(b, a, b, a, b, a, b, a)
 
+#define dual_set_epi16(a, b) \
+  _mm_set_epi16(b, b, b, b, a, a, a, a)  /* words 0-3 = a, words 4-7 = b */
+
 // Constants:
 //  for (int i = 1; i< 32; ++i)
 //    printf("static const int cospi_%d_64 = %.0f;\n", i,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index c300cde62..09ce72ef2 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -360,7 +360,7 @@ specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
 $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
 
 add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_10_add sse2 neon_asm dspr2/;
+specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
 $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
 
 add_proto qw/void vp9_idct32x32_1024_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
@@ -422,10 +422,6 @@ specialize qw/vp9_variance64x64/, "$sse2_x86inc", "$avx2_x86inc";
 add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance16x16 mmx/, "$sse2_x86inc", "$avx2_x86inc";
 
-add_proto qw/void vp9_get_sse_sum_16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get_sse_sum_16x16 sse2/;
-$vp9_get_sse_sum_16x16_sse2=vp9_get16x16var_sse2;
-
 add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";
 
@@ -435,9 +431,11 @@ specialize qw/vp9_variance8x16 mmx/, "$sse2_x86inc";
 add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance8x8 mmx/, "$sse2_x86inc";
 
-add_proto qw/void vp9_get_sse_sum_8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get_sse_sum_8x8 sse2/;
-$vp9_get_sse_sum_8x8_sse2=vp9_get8x8var_sse2;
+add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+specialize qw/vp9_get8x8var mmx/, "$sse2_x86inc";
+
+add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+specialize qw/vp9_get16x16var avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance8x4/, "$sse2_x86inc";
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index ff9c43221..b60f8a06d 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -375,15 +375,6 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
     out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
   }
 
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
-  {                                                     \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
-                                                        \
-    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
-    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
-  }
-
 #define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
   {                                            \
     const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
@@ -612,23 +603,6 @@ void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   RECON_AND_STORE(dest, dc_value);
 }
 
-static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-
-  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
-  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
-  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
-  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
-}
-
 static void idct8_sse2(__m128i *in) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.h b/vp9/common/x86/vp9_idct_intrin_sse2.h
index 1c62e3272..0f179b49a 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.h
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.h
@@ -45,6 +45,32 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
   res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
 }
 
+#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
+  {                                                     \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+    /* Results are written back to in0/in1; out0/out1 are unused. */ \
+    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
+    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
+  }
+
+static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+  // The 16-bit unpacklo above reads only the low 4 words of in[0..7].
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+  // Merge 64-bit halves: out[k] = word k of in[0], in[1], ..., in[7].
+  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
+  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
+  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
+  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+}
+
 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
   __m128i tbuf[8];
   array_transpose_8x8(res0, res0);
diff --git a/vp9/common/x86/vp9_idct_intrin_ssse3.c b/vp9/common/x86/vp9_idct_intrin_ssse3.c
index e5d3cb5f4..73bf5d1d7 100644
--- a/vp9/common/x86/vp9_idct_intrin_ssse3.c
+++ b/vp9/common/x86/vp9_idct_intrin_ssse3.c
@@ -16,7 +16,7 @@
 #include <tmmintrin.h>  // SSSE3
 #include "vp9/common/x86/vp9_idct_intrin_sse2.h"
 
-static void idct16_8col(__m128i *in) {
+static void idct16_8col(__m128i *in, int round) {
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
@@ -36,6 +36,8 @@ static void idct16_8col(__m128i *in) {
   const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i k__cospi_p16_p16_x2 = pair_set_epi16(23170, 23170);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
 
   __m128i v[16], u[16], s[16], t[16];
 
@@ -266,28 +268,80 @@ static void idct16_8col(__m128i *in) {
   t[15] = _mm_add_epi16(s[12], s[15]);
 
   // stage 6
-  s[0] = _mm_add_epi16(t[0], t[7]);
-  s[1] = _mm_add_epi16(t[1], t[6]);
-  s[2] = _mm_add_epi16(t[2], t[5]);
-  s[3] = _mm_add_epi16(t[3], t[4]);
-  s[4] = _mm_sub_epi16(t[3], t[4]);
-  s[5] = _mm_sub_epi16(t[2], t[5]);
-  s[6] = _mm_sub_epi16(t[1], t[6]);
-  s[7] = _mm_sub_epi16(t[0], t[7]);
-  s[8] = t[8];
-  s[9] = t[9];
-
-  u[0] = _mm_sub_epi16(t[13], t[10]);
-  u[1] = _mm_add_epi16(t[13], t[10]);
-  u[2] = _mm_sub_epi16(t[12], t[11]);
-  u[3] = _mm_add_epi16(t[12], t[11]);
-
-  s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
-  s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
-  s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2);
-  s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2);
-  s[14] = t[14];
-  s[15] = t[15];
+  if (round == 1) {
+    s[0] = _mm_add_epi16(t[0], t[7]);
+    s[1] = _mm_add_epi16(t[1], t[6]);
+    s[2] = _mm_add_epi16(t[2], t[5]);
+    s[3] = _mm_add_epi16(t[3], t[4]);
+    s[4] = _mm_sub_epi16(t[3], t[4]);
+    s[5] = _mm_sub_epi16(t[2], t[5]);
+    s[6] = _mm_sub_epi16(t[1], t[6]);
+    s[7] = _mm_sub_epi16(t[0], t[7]);
+    s[8] = t[8];
+    s[9] = t[9];
+
+    u[0] = _mm_unpacklo_epi16(t[10], t[13]);
+    u[1] = _mm_unpackhi_epi16(t[10], t[13]);
+    u[2] = _mm_unpacklo_epi16(t[11], t[12]);
+    u[3] = _mm_unpackhi_epi16(t[11], t[12]);
+
+    v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+    v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+    v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+    v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+    v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+    v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+    v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+    v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+
+    u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+    u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+    u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+    u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+    u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+    u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+    u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+    u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+    u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+    u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+    u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+    u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+    u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+    u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+    u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+    u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+    s[10] = _mm_packs_epi32(u[0], u[1]);
+    s[13] = _mm_packs_epi32(u[2], u[3]);
+    s[11] = _mm_packs_epi32(u[4], u[5]);
+    s[12] = _mm_packs_epi32(u[6], u[7]);
+    s[14] = t[14];
+    s[15] = t[15];
+  } else {
+    s[0] = _mm_add_epi16(t[0], t[7]);
+    s[1] = _mm_add_epi16(t[1], t[6]);
+    s[2] = _mm_add_epi16(t[2], t[5]);
+    s[3] = _mm_add_epi16(t[3], t[4]);
+    s[4] = _mm_sub_epi16(t[3], t[4]);
+    s[5] = _mm_sub_epi16(t[2], t[5]);
+    s[6] = _mm_sub_epi16(t[1], t[6]);
+    s[7] = _mm_sub_epi16(t[0], t[7]);
+    s[8] = t[8];
+    s[9] = t[9];
+
+    u[0] = _mm_sub_epi16(t[13], t[10]);
+    u[1] = _mm_add_epi16(t[13], t[10]);
+    u[2] = _mm_sub_epi16(t[12], t[11]);
+    u[3] = _mm_add_epi16(t[12], t[11]);
+
+    s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
+    s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
+    s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2);
+    s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2);
+    s[14] = t[14];
+    s[15] = t[15];
+  }
 
   // stage 7
   in[0] = _mm_add_epi16(s[0], s[15]);
@@ -308,10 +362,10 @@ static void idct16_8col(__m128i *in) {
   in[15] = _mm_sub_epi16(s[0], s[15]);
 }
 
-static void idct16_sse2(__m128i *in0, __m128i *in1) {
+static void idct16_sse2(__m128i *in0, __m128i *in1, int round) {
   array_transpose_16x16(in0, in1);
-  idct16_8col(in0);
-  idct16_8col(in1);
+  idct16_8col(in0, round);
+  idct16_8col(in1, round);
 }
 
 void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest,
@@ -322,10 +376,387 @@ void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest,
   input += 8;
   load_buffer_8x16(input, in1);
 
-  idct16_sse2(in0, in1);
-  idct16_sse2(in0, in1);
+  idct16_sse2(in0, in1, 0);
+  idct16_sse2(in0, in1, 1);
 
   write_buffer_8x16(dest, in0, stride);
   dest += 8;
   write_buffer_8x16(dest, in1, stride);
 }
+
+static void idct16_10_r1(__m128i *in, __m128i *l) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i zero = _mm_setzero_si128();
+  // First 1-D pass: consumes in[0] and in[1], produces l[0..15].
+  const __m128i stg2_01 = dual_set_epi16(3212, 32610);
+  const __m128i stg2_67 = dual_set_epi16(-9512, 31358);
+  const __m128i stg3_01 = dual_set_epi16(6392, 32138);
+  const __m128i stg4_01 = dual_set_epi16(23170, 23170);
+  // NOTE(review): these literals appear to be 2x the cospi_*_64 constants,
+  // pre-doubled for _mm_mulhrs_epi16 (cf. k__cospi_p16_p16_x2 in
+  // idct16_8col) -- confirm against vp9_idct.h.
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+  __m128i stp1_0, stp1_1, stp1_4, stp1_6,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4;
+
+  // Stage2
+  {
+    const __m128i lo_1_15 = _mm_unpackhi_epi64(in[0], in[0]);
+    const __m128i lo_13_3 = _mm_unpackhi_epi64(in[1], in[1]);
+
+    stp2_8  = _mm_mulhrs_epi16(lo_1_15, stg2_01);
+    stp2_11 = _mm_mulhrs_epi16(lo_13_3, stg2_67);
+  }
+
+  // Stage3
+  {
+    const __m128i lo_2_14 = _mm_unpacklo_epi64(in[1], in[1]);
+    stp1_4 = _mm_mulhrs_epi16(lo_2_14, stg3_01);
+
+    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
+    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
+  }
+
+  // Stage4
+  {
+    const __m128i lo_0_8 = _mm_unpacklo_epi64(in[0], in[0]);
+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
+
+    tmp0 = _mm_mulhrs_epi16(lo_0_8, stg4_01);
+    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
+    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
+    tmp2 = _mm_madd_epi16(lo_10_13, stg4_6);
+    tmp4 = _mm_madd_epi16(lo_10_13, stg4_7);
+
+    tmp1 = _mm_add_epi32(tmp1, rounding);
+    tmp3 = _mm_add_epi32(tmp3, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+
+    stp1_0 = _mm_unpacklo_epi64(tmp0, tmp0);
+    stp1_1 = _mm_unpackhi_epi64(tmp0, tmp0);
+    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
+    stp2_10 = _mm_packs_epi32(tmp2, tmp4);
+
+    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
+  }
+
+  // Stage5 and Stage6
+  {
+    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
+    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
+    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
+    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
+
+    stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
+    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
+    stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
+    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
+
+    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
+    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
+    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
+    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
+  }
+
+  // Stage6
+  {
+    const __m128i lo_6_5 = _mm_add_epi16(stp2_6, stp1_4);
+    const __m128i lo_6_6 = _mm_sub_epi16(stp2_6, stp1_4);
+    const __m128i lo_10_13 = _mm_sub_epi16(stp1_13, stp1_10);
+    const __m128i lo_10_14 = _mm_add_epi16(stp1_13, stp1_10);
+    const __m128i lo_11_12 = _mm_sub_epi16(stp1_12, stp1_11);
+    const __m128i lo_11_13 = _mm_add_epi16(stp1_12, stp1_11);
+
+    tmp1 = _mm_unpacklo_epi64(lo_6_5, lo_6_6);
+    tmp0 = _mm_unpacklo_epi64(lo_10_13, lo_10_14);
+    tmp4 = _mm_unpacklo_epi64(lo_11_12, lo_11_13);
+
+    stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01);
+    tmp0   = _mm_mulhrs_epi16(tmp0, stg4_01);
+    tmp4   = _mm_mulhrs_epi16(tmp4, stg4_01);
+
+    stp2_10 = _mm_unpacklo_epi64(tmp0, zero);
+    stp2_13 = _mm_unpackhi_epi64(tmp0, zero);
+    stp2_11 = _mm_unpacklo_epi64(tmp4, zero);
+    stp2_12 = _mm_unpackhi_epi64(tmp4, zero);
+
+    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
+    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
+    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
+    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
+
+    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
+    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
+    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
+    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
+    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
+    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
+    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
+    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
+  }
+
+  // Stage7. Left 8x16 only.
+  l[0] = _mm_add_epi16(stp2_0, stp1_15);
+  l[1] = _mm_add_epi16(stp2_1, stp1_14);
+  l[2] = _mm_add_epi16(stp2_2, stp2_13);
+  l[3] = _mm_add_epi16(stp2_3, stp2_12);
+  l[4] = _mm_add_epi16(stp2_4, stp2_11);
+  l[5] = _mm_add_epi16(stp2_5, stp2_10);
+  l[6] = _mm_add_epi16(stp2_6, stp1_9);
+  l[7] = _mm_add_epi16(stp2_7, stp1_8);
+  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
+  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
+  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
+  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
+  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
+  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
+  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
+  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
+}
+
+static void idct16_10_r2(__m128i *in) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // Second 1-D pass, in place: consumes in[0..3], overwrites in[0..15].
+  const __m128i stg2_0 = dual_set_epi16(3212, 3212);
+  const __m128i stg2_1 = dual_set_epi16(32610, 32610);
+  const __m128i stg2_6 = dual_set_epi16(-9512, -9512);
+  const __m128i stg2_7 = dual_set_epi16(31358, 31358);
+  const __m128i stg3_0 = dual_set_epi16(6392, 6392);
+  const __m128i stg3_1 = dual_set_epi16(32138, 32138);
+  const __m128i stg4_01 = dual_set_epi16(23170, 23170);
+
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+  __m128i stp1_0, stp1_2, stp1_3, stp1_5, stp1_6,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_8_0, stp1_12_0;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  /* Stage2 */
+  {
+    stp1_8_0  = _mm_mulhrs_epi16(in[1], stg2_0);
+    stp1_15   = _mm_mulhrs_epi16(in[1], stg2_1);
+    stp1_11   = _mm_mulhrs_epi16(in[3], stg2_6);
+    stp1_12_0 = _mm_mulhrs_epi16(in[3], stg2_7);
+  }
+
+  /* Stage3 */
+  {
+    stp2_4 = _mm_mulhrs_epi16(in[2], stg3_0);
+    stp2_7 = _mm_mulhrs_epi16(in[2], stg3_1);
+
+    stp1_9  =  stp1_8_0;
+    stp1_10 =  stp1_11;
+
+    stp1_13 = stp1_12_0;
+    stp1_14 = stp1_15;
+  }
+
+  /* Stage4 */
+  {
+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
+    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
+
+    stp1_0 = _mm_mulhrs_epi16(in[0], stg4_01);
+
+    stp2_5 = stp2_4;
+    stp2_6 = stp2_7;
+
+    // 32-bit multiply-add, then round and shift back down to 16 bits.
+    tmp0 = _mm_madd_epi16(lo_9_14, stg4_4);
+    tmp1 = _mm_madd_epi16(hi_9_14, stg4_4);
+    tmp2 = _mm_madd_epi16(lo_9_14, stg4_5);
+    tmp3 = _mm_madd_epi16(hi_9_14, stg4_5);
+    tmp4 = _mm_madd_epi16(lo_10_13, stg4_6);
+    tmp5 = _mm_madd_epi16(hi_10_13, stg4_6);
+    tmp6 = _mm_madd_epi16(lo_10_13, stg4_7);
+    tmp7 = _mm_madd_epi16(hi_10_13, stg4_7);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp1 = _mm_add_epi32(tmp1, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp3 = _mm_add_epi32(tmp3, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+    tmp5 = _mm_add_epi32(tmp5, rounding);
+    tmp6 = _mm_add_epi32(tmp6, rounding);
+    tmp7 = _mm_add_epi32(tmp7, rounding);
+
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+    stp2_9 = _mm_packs_epi32(tmp0, tmp1);
+    stp2_14 = _mm_packs_epi32(tmp2, tmp3);
+    stp2_10 = _mm_packs_epi32(tmp4, tmp5);
+    stp2_13 = _mm_packs_epi32(tmp6, tmp7);
+  }
+
+  /* Stage5 */
+  {
+    stp1_2 = stp1_0;
+    stp1_3 = stp1_0;
+
+    tmp0 = _mm_sub_epi16(stp2_6, stp2_5);
+    tmp1 = _mm_add_epi16(stp2_6, stp2_5);
+
+    stp1_5 = _mm_mulhrs_epi16(tmp0, stg4_01);
+    stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01);
+
+    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
+    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
+    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
+    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);
+
+    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
+    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
+    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
+    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
+  }
+
+  /* Stage6 */
+  {
+    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
+    stp2_1 = _mm_add_epi16(stp1_0, stp1_6);
+    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
+    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
+
+    tmp0 = _mm_sub_epi16(stp1_13, stp1_10);
+    tmp1 = _mm_add_epi16(stp1_13, stp1_10);
+    tmp2 = _mm_sub_epi16(stp1_12, stp1_11);
+    tmp3 = _mm_add_epi16(stp1_12, stp1_11);
+
+    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
+    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
+    stp2_6 = _mm_sub_epi16(stp1_0, stp1_6);
+    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
+
+    stp2_10 = _mm_mulhrs_epi16(tmp0, stg4_01);
+    stp2_13 = _mm_mulhrs_epi16(tmp1, stg4_01);
+    stp2_11 = _mm_mulhrs_epi16(tmp2, stg4_01);
+    stp2_12 = _mm_mulhrs_epi16(tmp3, stg4_01);
+  }
+
+  // Stage7
+  in[0] = _mm_add_epi16(stp2_0, stp1_15);
+  in[1] = _mm_add_epi16(stp2_1, stp1_14);
+  in[2] = _mm_add_epi16(stp2_2, stp2_13);
+  in[3] = _mm_add_epi16(stp2_3, stp2_12);
+  in[4] = _mm_add_epi16(stp2_4, stp2_11);
+  in[5] = _mm_add_epi16(stp2_5, stp2_10);
+  in[6] = _mm_add_epi16(stp2_6, stp1_9);
+  in[7] = _mm_add_epi16(stp2_7, stp1_8);
+  in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+  in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+  in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+  in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+  in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+  in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+  in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+  in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+}
+
+void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest,
+                               int stride) {
+  const __m128i final_rounding = _mm_set1_epi16(1<<5);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i in[16], l[16];
+  // NOTE(review): zero looks unused; presumably RECON_AND_STORE reads it.
+  int i;
+  // First 1-D inverse DCT
+  // Load input data.
+  in[0] = _mm_load_si128((const __m128i *)input);
+  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+  // TRANSPOSE_8X4 leaves its result in in[0] and in[1] (first two arguments).
+  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
+
+  idct16_10_r1(in, l);
+
+  // Second 1-D inverse transform, performed per 8x16 block
+  for (i = 0; i < 2; i++) {
+    array_transpose_4X8(l + 8*i, in);
+
+    idct16_10_r2(in);
+
+    // Final rounding and shift
+    in[0] = _mm_adds_epi16(in[0], final_rounding);
+    in[1] = _mm_adds_epi16(in[1], final_rounding);
+    in[2] = _mm_adds_epi16(in[2], final_rounding);
+    in[3] = _mm_adds_epi16(in[3], final_rounding);
+    in[4] = _mm_adds_epi16(in[4], final_rounding);
+    in[5] = _mm_adds_epi16(in[5], final_rounding);
+    in[6] = _mm_adds_epi16(in[6], final_rounding);
+    in[7] = _mm_adds_epi16(in[7], final_rounding);
+    in[8] = _mm_adds_epi16(in[8], final_rounding);
+    in[9] = _mm_adds_epi16(in[9], final_rounding);
+    in[10] = _mm_adds_epi16(in[10], final_rounding);
+    in[11] = _mm_adds_epi16(in[11], final_rounding);
+    in[12] = _mm_adds_epi16(in[12], final_rounding);
+    in[13] = _mm_adds_epi16(in[13], final_rounding);
+    in[14] = _mm_adds_epi16(in[14], final_rounding);
+    in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+    in[0] = _mm_srai_epi16(in[0], 6);
+    in[1] = _mm_srai_epi16(in[1], 6);
+    in[2] = _mm_srai_epi16(in[2], 6);
+    in[3] = _mm_srai_epi16(in[3], 6);
+    in[4] = _mm_srai_epi16(in[4], 6);
+    in[5] = _mm_srai_epi16(in[5], 6);
+    in[6] = _mm_srai_epi16(in[6], 6);
+    in[7] = _mm_srai_epi16(in[7], 6);
+    in[8] = _mm_srai_epi16(in[8], 6);
+    in[9] = _mm_srai_epi16(in[9], 6);
+    in[10] = _mm_srai_epi16(in[10], 6);
+    in[11] = _mm_srai_epi16(in[11], 6);
+    in[12] = _mm_srai_epi16(in[12], 6);
+    in[13] = _mm_srai_epi16(in[13], 6);
+    in[14] = _mm_srai_epi16(in[14], 6);
+    in[15] = _mm_srai_epi16(in[15], 6);
+
+    RECON_AND_STORE(dest, in[0]);
+    RECON_AND_STORE(dest, in[1]);
+    RECON_AND_STORE(dest, in[2]);
+    RECON_AND_STORE(dest, in[3]);
+    RECON_AND_STORE(dest, in[4]);
+    RECON_AND_STORE(dest, in[5]);
+    RECON_AND_STORE(dest, in[6]);
+    RECON_AND_STORE(dest, in[7]);
+    RECON_AND_STORE(dest, in[8]);
+    RECON_AND_STORE(dest, in[9]);
+    RECON_AND_STORE(dest, in[10]);
+    RECON_AND_STORE(dest, in[11]);
+    RECON_AND_STORE(dest, in[12]);
+    RECON_AND_STORE(dest, in[13]);
+    RECON_AND_STORE(dest, in[14]);
+    RECON_AND_STORE(dest, in[15]);
+    // Presumably RECON_AND_STORE advanced dest 16 rows; rewind, move right 8.
+    dest += 8 - (stride * 16);
+  }
+}
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 121b1f2cd..fc70035f2 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -195,30 +195,32 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
   struct macroblockd_plane *const pd = &xd->plane[plane];
   if (eob > 0) {
     TX_TYPE tx_type;
-    const PLANE_TYPE plane_type = pd->plane_type;
     int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-    switch (tx_size) {
-      case TX_4X4:
-        tx_type = get_tx_type_4x4(plane_type, xd, block);
-        if (tx_type == DCT_DCT)
-          xd->itxm_add(dqcoeff, dst, stride, eob);
-        else
-          vp9_iht4x4_16_add(dqcoeff, dst, stride, tx_type);
-        break;
-      case TX_8X8:
-        tx_type = get_tx_type(plane_type, xd);
-        vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
-        break;
-      case TX_16X16:
-        tx_type = get_tx_type(plane_type, xd);
-        vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
-        break;
-      case TX_32X32:
-        tx_type = DCT_DCT;
-        vp9_idct32x32_add(dqcoeff, dst, stride, eob);
-        break;
-      default:
-        assert(0 && "Invalid transform size");
+    if (xd->lossless) {
+      tx_type = DCT_DCT;
+      vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+    } else {
+      const PLANE_TYPE plane_type = pd->plane_type;
+      switch (tx_size) {
+        case TX_4X4:
+          tx_type = get_tx_type_4x4(plane_type, xd, block);
+          vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+          break;
+        case TX_8X8:
+          tx_type = get_tx_type(plane_type, xd);
+          vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+          break;
+        case TX_16X16:
+          tx_type = get_tx_type(plane_type, xd);
+          vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+          break;
+        case TX_32X32:
+          tx_type = DCT_DCT;
+          vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+          break;
+        default:
+          assert(0 && "Invalid transform size");
+      }
     }
 
     if (eob == 1) {
@@ -588,8 +590,6 @@ static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                  cm->y_dc_delta_q == 0 &&
                  cm->uv_dc_delta_q == 0 &&
                  cm->uv_ac_delta_q == 0;
-
-  xd->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
 }
 
 static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer *rb) {
diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c
index 47ad8d8cc..0d6b41d15 100644
--- a/vp9/encoder/vp9_aq_complexity.c
+++ b/vp9/encoder/vp9_aq_complexity.c
@@ -47,11 +47,21 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
 
     // Use some of the segments for in frame Q adjustment.
     for (segment = 1; segment < 2; segment++) {
-      const int qindex_delta =
+      int qindex_delta =
           vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
                                      in_frame_q_adj_ratio[segment]);
-      vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
-      vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+
+      // For AQ mode 2, we don't allow Q0 in a segment if the base Q is not 0.
+      // Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment Q delta
+      // is sometimes applied without going back around the rd loop.
+      // This could lead to an illegal combination of partition size and q.
+      if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+        qindex_delta = -cm->base_qindex + 1;
+      }
+      if ((cm->base_qindex + qindex_delta) > 0) {
+        vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+        vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+      }
     }
   }
 }
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index c406860a0..c3cd93b78 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -109,6 +109,7 @@ struct macroblock {
   MV pred_mv[MAX_REF_FRAMES];
 
   void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
+  void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
 };
 
 #ifdef __cplusplus
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 6cbc38d79..c1db8263e 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -478,8 +478,8 @@ static void choose_partitioning(VP9_COMP *cpi,
         unsigned int sse = 0;
         int sum = 0;
         if (x_idx < pixels_wide && y_idx < pixels_high)
-          vp9_get_sse_sum_8x8(s + y_idx * sp + x_idx, sp,
-                              d + y_idx * dp + x_idx, dp, &sse, &sum);
+          vp9_get8x8var(s + y_idx * sp + x_idx, sp,
+                        d + y_idx * dp + x_idx, dp, &sse, &sum);
         fill_variance(sse, sum, 64, &vst->split[k].part_variances.none);
       }
     }
@@ -1214,9 +1214,9 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
         int b_offset = b_mi_row * MI_SIZE * src_stride +
                        b_mi_col * MI_SIZE;
 
-        vp9_get_sse_sum_16x16(src + b_offset, src_stride,
-                              pre_src + b_offset, pre_stride,
-                              &d16[j].sse, &d16[j].sum);
+        vp9_get16x16var(src + b_offset, src_stride,
+                        pre_src + b_offset, pre_stride,
+                        &d16[j].sse, &d16[j].sum);
 
         d16[j].var = d16[j].sse -
             (((uint32_t)d16[j].sum * d16[j].sum) >> 8);
@@ -2369,22 +2369,6 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
              sizeof(*xd->above_seg_context) * aligned_mi_cols);
 }
 
-static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
-  if (lossless) {
-    // printf("Switching to lossless\n");
-    cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
-    cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add;
-    cpi->mb.optimize = 0;
-    cpi->common.lf.filter_level = 0;
-    cpi->zbin_mode_boost_enabled = 0;
-    cpi->common.tx_mode = ONLY_4X4;
-  } else {
-    // printf("Not lossless\n");
-    cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
-    cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add;
-  }
-}
-
 static int check_dual_ref_flags(VP9_COMP *cpi) {
   const int ref_flags = cpi->ref_frame_flags;
 
@@ -2421,7 +2405,7 @@ static MV_REFERENCE_FRAME get_frame_type(const VP9_COMP *cpi) {
 }
 
 static TX_MODE select_tx_mode(const VP9_COMP *cpi) {
-  if (cpi->oxcf.lossless) {
+  if (cpi->mb.e_mbd.lossless) {
     return ONLY_4X4;
   } else if (cpi->common.current_video_frame == 0) {
     return TX_MODE_SELECT;
@@ -3011,13 +2995,21 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   vp9_zero(rd_opt->tx_select_diff);
   vp9_zero(rd_opt->tx_select_threshes);
 
-  cm->tx_mode = select_tx_mode(cpi);
-
   cpi->mb.e_mbd.lossless = cm->base_qindex == 0 &&
                            cm->y_dc_delta_q == 0 &&
                            cm->uv_dc_delta_q == 0 &&
                            cm->uv_ac_delta_q == 0;
-  switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);
+
+  cm->tx_mode = select_tx_mode(cpi);
+
+  cpi->mb.fwd_txm4x4 = cpi->mb.e_mbd.lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+  cpi->mb.itxm_add = cpi->mb.e_mbd.lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+
+  if (cpi->mb.e_mbd.lossless) {
+    cpi->mb.optimize = 0;
+    cpi->common.lf.filter_level = 0;
+    cpi->zbin_mode_boost_enabled = 0;
+  }
 
   vp9_frame_init_quantizer(cpi);
 
@@ -3357,7 +3349,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
       vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
     } else {
       mbmi->skip = 1;
-      if (output_enabled)
+      if (output_enabled &&
+          !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
         cm->counts.skip[vp9_get_skip_context(xd)][1]++;
       reset_skip_context(xd, MAX(bsize, BLOCK_8X8));
     }
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 3b231b7f2..8581e6117 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -406,7 +406,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
       // this is like vp9_short_idct4x4 but has a special case around eob<=1
       // which is significant (not just an optimization) for the lossless
       // case.
-      xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+      x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
       break;
     default:
       assert(0 && "Invalid transform size");
@@ -428,7 +428,7 @@ static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
   vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 
   if (p->eobs[block] > 0)
-    xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+    x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
 }
 
 void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
@@ -574,7 +574,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
           // this is like vp9_short_idct4x4 but has a special case around eob<=1
           // which is significant (not just an optimization) for the lossless
           // case.
-          xd->itxm_add(dqcoeff, dst, dst_stride, *eob);
+          x->itxm_add(dqcoeff, dst, dst_stride, *eob);
         else
           vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type);
       }
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 2ce5483d6..0ebc93638 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -393,11 +393,6 @@ static void set_speed_features(VP9_COMP *cpi) {
   // Set rd thresholds based on mode and speed setting
   vp9_set_rd_speed_thresholds(cpi);
   vp9_set_rd_speed_thresholds_sub8x8(cpi);
-
-  cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
-  if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {
-    cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
-  }
 }
 
 static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
@@ -596,16 +591,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   if (cpi->oxcf.mode == REALTIME)
     cpi->oxcf.play_alternate = 0;
 
-  cpi->oxcf.lossless = oxcf->lossless;
-  if (cpi->oxcf.lossless) {
-    // In lossless mode, make sure right quantizer range and correct transform
-    // is set.
-    cpi->oxcf.worst_allowed_q = 0;
-    cpi->oxcf.best_allowed_q = 0;
-    cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add;
-  } else {
-    cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add;
-  }
   rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
   cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
 
@@ -627,33 +612,30 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
 
   // local file playback mode == really big buffer
   if (cpi->oxcf.rc_mode == RC_MODE_VBR) {
-    cpi->oxcf.starting_buffer_level   = 60000;
-    cpi->oxcf.optimal_buffer_level    = 60000;
-    cpi->oxcf.maximum_buffer_size     = 240000;
+    cpi->oxcf.starting_buffer_level_ms = 60000;
+    cpi->oxcf.optimal_buffer_level_ms = 60000;
+    cpi->oxcf.maximum_buffer_size_ms = 240000;
   }
 
-  cpi->oxcf.starting_buffer_level =
-      vp9_rescale(cpi->oxcf.starting_buffer_level,
-                  cpi->oxcf.target_bandwidth, 1000);
+  rc->starting_buffer_level = vp9_rescale(cpi->oxcf.starting_buffer_level_ms,
+                                          cpi->oxcf.target_bandwidth, 1000);
 
   // Set or reset optimal and maximum buffer levels.
-  if (cpi->oxcf.optimal_buffer_level == 0)
-    cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
+  if (cpi->oxcf.optimal_buffer_level_ms == 0)
+    rc->optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
   else
-    cpi->oxcf.optimal_buffer_level =
-        vp9_rescale(cpi->oxcf.optimal_buffer_level,
-                    cpi->oxcf.target_bandwidth, 1000);
+    rc->optimal_buffer_level = vp9_rescale(cpi->oxcf.optimal_buffer_level_ms,
+                                           cpi->oxcf.target_bandwidth, 1000);
 
-  if (cpi->oxcf.maximum_buffer_size == 0)
-    cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
+  if (cpi->oxcf.maximum_buffer_size_ms == 0)
+    rc->maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
   else
-    cpi->oxcf.maximum_buffer_size =
-        vp9_rescale(cpi->oxcf.maximum_buffer_size,
-                    cpi->oxcf.target_bandwidth, 1000);
+    rc->maximum_buffer_size = vp9_rescale(cpi->oxcf.maximum_buffer_size_ms,
+                                          cpi->oxcf.target_bandwidth, 1000);
   // Under a configuration change, where maximum_buffer_size may change,
   // keep buffer level clipped to the maximum allowed buffer size.
-  rc->bits_off_target = MIN(rc->bits_off_target, cpi->oxcf.maximum_buffer_size);
-  rc->buffer_level = MIN(rc->buffer_level, cpi->oxcf.maximum_buffer_size);
+  rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size);
+  rc->buffer_level = MIN(rc->buffer_level, rc->maximum_buffer_size);
 
   // Set up frame rate and related parameters rate control values.
   vp9_new_framerate(cpi, cpi->oxcf.framerate);
@@ -1439,21 +1421,6 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
   vp8_yv12_extend_frame_borders_c(dst);
 }
 
-static int find_fp_qindex() {
-  int i;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    if (vp9_convert_qindex_to_q(i) >= 30.0) {
-      break;
-    }
-  }
-
-  if (i == QINDEX_RANGE)
-    i--;
-
-  return i;
-}
-
 #define WRITE_RECON_BUFFER 0
 #if WRITE_RECON_BUFFER
 void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
@@ -2308,17 +2275,6 @@ static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
   encode_frame_to_data_rate(cpi, size, dest, frame_flags);
 }
 
-static void Pass1Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
-                        unsigned int *frame_flags) {
-  (void) size;
-  (void) dest;
-  (void) frame_flags;
-
-  vp9_rc_get_first_pass_params(cpi);
-  vp9_set_quantizer(&cpi->common, find_fp_qindex());
-  vp9_first_pass(cpi);
-}
-
 static void Pass2Encode(VP9_COMP *cpi, size_t *size,
                         uint8_t *dest, unsigned int *frame_flags) {
   cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
@@ -2658,7 +2614,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
 
   if (cpi->pass == 1 &&
       (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) {
-    Pass1Encode(cpi, size, dest, frame_flags);
+    const int lossless = is_lossless_requested(&cpi->oxcf);
+    cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+    cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+    vp9_first_pass(cpi);
   } else if (cpi->pass == 2 &&
       (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) {
     Pass2Encode(cpi, size, dest, frame_flags);
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 47c901975..c69a345d0 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -216,9 +216,9 @@ typedef struct VP9EncoderConfig {
   int over_shoot_pct;
 
   // buffering parameters
-  int64_t starting_buffer_level;  // in seconds
-  int64_t optimal_buffer_level;
-  int64_t maximum_buffer_size;
+  int64_t starting_buffer_level_ms;
+  int64_t optimal_buffer_level_ms;
+  int64_t maximum_buffer_size_ms;
 
   // Frame drop threshold.
   int drop_frames_water_mark;
@@ -228,7 +228,6 @@ typedef struct VP9EncoderConfig {
   int worst_allowed_q;
   int best_allowed_q;
   int cq_level;
-  int lossless;
   AQ_MODE aq_mode;  // Adaptive Quantization mode
 
   // Internal frame size scaling.
@@ -257,7 +256,6 @@ typedef struct VP9EncoderConfig {
 
   // these parameters aren't to be used in final build don't use!!!
   int play_alternate;
-  int alt_freq;
 
   int encode_breakout;  // early breakout : for video conf recommend 800
 
@@ -286,6 +284,10 @@ typedef struct VP9EncoderConfig {
   vp8e_tuning tuning;
 } VP9EncoderConfig;
 
+static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
+  return !(cfg->best_allowed_q || cfg->worst_allowed_q);  // both limits 0
+}
+
 static INLINE int is_best_mode(MODE mode) {
   return mode == ONE_PASS_BEST || mode == TWO_PASS_SECOND_BEST;
 }
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 0d4f2c72c..dc3832b16 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -398,6 +398,32 @@ static BLOCK_SIZE get_bsize(const VP9_COMMON *cm, int mb_row, int mb_col) {
   }
 }
 
+// Returns the lowest qindex for which vp9_convert_qindex_to_q() is at least
+// 30.0, or QINDEX_RANGE - 1 when no index in the range reaches that value.
+static int find_fp_qindex(void) {
+  int i;
+
+  for (i = 0; i < QINDEX_RANGE; ++i)
+    if (vp9_convert_qindex_to_q(i) >= 30.0)
+      break;
+
+  // The loop ran off the end: fall back to the last valid index.
+  return (i == QINDEX_RANGE) ? i - 1 : i;
+}
+
+// Chooses cm->frame_type for the frame about to be first-pass encoded and
+// disables periodic key frames by setting rc.frames_to_key to INT_MAX.
+static void set_first_pass_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  // A key frame is wanted for frame 0 or when FRAMEFLAGS_KEY is set, but it
+  // is only used when cpi->refresh_alt_ref_frame is not set.
+  const int key_requested = cm->current_video_frame == 0 ||
+                            (cpi->frame_flags & FRAMEFLAGS_KEY);
+  cm->frame_type = (!cpi->refresh_alt_ref_frame && key_requested)
+                       ? KEY_FRAME : INTER_FRAME;
+  cpi->rc.frames_to_key = INT_MAX;
+}
+
 void vp9_first_pass(VP9_COMP *cpi) {
   int mb_row, mb_col;
   MACROBLOCK *const x = &cpi->mb;
@@ -438,6 +464,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
 
   vp9_clear_system_state();
 
+  set_first_pass_params(cpi);
+  vp9_set_quantizer(cm, find_fp_qindex());
+
   if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
     MV_REFERENCE_FRAME ref_frame = LAST_FRAME;
     const YV12_BUFFER_CONFIG *scaled_ref_buf = NULL;
@@ -1576,7 +1605,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
     // Break out conditions.
     if (
-      // Break at cpi->max_gf_interval unless almost totally static.
+      // Break at active_max_gf_interval unless almost totally static.
       (i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) ||
       (
         // Don't break out with a very short interval.
@@ -2051,19 +2080,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   twopass->modified_error_left -= kf_group_err;
 }
 
-void vp9_rc_get_first_pass_params(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  if (!cpi->refresh_alt_ref_frame &&
-      (cm->current_video_frame == 0 ||
-       (cpi->frame_flags & FRAMEFLAGS_KEY))) {
-    cm->frame_type = KEY_FRAME;
-  } else {
-    cm->frame_type = INTER_FRAME;
-  }
-  // Do not use periodic key frames.
-  cpi->rc.frames_to_key = INT_MAX;
-}
-
 // For VBR...adjustment to the frame target based on error from previous frames
 void vbr_rate_correction(int * this_frame_target,
                          const int64_t vbr_bits_off_target) {
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 9d2b2a497..dbd19a2d6 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -524,7 +524,8 @@ static int vp9_pattern_search(const MACROBLOCK *x,
 
   // Work out the start point for the search
   bestsad = vfp->sdf(what->buf, what->stride,
-                     get_buf_from_mv(in_what, ref_mv), in_what->stride);
+                     get_buf_from_mv(in_what, ref_mv), in_what->stride) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
 
   // Search all possible scales upto the search param around the center point
   // pick the scale of the point that is best as the starting scale of
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 11633a73d..913b8ead4 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -280,8 +280,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
       int rate_mv = 0;
 
-      if (cpi->sf.disable_inter_mode_mask[bsize] &
-          (1 << INTER_OFFSET(this_mode)))
+      if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
         continue;
 
       if (rd_less_than_thresh(best_rd, rd_threshes[mode_idx[this_mode]],
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 6f646ea0e..0163fd1e8 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -159,7 +159,7 @@ static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) {
     lrc->bits_off_target += bits_off_for_this_layer;
 
     // Clip buffer level to maximum buffer size for the layer.
-    lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size);
+    lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size);
     lrc->buffer_level = lrc->bits_off_target;
   }
 }
@@ -167,7 +167,6 @@ static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) {
 // Update the buffer level: leaky bucket model.
 static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) {
   const VP9_COMMON *const cm = &cpi->common;
-  const VP9EncoderConfig *oxcf = &cpi->oxcf;
   RATE_CONTROL *const rc = &cpi->rc;
 
   // Non-viewable frames are a special case and are treated as pure overhead.
@@ -178,7 +177,7 @@ static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) {
   }
 
   // Clip the buffer level to the maximum specified buffer size.
-  rc->bits_off_target = MIN(rc->bits_off_target, oxcf->maximum_buffer_size);
+  rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size);
   rc->buffer_level = rc->bits_off_target;
 
   if (cpi->use_svc && cpi->oxcf.rc_mode == RC_MODE_CBR) {
@@ -188,23 +187,20 @@ static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) {
 
 void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
   if (pass == 0 && oxcf->rc_mode == RC_MODE_CBR) {
-    rc->avg_frame_qindex[0] = oxcf->worst_allowed_q;
-    rc->avg_frame_qindex[1] = oxcf->worst_allowed_q;
-    rc->avg_frame_qindex[2] = oxcf->worst_allowed_q;
+    rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
+    rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
   } else {
-    rc->avg_frame_qindex[0] = (oxcf->worst_allowed_q +
-                                   oxcf->best_allowed_q) / 2;
-    rc->avg_frame_qindex[1] = (oxcf->worst_allowed_q +
-                                   oxcf->best_allowed_q) / 2;
-    rc->avg_frame_qindex[2] = (oxcf->worst_allowed_q +
-                                   oxcf->best_allowed_q) / 2;
+    rc->avg_frame_qindex[KEY_FRAME] = (oxcf->worst_allowed_q +
+                                           oxcf->best_allowed_q) / 2;
+    rc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q +
+                                           oxcf->best_allowed_q) / 2;
   }
 
   rc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
   rc->last_q[INTER_FRAME] = oxcf->best_allowed_q;
 
-  rc->buffer_level =    oxcf->starting_buffer_level;
-  rc->bits_off_target = oxcf->starting_buffer_level;
+  rc->buffer_level =    rc->starting_buffer_level;
+  rc->bits_off_target = rc->starting_buffer_level;
 
   rc->rolling_target_bits      = rc->avg_frame_bandwidth;
   rc->rolling_actual_bits      = rc->avg_frame_bandwidth;
@@ -250,7 +246,7 @@ int vp9_rc_drop_frame(VP9_COMP *cpi) {
       // If buffer is below drop_mark, for now just drop every other frame
       // (starting with the next frame) until it increases back over drop_mark.
       int drop_mark = (int)(oxcf->drop_frames_water_mark *
-          oxcf->optimal_buffer_level / 100);
+          rc->optimal_buffer_level / 100);
       if ((rc->buffer_level > drop_mark) &&
           (rc->decimation_factor > 0)) {
         --rc->decimation_factor;
@@ -444,10 +440,9 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
   // ambient Q (at buffer = optimal level) to worst_quality level
   // (at buffer = critical level).
   const VP9_COMMON *const cm = &cpi->common;
-  const VP9EncoderConfig *oxcf = &cpi->oxcf;
   const RATE_CONTROL *rc = &cpi->rc;
   // Buffer level below which we push active_worst to worst_quality.
-  int64_t critical_level = oxcf->optimal_buffer_level >> 2;
+  int64_t critical_level = rc->optimal_buffer_level >> 2;
   int64_t buff_lvl_step = 0;
   int adjustment = 0;
   int active_worst_quality;
@@ -459,26 +454,26 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
   else
     active_worst_quality = MIN(rc->worst_quality,
                                rc->avg_frame_qindex[KEY_FRAME] * 3 / 2);
-  if (rc->buffer_level > oxcf->optimal_buffer_level) {
+  if (rc->buffer_level > rc->optimal_buffer_level) {
     // Adjust down.
     // Maximum limit for down adjustment, ~30%.
     int max_adjustment_down = active_worst_quality / 3;
     if (max_adjustment_down) {
-      buff_lvl_step = ((oxcf->maximum_buffer_size -
-                        oxcf->optimal_buffer_level) / max_adjustment_down);
+      buff_lvl_step = ((rc->maximum_buffer_size -
+                        rc->optimal_buffer_level) / max_adjustment_down);
       if (buff_lvl_step)
-        adjustment = (int)((rc->buffer_level - oxcf->optimal_buffer_level) /
+        adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) /
                             buff_lvl_step);
       active_worst_quality -= adjustment;
     }
   } else if (rc->buffer_level > critical_level) {
     // Adjust up from ambient Q.
     if (critical_level) {
-      buff_lvl_step = (oxcf->optimal_buffer_level - critical_level);
+      buff_lvl_step = (rc->optimal_buffer_level - critical_level);
       if (buff_lvl_step) {
         adjustment =
             (int)((rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) *
-                  (oxcf->optimal_buffer_level - rc->buffer_level) /
+                  (rc->optimal_buffer_level - rc->buffer_level) /
                   buff_lvl_step);
       }
       active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment;
@@ -1086,21 +1081,21 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
     rc->last_q[KEY_FRAME] = qindex;
     rc->avg_frame_qindex[KEY_FRAME] =
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
-  } else if (!rc->is_src_frame_alt_ref &&
-             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) &&
-             !(cpi->use_svc && oxcf->rc_mode == RC_MODE_CBR)) {
-    rc->avg_frame_qindex[2] =
-        ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[2] + qindex, 2);
   } else {
-    rc->last_q[INTER_FRAME] = qindex;
-    rc->avg_frame_qindex[INTER_FRAME] =
+    if (rc->is_src_frame_alt_ref ||
+        !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) ||
+        (cpi->use_svc && oxcf->rc_mode == RC_MODE_CBR)) {
+      rc->last_q[INTER_FRAME] = qindex;
+      rc->avg_frame_qindex[INTER_FRAME] =
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
-    rc->ni_frames++;
-    rc->tot_q += vp9_convert_qindex_to_q(qindex);
-    rc->avg_q = rc->tot_q / rc->ni_frames;
-    // Calculate the average Q for normal inter frames (not key or GFU frames).
-    rc->ni_tot_qi += qindex;
-    rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+      rc->ni_frames++;
+      rc->tot_q += vp9_convert_qindex_to_q(qindex);
+      rc->avg_q = rc->tot_q / rc->ni_frames;
+      // Calculate the average Q for normal inter frames (not key or GFU
+      // frames).
+      rc->ni_tot_qi += qindex;
+      rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+    }
   }
 
   // Keep record of last boosted (KF/KF/ARF) Q value.
@@ -1227,8 +1222,8 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
   const VP9EncoderConfig *oxcf = &cpi->oxcf;
   const RATE_CONTROL *rc = &cpi->rc;
   const SVC *const svc = &cpi->svc;
-  const int64_t diff = oxcf->optimal_buffer_level - rc->buffer_level;
-  const int64_t one_pct_bits = 1 + oxcf->optimal_buffer_level / 100;
+  const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
+  const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
   int min_frame_target = MAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
   int target = rc->avg_frame_bandwidth;
   if (svc->number_temporal_layers > 1 &&
@@ -1259,8 +1254,8 @@ static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
   const SVC *const svc = &cpi->svc;
   int target;
   if (cpi->common.current_video_frame == 0) {
-    target = ((cpi->oxcf.starting_buffer_level / 2) > INT_MAX)
-      ? INT_MAX : (int)(cpi->oxcf.starting_buffer_level / 2);
+    target = ((rc->starting_buffer_level / 2) > INT_MAX)
+      ? INT_MAX : (int)(rc->starting_buffer_level / 2);
   } else {
     int kf_boost = 32;
     double framerate = oxcf->framerate;
@@ -1388,6 +1383,24 @@ int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
   return target_index - qindex;
 }
 
+// Sets rc->static_scene_max_gf_interval and rc->max_gf_interval from the
+// encoder configuration (key_freq, play_alternate and lag_in_frames).
+void vp9_rc_set_gf_max_interval(const VP9EncoderConfig *const oxcf,
+                                RATE_CONTROL *const rc) {
+  // The extended interval for static scenes starts at half of key_freq.
+  rc->static_scene_max_gf_interval = oxcf->key_freq >> 1;
+
+  // With alt-ref playback and a non-zero lag, cap it at lag_in_frames - 1.
+  if (oxcf->play_alternate && oxcf->lag_in_frames) {
+    rc->static_scene_max_gf_interval =
+        MIN(rc->static_scene_max_gf_interval, oxcf->lag_in_frames - 1);
+  }
+
+  // The regular gf/arf interval is 16, but never above the static-scene
+  // limit computed above.
+  rc->max_gf_interval = MIN(16, rc->static_scene_max_gf_interval);
+}
+
 void vp9_rc_update_framerate(VP9_COMP *cpi) {
   const VP9_COMMON *const cm = &cpi->common;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
@@ -1412,21 +1425,5 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) {
   rc->max_frame_bandwidth = MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P),
                                     vbr_max_bits);
 
-  // Set Maximum gf/arf interval
-  rc->max_gf_interval = 16;
-
-  // Extended interval for genuinely static scenes
-  rc->static_scene_max_gf_interval = cpi->oxcf.key_freq >> 1;
-
-  // Special conditions when alt ref frame enabled in lagged compress mode
-  if (oxcf->play_alternate && oxcf->lag_in_frames) {
-    if (rc->max_gf_interval > oxcf->lag_in_frames - 1)
-      rc->max_gf_interval = oxcf->lag_in_frames - 1;
-
-    if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
-      rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
-  }
-
-  if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
-    rc->max_gf_interval = rc->static_scene_max_gf_interval;
+  vp9_rc_set_gf_max_interval(oxcf, rc);
 }
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 614078eef..f1a4a3f6d 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -61,7 +61,7 @@ typedef struct {
   int ni_av_qi;
   int ni_tot_qi;
   int ni_frames;
-  int avg_frame_qindex[3];        // 0 - KEY, 1 - INTER, 2 - ARF/GF
+  int avg_frame_qindex[FRAME_TYPES];
   double tot_q;
   double avg_q;
 
@@ -84,6 +84,10 @@ typedef struct {
 
   int worst_quality;
   int best_quality;
+
+  int64_t starting_buffer_level;
+  int64_t optimal_buffer_level;
+  int64_t maximum_buffer_size;
   // int active_best_quality;
 } RATE_CONTROL;
 
@@ -178,6 +182,9 @@ int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
 
 void vp9_rc_update_framerate(struct VP9_COMP *cpi);
 
+void vp9_rc_set_gf_max_interval(const struct VP9EncoderConfig *const oxcf,
+                                RATE_CONTROL *const rc);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index d402d7b40..f68aa2738 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1675,9 +1675,9 @@ static INLINE int mv_has_subpel(const MV *mv) {
 static int check_best_zero_mv(
     const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
     int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
-    int disable_inter_mode_mask, int this_mode,
+    int inter_mode_mask, int this_mode,
     const MV_REFERENCE_FRAME ref_frames[2]) {
-  if (!(disable_inter_mode_mask & (1 << INTER_OFFSET(ZEROMV))) &&
+  if ((inter_mode_mask & (1 << ZEROMV)) &&
       (this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
       frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
       (ref_frames[1] == NONE ||
@@ -1743,7 +1743,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
   ENTROPY_CONTEXT t_above[2], t_left[2];
   int subpelmv = 1, have_ref = 0;
   const int has_second_rf = has_second_ref(mbmi);
-  const int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize];
+  const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
 
   vp9_zero(*bsi);
 
@@ -1792,11 +1792,11 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
         mode_idx = INTER_OFFSET(this_mode);
         bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
-        if (disable_inter_mode_mask & (1 << mode_idx))
+        if (!(inter_mode_mask & (1 << this_mode)))
           continue;
 
         if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
-                                disable_inter_mode_mask,
+                                inter_mode_mask,
                                 this_mode, mbmi->ref_frame))
           continue;
 
@@ -3063,7 +3063,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
   const int intra_y_mode_mask =
       cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
-  int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize];
+  int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
   vp9_zero(best_mbmode);
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
 
@@ -3130,7 +3130,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     const int inter_non_zero_mode_mask = 0x1F7F7;
     mode_skip_mask |= inter_non_zero_mode_mask;
     mode_skip_mask &= ~(1 << THR_ZEROMV);
-    disable_inter_mode_mask = ~(1 << INTER_OFFSET(ZEROMV));
+    inter_mode_mask = (1 << ZEROMV);
   }
 
   // Disable this drop out case if the ref frame
@@ -3182,7 +3182,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       mode_index = THR_ZEROMV;
     mode_skip_mask = ~(1 << mode_index);
     mode_skip_start = MAX_MODES;
-    disable_inter_mode_mask = 0;
+    inter_mode_mask = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) |
+                      (1 << NEWMV);
   }
 
   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
@@ -3229,8 +3230,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
     this_mode = vp9_mode_order[mode_index].mode;
     ref_frame = vp9_mode_order[mode_index].ref_frame[0];
-    if (ref_frame != INTRA_FRAME &&
-        disable_inter_mode_mask & (1 << INTER_OFFSET(this_mode)))
+    if (ref_frame != INTRA_FRAME && !(inter_mode_mask & (1 << this_mode)))
       continue;
     second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
 
@@ -3279,7 +3279,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
           !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
         const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
         if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
-                                disable_inter_mode_mask, this_mode, ref_frames))
+                                inter_mode_mask, this_mode, ref_frames))
           continue;
       }
     }
@@ -3665,7 +3665,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
   int_mv seg_mvs[4][MAX_REF_FRAMES];
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
-  int ref_frame_mask = 0;
   int mode_skip_mask = 0;
 
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
@@ -3700,17 +3699,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
-  for (ref_frame = LAST_FRAME;
-       ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
-    int i;
-    for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
-      if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) {
-        ref_frame_mask |= (1 << ref_frame);
-        break;
-      }
-    }
-  }
-
   for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) {
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
@@ -3805,11 +3793,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
             (int)ref_frame) {
       continue;
-    // If the segment skip feature is enabled....
-    // then do nothing if the current mode is not allowed..
-    } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
-               ref_frame != INTRA_FRAME) {
-      continue;
     // Disable this drop out case if the ref frame
     // segment level feature is enabled for this segment. This is to
     // prevent the possibility that we end up unable to pick any mode.
@@ -4034,15 +4017,10 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     if (!disable_skip) {
-      // Test for the condition where skip block will be activated
-      // because there are no non zero coefficients and make any
-      // necessary adjustment for rate. Ignore if skip is coded at
-      // segment level as the cost wont have been added in.
-      // Is Mb level skip allowed (i.e. not coded at segment level).
-      const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
-                                                         SEG_LVL_SKIP);
+      // Skip is never coded at the segment level for sub8x8 blocks and instead
+      // always coded in the bitstream at the mode info level.
 
-      if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
+      if (ref_frame != INTRA_FRAME && !xd->lossless) {
         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
           // Add in the cost of the no skip flag.
@@ -4057,7 +4035,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
           rate_uv = 0;
           this_skip2 = 1;
         }
-      } else if (mb_skip_allowed) {
+      } else {
         // Add in the cost of the no skip flag.
         rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
       }
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 5ea09a8a7..e85d08a6d 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -99,41 +99,44 @@ static INLINE int full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
                                     int step_param, int error_per_bit,
                                     const MV *ref_mv, MV *tmp_mv,
                                     int var_max, int rd) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const SEARCH_METHODS method = sf->search_method;
+  vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
   int var = 0;
 
-  if (cpi->sf.search_method == FAST_DIAMOND) {
-    var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
-                              &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
-    if (rd && var < var_max)
-      var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
-  } else if (cpi->sf.search_method == FAST_HEX) {
-    var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
-                              &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
-    if (rd && var < var_max)
-      var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
-  } else if (cpi->sf.search_method == HEX) {
-    var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1,
-                         &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
-    if (rd && var < var_max)
-      var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
-  } else if (cpi->sf.search_method == SQUARE) {
-    var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1,
-                            &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
-    if (rd && var < var_max)
-      var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
-  } else if (cpi->sf.search_method == BIGDIA) {
-    var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
-                            &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
-    if (rd && var < var_max)
-      var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
-  } else {
-    int further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
-
-    var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
-                                 further_steps, 1, &cpi->fn_ptr[bsize],
-                                 ref_mv, tmp_mv);
+  switch (method) {
+    case FAST_DIAMOND:
+      var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
+                                fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case FAST_HEX:
+      var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
+                                fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case HEX:
+      var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1,
+                           fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case SQUARE:
+      var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1,
+                              fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case BIGDIA:
+      var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
+                              fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case NSTEP:
+      var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
+                                   (sf->max_step_search_steps - 1) - step_param,
+                                   1, fn_ptr, ref_mv, tmp_mv);
+      break;
+    default:
+      assert(!"Invalid search method.");
   }
 
+  if (method != NSTEP && rd && var < var_max)
+    var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1);
+
   return var;
 }
 
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 7c3abd5d7..b7f839747 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -14,20 +14,23 @@
 #include "vp9/encoder/vp9_speed_features.h"
 
 enum {
-  ALL_INTRA_MODES = (1 << DC_PRED) |
+  INTRA_ALL       = (1 << DC_PRED) |
                     (1 << V_PRED) | (1 << H_PRED) |
                     (1 << D45_PRED) | (1 << D135_PRED) |
                     (1 << D117_PRED) | (1 << D153_PRED) |
                     (1 << D207_PRED) | (1 << D63_PRED) |
                     (1 << TM_PRED),
-
-  INTRA_DC_ONLY   = (1 << DC_PRED),
-
-  INTRA_DC_TM     = (1 << TM_PRED) | (1 << DC_PRED),
-
+  INTRA_DC        = (1 << DC_PRED),
+  INTRA_DC_TM     = (1 << DC_PRED) | (1 << TM_PRED),
   INTRA_DC_H_V    = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+  INTRA_DC_TM_H_V = (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) |
+                    (1 << H_PRED)
+};
 
-  INTRA_DC_TM_H_V = INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED)
+enum {
+  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
+  INTER_NEAREST = (1 << NEARESTMV),
+  INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV)
 };
 
 enum {
@@ -140,8 +143,8 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
     sf->search_method = HEX;
     sf->disable_filter_search_var_thresh = 500;
     for (i = 0; i < TX_SIZES; ++i) {
-      sf->intra_y_mode_mask[i] = INTRA_DC_ONLY;
-      sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
+      sf->intra_y_mode_mask[i] = INTRA_DC;
+      sf->intra_uv_mode_mask[i] = INTRA_DC;
     }
     cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
   }
@@ -156,7 +159,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
   sf->adaptive_rd_thresh = 1;
   sf->use_fast_coef_costing = 1;
 
-  if (speed == 1) {
+  if (speed >= 1) {
     sf->use_square_partition_only = !frame_is_intra_only(cm);
     sf->less_rectangular_check = 1;
     sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
@@ -179,13 +182,9 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
   }
 
   if (speed >= 2) {
-    sf->use_square_partition_only = !frame_is_intra_only(cm);
-    sf->less_rectangular_check = 1;
-    sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
-                                                        : USE_LARGESTALL;
     if (MIN(cm->width, cm->height) >= 720)
-      sf->disable_split_mask = cm->show_frame ?
-        DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                              : DISABLE_ALL_INTER_SPLIT;
     else
       sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
 
@@ -193,28 +192,18 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
                                  FLAG_SKIP_INTRA_BESTINTER |
                                  FLAG_SKIP_COMP_BESTINTRA |
                                  FLAG_SKIP_INTRA_LOWVAR;
-    sf->use_rd_breakout = 1;
-    sf->adaptive_motion_search = 1;
     sf->adaptive_pred_interp_filter = 2;
-    sf->auto_mv_step_size = 1;
     sf->reference_masking = 1;
-
     sf->disable_filter_search_var_thresh = 50;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
     sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
     sf->lf_motion_threshold = LOW_MOITION_THRESHOLD;
     sf->adjust_partitioning_from_last_frame = 1;
     sf->last_partitioning_redo_frequency = 3;
-
-    sf->adaptive_rd_thresh = 2;
     sf->use_lp32x32fdct = 1;
     sf->mode_skip_start = 11;
-    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
     sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
   }
 
   if (speed >= 3) {
@@ -246,15 +235,15 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
     sf->subpel_force_stop = 1;
     for (i = 0; i < TX_SIZES; i++) {
       sf->intra_y_mode_mask[i] = INTRA_DC_H_V;
-      sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
+      sf->intra_uv_mode_mask[i] = INTRA_DC;
     }
-    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_ONLY;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
     sf->frame_parameter_update = 0;
     sf->search_method = FAST_HEX;
-    sf->disable_inter_mode_mask[BLOCK_32X32] = 1 << INTER_OFFSET(ZEROMV);
-    sf->disable_inter_mode_mask[BLOCK_32X64] = ~(1 << INTER_OFFSET(NEARESTMV));
-    sf->disable_inter_mode_mask[BLOCK_64X32] = ~(1 << INTER_OFFSET(NEARESTMV));
-    sf->disable_inter_mode_mask[BLOCK_64X64] = ~(1 << INTER_OFFSET(NEARESTMV));
+    sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW;
+    sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST;
+    sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST;
+    sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST;
     sf->max_intra_bsize = BLOCK_32X32;
     sf->allow_skip_recode = 1;
   }
@@ -285,7 +274,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
   if (speed >= 7) {
     int i;
     for (i = 0; i < BLOCK_SIZES; ++i)
-      sf->disable_inter_mode_mask[i] = ~(1 << INTER_OFFSET(NEARESTMV));
+      sf->inter_mode_mask[i] = INTER_NEAREST;
   }
 }
 
@@ -302,7 +291,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->subpel_search_method = SUBPEL_TREE;
   sf->subpel_iters_per_step = 2;
   sf->subpel_force_stop = 0;
-  sf->optimize_coefficients = !oxcf->lossless;
+  sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf);
   sf->reduce_first_step_size = 0;
   sf->auto_mv_step_size = 0;
   sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
@@ -330,8 +319,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->disable_split_var_thresh = 0;
   sf->disable_filter_search_var_thresh = 0;
   for (i = 0; i < TX_SIZES; i++) {
-    sf->intra_y_mode_mask[i] = ALL_INTRA_MODES;
-    sf->intra_uv_mode_mask[i] = ALL_INTRA_MODES;
+    sf->intra_y_mode_mask[i] = INTRA_ALL;
+    sf->intra_uv_mode_mask[i] = INTRA_ALL;
   }
   sf->use_rd_breakout = 0;
   sf->skip_encode_sb = 0;
@@ -343,7 +332,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->mode_skip_start = MAX_MODES;  // Mode index at which mode skip mask set
   sf->use_nonrd_pick_mode = 0;
   for (i = 0; i < BLOCK_SIZES; ++i)
-    sf->disable_inter_mode_mask[i] = 0;
+    sf->inter_mode_mask[i] = INTER_ALL;
   sf->max_intra_bsize = BLOCK_64X64;
   // This setting only takes effect when partition_search_type is set
   // to FIXED_PARTITION.
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index a54599e6a..3e7cd27d8 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -331,8 +331,8 @@ typedef struct SPEED_FEATURES {
   int use_nonrd_pick_mode;
 
   // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV
-  // modes are disabled in order from LSB to MSB for each BLOCK_SIZE.
-  int disable_inter_mode_mask[BLOCK_SIZES];
+  // modes are used in order from LSB to MSB for each BLOCK_SIZE.
+  int inter_mode_mask[BLOCK_SIZES];
 
   // This feature controls whether we do the expensive context update and
   // calculation in the rd coefficient costing loop.
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 95ea1072d..1b995757a 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -54,7 +54,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
       lrc->last_q[INTER_FRAME] = oxcf->best_allowed_q;
     }
 
-    lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level),
+    lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level_ms),
                                     lc->target_bandwidth, 1000);
     lrc->bits_off_target = lrc->buffer_level;
   }
@@ -87,14 +87,14 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
     }
     bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
     // Update buffer-related quantities.
-    lc->starting_buffer_level =
-        (int64_t)(oxcf->starting_buffer_level * bitrate_alloc);
-    lc->optimal_buffer_level =
-        (int64_t)(oxcf->optimal_buffer_level * bitrate_alloc);
-    lc->maximum_buffer_size =
-        (int64_t)(oxcf->maximum_buffer_size * bitrate_alloc);
-    lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size);
-    lrc->buffer_level = MIN(lrc->buffer_level, lc->maximum_buffer_size);
+    lrc->starting_buffer_level =
+        (int64_t)(rc->starting_buffer_level * bitrate_alloc);
+    lrc->optimal_buffer_level =
+        (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
+    lrc->maximum_buffer_size =
+        (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
+    lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size);
+    lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size);
     // Update framerate-related quantities.
     if (svc->number_temporal_layers > 1) {
       lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer];
@@ -149,20 +149,7 @@ void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) {
                                    oxcf->two_pass_vbrmin_section / 100);
   lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth *
                                    oxcf->two_pass_vbrmax_section) / 100);
-  lrc->max_gf_interval = 16;
-
-  lrc->static_scene_max_gf_interval = cpi->oxcf.key_freq >> 1;
-
-  if (oxcf->play_alternate && oxcf->lag_in_frames) {
-    if (lrc->max_gf_interval > oxcf->lag_in_frames - 1)
-      lrc->max_gf_interval = oxcf->lag_in_frames - 1;
-
-    if (lrc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
-      lrc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
-  }
-
-  if (lrc->max_gf_interval > lrc->static_scene_max_gf_interval)
-    lrc->max_gf_interval = lrc->static_scene_max_gf_interval;
+  vp9_rc_set_gf_max_interval(oxcf, lrc);
 }
 
 void vp9_restore_layer_context(VP9_COMP *const cpi) {
@@ -173,9 +160,6 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) {
   cpi->rc = lc->rc;
   cpi->twopass = lc->twopass;
   cpi->oxcf.target_bandwidth = lc->target_bandwidth;
-  cpi->oxcf.starting_buffer_level = lc->starting_buffer_level;
-  cpi->oxcf.optimal_buffer_level = lc->optimal_buffer_level;
-  cpi->oxcf.maximum_buffer_size = lc->maximum_buffer_size;
   // Reset the frames_since_key and frames_to_key counters to their values
   // before the layer restore. Keep these defined for the stream (not layer).
   if (cpi->svc.number_temporal_layers > 1) {
@@ -191,9 +175,6 @@ void vp9_save_layer_context(VP9_COMP *const cpi) {
   lc->rc = cpi->rc;
   lc->twopass = cpi->twopass;
   lc->target_bandwidth = (int)oxcf->target_bandwidth;
-  lc->starting_buffer_level = oxcf->starting_buffer_level;
-  lc->optimal_buffer_level = oxcf->optimal_buffer_level;
-  lc->maximum_buffer_size = oxcf->maximum_buffer_size;
 }
 
 void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 6881ce1e7..36e2027fd 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -22,9 +22,6 @@ extern "C" {
 typedef struct {
   RATE_CONTROL rc;
   int target_bandwidth;
-  int64_t starting_buffer_level;
-  int64_t optimal_buffer_level;
-  int64_t maximum_buffer_size;
   double framerate;
   int avg_frame_size;
   TWO_PASS twopass;
diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c
index 02bed8988..eb5ae2e41 100644
--- a/vp9/encoder/vp9_variance.c
+++ b/vp9/encoder/vp9_variance.c
@@ -156,16 +156,15 @@ unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
   return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
 }
 
-
-void vp9_get_sse_sum_16x16_c(const uint8_t *src_ptr, int source_stride,
-                             const uint8_t *ref_ptr, int ref_stride,
-                             unsigned int *sse, int *sum) {
+void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride,
+                       const uint8_t *ref_ptr, int ref_stride,
+                       unsigned int *sse, int *sum) {
   variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
 }
 
-void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride,
-                           const uint8_t *ref_ptr, int ref_stride,
-                           unsigned int *sse, int *sum) {
+void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride,
+                     const uint8_t *ref_ptr, int ref_stride,
+                     unsigned int *sse, int *sum) {
   variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
 }
 
diff --git a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
index 8723a7114..28458dcdd 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
@@ -23,6 +23,7 @@ pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
 pw_%2_m%1:  dw  %2, -%1,  %2, -%1,  %2, -%1,  %2, -%1
 %endmacro
 
+TRANSFORM_COEFFS 11585,  11585
 TRANSFORM_COEFFS 15137,   6270
 TRANSFORM_COEFFS 16069,   3196
 TRANSFORM_COEFFS  9102,  13623
@@ -83,7 +84,7 @@ SECTION .text
 %endmacro
 
 ; 1D forward 8x8 DCT transform
-%macro FDCT8_1D 0
+%macro FDCT8_1D 1
   SUM_SUB            0,  7,  9
   SUM_SUB            1,  6,  9
   SUM_SUB            2,  5,  9
@@ -92,14 +93,21 @@ SECTION .text
   SUM_SUB            0,  3,  9
   SUM_SUB            1,  2,  9
   SUM_SUB            6,  5,  9
+%if %1 == 0
   SUM_SUB            0,  1,  9
+%endif
 
   BUTTERFLY_4X       2,  3,  6270,  15137,  m8,  9,  10
 
   pmulhrsw           m6, m12
   pmulhrsw           m5, m12
+%if %1 == 0
   pmulhrsw           m0, m12
   pmulhrsw           m1, m12
+%else
+  BUTTERFLY_4X       1,  0,  11585, 11585,  m8,  9,  10
+  SWAP               0,  1
+%endif
 
   SUM_SUB            4,  5,  9
   SUM_SUB            7,  6,  9
@@ -150,10 +158,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride
   psllw              m7, 2
 
   ; column transform
-  FDCT8_1D
+  FDCT8_1D  0
   TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
 
-  FDCT8_1D
+  FDCT8_1D  1
   TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
 
   DIVIDE_ROUND_2X   0, 1, 9, 10
diff --git a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
index 673e0b3a6..21aaa9383 100644
--- a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
+++ b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
@@ -43,9 +43,9 @@ sym(vp9_temporal_filter_apply_sse2):
     mov         [rsp + rbp_backup], rbp
     ; end prolog
 
-        mov         rdx,            arg(3)
+        mov         edx,            arg(3)
         mov         [rsp + block_width], rdx
-        mov         rdx,            arg(4)
+        mov         edx,            arg(4)
         mov         [rsp + block_height], rdx
         movd        xmm6,           arg(5)
         movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index fb0fe58d3..72768e11e 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -331,8 +331,10 @@ static vpx_codec_err_t set_encoder_config(
   oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate;
   oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
 
-  oxcf->best_allowed_q  = vp9_quantizer_to_qindex(cfg->rc_min_quantizer);
-  oxcf->worst_allowed_q = vp9_quantizer_to_qindex(cfg->rc_max_quantizer);
+  oxcf->best_allowed_q =
+      extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_min_quantizer);
+  oxcf->worst_allowed_q =
+      extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_max_quantizer);
   oxcf->cq_level        = vp9_quantizer_to_qindex(extra_cfg->cq_level);
   oxcf->fixed_q = -1;
 
@@ -343,9 +345,9 @@ static vpx_codec_err_t set_encoder_config(
   oxcf->scaled_frame_width       = cfg->rc_scaled_width;
   oxcf->scaled_frame_height      = cfg->rc_scaled_height;
 
-  oxcf->maximum_buffer_size     = cfg->rc_buf_sz;
-  oxcf->starting_buffer_level   = cfg->rc_buf_initial_sz;
-  oxcf->optimal_buffer_level    = cfg->rc_buf_optimal_sz;
+  oxcf->maximum_buffer_size_ms   = cfg->rc_buf_sz;
+  oxcf->starting_buffer_level_ms = cfg->rc_buf_initial_sz;
+  oxcf->optimal_buffer_level_ms  = cfg->rc_buf_optimal_sz;
 
   oxcf->drop_frames_water_mark   = cfg->rc_dropframe_thresh;
 
@@ -376,8 +378,6 @@ static vpx_codec_err_t set_encoder_config(
   oxcf->tile_columns = extra_cfg->tile_columns;
   oxcf->tile_rows    = extra_cfg->tile_rows;
 
-  oxcf->lossless = extra_cfg->lossless;
-
   oxcf->error_resilient_mode         = cfg->g_error_resilient;
   oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;
 
@@ -1262,7 +1262,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
 
       VPX_VBR,            // rc_end_usage
 #if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION)
-      {0},                // rc_twopass_stats_in
+      {NULL, 0},          // rc_twopass_stats_in
 #endif
       256,                // rc_target_bandwidth
       0,                  // rc_min_quantizer
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 734ec4658..48110b414 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -38,7 +38,6 @@ struct vpx_codec_alg_priv {
   vpx_decrypt_cb          decrypt_cb;
   void                   *decrypt_state;
   vpx_image_t             img;
-  int                     img_avail;
   int                     invert_tile_order;
 
   // External frame buffer info to save for VP9 common.
@@ -48,10 +47,12 @@ struct vpx_codec_alg_priv {
 };
 
 static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
-                            vpx_codec_priv_enc_mr_cfg_t *data) {
+                                    vpx_codec_priv_enc_mr_cfg_t *data) {
   // This function only allocates space for the vpx_codec_alg_priv_t
   // structure. More memory may be required at the time the stream
   // information becomes known.
+  (void)data;
+
   if (!ctx->priv) {
     vpx_codec_alg_priv_t *alg_priv = vpx_memalign(32, sizeof(*alg_priv));
     if (alg_priv == NULL)
@@ -243,14 +244,11 @@ static void init_decoder(vpx_codec_alg_priv_t *ctx) {
 static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
                                   const uint8_t **data, unsigned int data_sz,
                                   void *user_priv, int64_t deadline) {
-  YV12_BUFFER_CONFIG sd = { 0 };
   vp9_ppflags_t flags = {0};
   VP9_COMMON *cm = NULL;
 
   (void)deadline;
 
-  ctx->img_avail = 0;
-
   // Determine the stream parameters. Note that we rely on peek_si to
   // validate that we have a buffer that does not wrap around the top
   // of the heap.
@@ -285,13 +283,6 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
   if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
     set_ppflags(ctx, &flags);
 
-  if (vp9_get_raw_frame(ctx->pbi, &sd, &flags))
-    return update_error_state(ctx, &cm->error);
-
-  yuvconfig2image(&ctx->img, &sd, user_priv);
-  ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
-  ctx->img_avail = 1;
-
   return VPX_CODEC_OK;
 }
 
@@ -420,15 +411,20 @@ static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
                                       vpx_codec_iter_t *iter) {
   vpx_image_t *img = NULL;
 
-  if (ctx->img_avail) {
-    // iter acts as a flip flop, so an image is only returned on the first
-    // call to get_frame.
-    if (!(*iter)) {
+  // iter acts as a flip flop, so an image is only returned on the first
+  // call to get_frame.
+  if (*iter == NULL && ctx->pbi != NULL) {
+    YV12_BUFFER_CONFIG sd;
+    vp9_ppflags_t flags = {0, 0, 0};
+
+    if (vp9_get_raw_frame(ctx->pbi, &sd, &flags) == 0) {
+      VP9_COMMON *cm = &ctx->pbi->common;
+      yuvconfig2image(&ctx->img, &sd, NULL);
+      ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
       img = &ctx->img;
       *iter = img;
     }
   }
-  ctx->img_avail = 0;
 
   return img;
 }
@@ -631,11 +627,12 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = {
     decoder_set_fb_fn,  // vpx_codec_set_fb_fn_t
   },
   { // NOLINT
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED
+    NOT_IMPLEMENTED,  // vpx_codec_enc_cfg_map_t
+    NOT_IMPLEMENTED,  // vpx_codec_encode_fn_t
+    NOT_IMPLEMENTED,  // vpx_codec_get_cx_data_fn_t
+    NOT_IMPLEMENTED,  // vpx_codec_enc_config_set_fn_t
+    NOT_IMPLEMENTED,  // vpx_codec_get_global_headers_fn_t
+    NOT_IMPLEMENTED,  // vpx_codec_get_preview_frame_fn_t
+    NOT_IMPLEMENTED   // vpx_codec_enc_mr_get_mem_loc_fn_t
   }
 };