Diffstat (limited to 'vp9')
96 files changed, 6473 insertions, 3676 deletions
diff --git a/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm b/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm index 388a7d719..72e933eee 100644 --- a/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm +++ b/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm @@ -72,7 +72,7 @@ cospi_31_64 EQU 804 ; reg1 = output[first_offset] ; reg2 = output[second_offset] ; for proper address calculation, the last offset used when manipulating - ; output, wethere reading or storing) must be passed in. use 0 for first + ; output, whether reading or storing) must be passed in. use 0 for first ; use. MACRO LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 @@ -88,7 +88,7 @@ cospi_31_64 EQU 804 ; output[first_offset] = reg1 ; output[second_offset] = reg2 ; for proper address calculation, the last offset used when manipulating - ; output, wethere reading or storing) must be passed in. use 0 for first + ; output, whether reading or storing) must be passed in. use 0 for first ; use. MACRO STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 @@ -242,7 +242,7 @@ cospi_31_64 EQU 804 ; TODO(cd): have special case to re-use constants when they are similar for ; consecutive butterflies ; TODO(cd): have special case when both constants are the same, do the - ; additions/substractions before the multiplies. + ; additions/subtractions before the multiplies. ; generate the constants ; generate scalar constants mov r8, #$first_constant & 0xFF00 @@ -260,7 +260,7 @@ cospi_31_64 EQU 804 vmull.s16 q11, $regB, d31 vmull.s16 q12, $regC, d31 ; (used) five for intermediate (q8-q12), one for constants (q15) - ; do some addition/substractions (to get back two register) + ; do some addition/subtractions (to get back two register) vsub.s32 q8, q8, q10 vsub.s32 q9, q9, q11 ; do more multiplications (ordered for maximum latency hiding) @@ -268,7 +268,7 @@ cospi_31_64 EQU 804 vmull.s16 q11, $regA, d30 vmull.s16 q15, $regB, d30 ; (used) six for intermediate (q8-q12, q15) - ; do more addition/substractions + ; do more addition/subtractions vadd.s32 q11, q12, q11 vadd.s32 q10, q10, q15 ; (used) four for intermediate (q8-q11) diff --git a/vp9/common/arm/neon/vp9_mb_lpf_neon.asm b/vp9/common/arm/neon/vp9_mb_lpf_neon.asm index 8cb913cb8..5fe2bba46 100644 --- a/vp9/common/arm/neon/vp9_mb_lpf_neon.asm +++ b/vp9/common/arm/neon/vp9_mb_lpf_neon.asm @@ -439,6 +439,9 @@ v_end tst r7, #1 bxne lr + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #2 ; Only do mbfilter branch + ; mbfilter flat && mask branch ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's ; and using vibt on the q's? 
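The TODO(cd) note in the vp9_idct32x32_add_neon.asm hunk above (do the additions/subtractions before the multiplies when both butterfly constants are the same) relies on a simple factorization. The sketch below shows that identity in C, assuming a generic two-constant idct butterfly; butterfly_same_constant is a hypothetical helper written only for illustration and does not appear in this diff.

#include <stdint.h>

/* Assumed generic butterfly: out0 = a*c1 + b*c2, out1 = a*c2 - b*c1.
 * When c1 == c2 == c, each output needs only one multiply if the
 * addition/subtraction is done first:
 *   out0 = (a + b) * c
 *   out1 = (a - b) * c
 * which is the saving the TODO proposes exploiting in the NEON macro. */
static void butterfly_same_constant(int16_t a, int16_t b, int16_t c,
                                    int32_t *out0, int32_t *out1) {
  *out0 = (int32_t)(a + b) * c;  /* replaces a*c + b*c */
  *out1 = (int32_t)(a - b) * c;  /* replaces a*c - b*c */
}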
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm index 279f678b1..dc9856fa8 100644 --- a/vp9/common/arm/neon/vp9_reconintra_neon.asm +++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm @@ -315,8 +315,8 @@ loop_h vdup.u16 q2, r2 vadd.s16 q1, q1, q3 vadd.s16 q2, q2, q3 - vqshrun.s16 d0, q1, #0 - vqshrun.s16 d1, q2, #0 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 vst1.32 {d0[0]}, [r0], r1 vst1.32 {d1[0]}, [r0], r1 @@ -327,8 +327,8 @@ loop_h vdup.u16 q2, r2 vadd.s16 q1, q1, q3 vadd.s16 q2, q2, q3 - vqshrun.s16 d0, q1, #0 - vqshrun.s16 d1, q2, #0 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 vst1.32 {d0[0]}, [r0], r1 vst1.32 {d1[0]}, [r0], r1 bx lr @@ -349,7 +349,7 @@ loop_h vdup.u8 d0, r12 ; preload 8 left - vld1.8 d30, [r3] + vld1.8 {d30}, [r3] ; Load above 8 pixels vld1.64 {d2}, [r2] @@ -372,10 +372,10 @@ loop_h vadd.s16 q8, q3, q8 vadd.s16 q9, q3, q9 - vqshrun.s16 d0, q0, #0 - vqshrun.s16 d1, q1, #0 - vqshrun.s16 d2, q8, #0 - vqshrun.s16 d3, q9, #0 + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 vst1.64 {d0}, [r0], r1 vst1.64 {d1}, [r0], r1 @@ -394,10 +394,10 @@ loop_h vadd.s16 q8, q3, q8 vadd.s16 q9, q3, q9 - vqshrun.s16 d0, q0, #0 - vqshrun.s16 d1, q1, #0 - vqshrun.s16 d2, q8, #0 - vqshrun.s16 d3, q9, #0 + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 vst1.64 {d0}, [r0], r1 vst1.64 {d1}, [r0], r1 @@ -422,10 +422,10 @@ loop_h vdup.u8 q0, r12 ; Load above 8 pixels - vld1.8 q1, [r2] + vld1.8 {q1}, [r2] ; preload 8 left into r12 - vld1.8 d18, [r3]! + vld1.8 {d18}, [r3]! ; Compute above - ytop_left vsubl.u8 q2, d2, d0 @@ -445,10 +445,10 @@ loop_16x16_neon vadd.s16 q0, q0, q3 vadd.s16 q11, q8, q2 vadd.s16 q8, q8, q3 - vqshrun.s16 d2, q1, #0 - vqshrun.s16 d3, q0, #0 - vqshrun.s16 d22, q11, #0 - vqshrun.s16 d23, q8, #0 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 vdup.16 q0, d20[2] ; proload next 2 rows data vdup.16 q8, d20[3] vst1.64 {d2,d3}, [r0], r1 @@ -459,10 +459,10 @@ loop_16x16_neon vadd.s16 q0, q0, q3 vadd.s16 q11, q8, q2 vadd.s16 q8, q8, q3 - vqshrun.s16 d2, q1, #0 - vqshrun.s16 d3, q0, #0 - vqshrun.s16 d22, q11, #0 - vqshrun.s16 d23, q8, #0 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 vdup.16 q0, d21[0] ; proload next 2 rows data vdup.16 q8, d21[1] vst1.64 {d2,d3}, [r0], r1 @@ -472,10 +472,10 @@ loop_16x16_neon vadd.s16 q0, q0, q3 vadd.s16 q11, q8, q2 vadd.s16 q8, q8, q3 - vqshrun.s16 d2, q1, #0 - vqshrun.s16 d3, q0, #0 - vqshrun.s16 d22, q11, #0 - vqshrun.s16 d23, q8, #0 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 vdup.16 q0, d21[2] ; proload next 2 rows data vdup.16 q8, d21[3] vst1.64 {d2,d3}, [r0], r1 @@ -486,13 +486,11 @@ loop_16x16_neon vadd.s16 q0, q0, q3 vadd.s16 q11, q8, q2 vadd.s16 q8, q8, q3 - vqshrun.s16 d2, q1, #0 - vqshrun.s16 d3, q0, #0 - vqshrun.s16 d22, q11, #0 - vqshrun.s16 d23, q8, #0 - vdup.16 q0, d20[2] - vdup.16 q8, d20[3] - vld1.8 d18, [r3]! ; preload 8 left into r12 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vld1.8 {d18}, [r3]! ; preload 8 left into r12 vmovl.u8 q10, d18 vst1.64 {d2,d3}, [r0], r1 vst1.64 {d22,d23}, [r0], r1 @@ -518,11 +516,11 @@ loop_16x16_neon vdup.u8 q0, r12 ; Load above 32 pixels - vld1.8 q1, [r2]! - vld1.8 q2, [r2] + vld1.8 {q1}, [r2]! + vld1.8 {q2}, [r2] ; preload 8 left pixels - vld1.8 d26, [r3]! + vld1.8 {d26}, [r3]! 
; Compute above - ytop_left vsubl.u8 q8, d2, d0 @@ -544,19 +542,19 @@ loop_32x32_neon vadd.s16 q13, q0, q9 vadd.s16 q14, q0, q10 vadd.s16 q15, q0, q11 - vqshrun.s16 d0, q12, #0 - vqshrun.s16 d1, q13, #0 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 vadd.s16 q12, q2, q8 vadd.s16 q13, q2, q9 - vqshrun.s16 d2, q14, #0 - vqshrun.s16 d3, q15, #0 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 vadd.s16 q14, q2, q10 vadd.s16 q15, q2, q11 vst1.64 {d0-d3}, [r0], r1 - vqshrun.s16 d24, q12, #0 - vqshrun.s16 d25, q13, #0 - vqshrun.s16 d26, q14, #0 - vqshrun.s16 d27, q15, #0 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 vdup.16 q1, d6[2] vdup.16 q2, d6[3] vst1.64 {d24-d27}, [r0], r1 @@ -566,19 +564,19 @@ loop_32x32_neon vadd.s16 q13, q1, q9 vadd.s16 q14, q1, q10 vadd.s16 q15, q1, q11 - vqshrun.s16 d0, q12, #0 - vqshrun.s16 d1, q13, #0 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 vadd.s16 q12, q2, q8 vadd.s16 q13, q2, q9 - vqshrun.s16 d2, q14, #0 - vqshrun.s16 d3, q15, #0 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 vadd.s16 q14, q2, q10 vadd.s16 q15, q2, q11 vst1.64 {d0-d3}, [r0], r1 - vqshrun.s16 d24, q12, #0 - vqshrun.s16 d25, q13, #0 - vqshrun.s16 d26, q14, #0 - vqshrun.s16 d27, q15, #0 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 vdup.16 q0, d7[0] vdup.16 q2, d7[1] vst1.64 {d24-d27}, [r0], r1 @@ -588,19 +586,19 @@ loop_32x32_neon vadd.s16 q13, q0, q9 vadd.s16 q14, q0, q10 vadd.s16 q15, q0, q11 - vqshrun.s16 d0, q12, #0 - vqshrun.s16 d1, q13, #0 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 vadd.s16 q12, q2, q8 vadd.s16 q13, q2, q9 - vqshrun.s16 d2, q14, #0 - vqshrun.s16 d3, q15, #0 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 vadd.s16 q14, q2, q10 vadd.s16 q15, q2, q11 vst1.64 {d0-d3}, [r0], r1 - vqshrun.s16 d24, q12, #0 - vqshrun.s16 d25, q13, #0 - vqshrun.s16 d26, q14, #0 - vqshrun.s16 d27, q15, #0 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 vdup.16 q0, d7[2] vdup.16 q2, d7[3] vst1.64 {d24-d27}, [r0], r1 @@ -610,20 +608,20 @@ loop_32x32_neon vadd.s16 q13, q0, q9 vadd.s16 q14, q0, q10 vadd.s16 q15, q0, q11 - vqshrun.s16 d0, q12, #0 - vqshrun.s16 d1, q13, #0 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 vadd.s16 q12, q2, q8 vadd.s16 q13, q2, q9 - vqshrun.s16 d2, q14, #0 - vqshrun.s16 d3, q15, #0 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 vadd.s16 q14, q2, q10 vadd.s16 q15, q2, q11 vst1.64 {d0-d3}, [r0], r1 - vqshrun.s16 d24, q12, #0 - vqshrun.s16 d25, q13, #0 - vld1.8 d0, [r3]! ; preload 8 left pixels - vqshrun.s16 d26, q14, #0 - vqshrun.s16 d27, q15, #0 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vld1.8 {d0}, [r3]! ; preload 8 left pixels + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 vmovl.u8 q3, d0 vst1.64 {d24-d27}, [r0], r1 diff --git a/vp9/common/generic/vp9_systemdependent.c b/vp9/common/generic/vp9_systemdependent.c deleted file mode 100644 index 536febb65..000000000 --- a/vp9/common/generic/vp9_systemdependent.c +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "./vpx_config.h" -#include "./vp9_rtcd.h" -#include "vp9/common/vp9_onyxc_int.h" - -void vp9_machine_specific_config(VP9_COMMON *cm) { - (void)cm; - vp9_rtcd(); -} diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h index 991d3c2b3..6ebea9f2f 100644 --- a/vp9/common/mips/dspr2/vp9_common_dspr2.h +++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h @@ -85,8 +85,8 @@ static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) { ); } -void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, diff --git a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c index 1b2f5506a..19c582fd1 100644 --- a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c @@ -19,8 +19,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { +static void idct16_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { int i; int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int step1_10, step1_11, step1_12, step1_13; @@ -404,8 +404,8 @@ static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output, } } -static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { int i; int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int step1_8, step1_9, step1_10, step1_11; @@ -905,13 +905,13 @@ void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, ); // First transform rows - idct16_1d_rows_dspr2(input, out, 16); + idct16_rows_dspr2(input, out, 16); // Then transform columns and add to dest - idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, dest_stride); } -static void iadst16_1d(const int16_t *input, int16_t *output) { +static void iadst16(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; int x0 = input[15]; @@ -1099,16 +1099,16 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical - idct16_1d_rows_dspr2(input, outptr, 16); - idct16_1d_cols_add_blk_dspr2(out, dest, pitch); + idct16_rows_dspr2(input, outptr, 16); + idct16_cols_add_blk_dspr2(out, dest, pitch); break; case ADST_DCT: // ADST in vertical, DCT in horizontal - idct16_1d_rows_dspr2(input, outptr, 16); + idct16_rows_dspr2(input, outptr, 16); outptr = out; for (i = 0; i < 16; ++i) { - iadst16_1d(outptr, temp_out); + iadst16(outptr, temp_out); for (j = 0; j < 16; ++j) dest[j * pitch + i] = @@ -1125,7 +1125,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, /* prefetch row */ vp9_prefetch_load((const uint8_t *)(input + 16)); - iadst16_1d(input, outptr); + iadst16(input, outptr); input += 16; outptr += 16; } @@ -1134,7 +1134,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, for (j = 0; j < 16; ++j) temp_in[j * 16 + i] = out[i * 16 + j]; - idct16_1d_cols_add_blk_dspr2(temp_in, dest, pitch); + idct16_cols_add_blk_dspr2(temp_in, dest, pitch); } break; case 
ADST_ADST: // ADST in both directions @@ -1145,7 +1145,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, /* prefetch row */ vp9_prefetch_load((const uint8_t *)(input + 16)); - iadst16_1d(input, outptr); + iadst16(input, outptr); input += 16; outptr += 16; } @@ -1153,7 +1153,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - iadst16_1d(temp_in, temp_out); + iadst16(temp_in, temp_out); for (j = 0; j < 16; ++j) dest[j * pitch + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) @@ -1183,7 +1183,7 @@ void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. - idct16_1d_rows_dspr2(input, outptr, 4); + idct16_rows_dspr2(input, outptr, 4); outptr += 4; for (i = 0; i < 6; ++i) { @@ -1213,7 +1213,7 @@ void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, } // Then transform columns - idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, dest_stride); } void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, diff --git a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c index 5e92db3d2..132d88ce5 100644 --- a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c @@ -18,8 +18,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c index bc6759400..74a90b02c 100644 --- a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c @@ -19,8 +19,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { +static void idct32_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; @@ -882,10 +882,10 @@ void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, ); // Rows - idct32_1d_rows_dspr2(input, outptr, 32); + idct32_rows_dspr2(input, outptr, 32); // Columns - vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride); + vp9_idct32_cols_add_blk_dspr2(out, dest, dest_stride); } void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, @@ -903,7 +903,7 @@ void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, ); // Rows - idct32_1d_rows_dspr2(input, outptr, 8); + idct32_rows_dspr2(input, outptr, 8); outptr += 8; __asm__ __volatile__ ( @@ -947,7 +947,7 @@ void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, } // Columns - vp9_idct32_1d_cols_add_blk_dspr2(out, dest, stride); + vp9_idct32_cols_add_blk_dspr2(out, dest, stride); } void 
vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, diff --git a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c index 5b7aa5e71..1990348b8 100644 --- a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c @@ -19,7 +19,7 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) { +static void vp9_idct4_rows_dspr2(const int16_t *input, int16_t *output) { int16_t step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; const int const_2_power_13 = 8192; @@ -104,7 +104,7 @@ static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) { } } -static void vp9_idct4_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, +static void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { int16_t step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; @@ -240,10 +240,10 @@ void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, ); // Rows - vp9_idct4_1d_rows_dspr2(input, outptr); + vp9_idct4_rows_dspr2(input, outptr); // Columns - vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); } void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, @@ -319,7 +319,7 @@ void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, } } -static void iadst4_1d_dspr2(const int16_t *input, int16_t *output) { +static void iadst4_dspr2(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0, x1, x2, x3; @@ -379,16 +379,16 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical - vp9_idct4_1d_rows_dspr2(input, outptr); - vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vp9_idct4_rows_dspr2(input, outptr); + vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal - vp9_idct4_1d_rows_dspr2(input, outptr); + vp9_idct4_rows_dspr2(input, outptr); outptr = out; for (i = 0; i < 4; ++i) { - iadst4_1d_dspr2(outptr, temp_out); + iadst4_dspr2(outptr, temp_out); for (j = 0; j < 4; ++j) dest[j * dest_stride + i] = @@ -400,7 +400,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, break; case DCT_ADST: // DCT in vertical, ADST in horizontal for (i = 0; i < 4; ++i) { - iadst4_1d_dspr2(input, outptr); + iadst4_dspr2(input, outptr); input += 4; outptr += 4; } @@ -410,11 +410,11 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 4 + j] = out[j * 4 + i]; } } - vp9_idct4_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + vp9_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 4; ++i) { - iadst4_1d_dspr2(input, outptr); + iadst4_dspr2(input, outptr); input += 4; outptr += 4; } @@ -422,7 +422,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - iadst4_1d_dspr2(temp_in, temp_out); + iadst4_dspr2(temp_in, temp_out); for (j = 0; j < 4; ++j) dest[j * dest_stride + i] = diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c index 93a08401d..acccaea6d 100644 --- a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c @@ 
-19,8 +19,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { +static void idct8_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; const int const_2_power_13 = 8192; int Temp0, Temp1, Temp2, Temp3, Temp4; @@ -200,8 +200,8 @@ static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output, } } -static void idct8_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int Temp0, Temp1, Temp2, Temp3; int i; @@ -462,13 +462,13 @@ void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, ); // First transform rows - idct8_1d_rows_dspr2(input, outptr, 8); + idct8_rows_dspr2(input, outptr, 8); // Then transform columns and add to dest - idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); } -static void iadst8_1d_dspr2(const int16_t *input, int16_t *output) { +static void iadst8_dspr2(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0, x1, x2, x3, x4, x5, x6, x7; @@ -563,14 +563,14 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical - idct8_1d_rows_dspr2(input, outptr, 8); - idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_rows_dspr2(input, outptr, 8); + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal - idct8_1d_rows_dspr2(input, outptr, 8); + idct8_rows_dspr2(input, outptr, 8); for (i = 0; i < 8; ++i) { - iadst8_1d_dspr2(&out[i * 8], temp_out); + iadst8_dspr2(&out[i * 8], temp_out); for (j = 0; j < 8; ++j) dest[j * dest_stride + i] = @@ -580,7 +580,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, break; case DCT_ADST: // DCT in vertical, ADST in horizontal for (i = 0; i < 8; ++i) { - iadst8_1d_dspr2(input, outptr); + iadst8_dspr2(input, outptr); input += 8; outptr += 8; } @@ -590,11 +590,11 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 8 + j] = out[j * 8 + i]; } } - idct8_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 8; ++i) { - iadst8_1d_dspr2(input, outptr); + iadst8_dspr2(input, outptr); input += 8; outptr += 8; } @@ -603,7 +603,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - iadst8_1d_dspr2(temp_in, temp_out); + iadst8_dspr2(temp_in, temp_out); for (j = 0; j < 8; ++j) dest[j * dest_stride + i] = @@ -631,7 +631,7 @@ void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest, ); // First transform rows - idct8_1d_rows_dspr2(input, outptr, 4); + idct8_rows_dspr2(input, outptr, 4); outptr += 4; @@ -659,7 +659,7 @@ void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest, // Then transform columns and add to dest - idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); } void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, diff --git 
a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index e033fbb99..ff4b7c1f9 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -33,9 +33,16 @@ void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) { void vp9_free_frame_buffers(VP9_COMMON *cm) { int i; - for (i = 0; i < FRAME_BUFFERS; i++) + for (i = 0; i < FRAME_BUFFERS; i++) { vp9_free_frame_buffer(&cm->frame_bufs[i].buf); + if (cm->frame_bufs[i].ref_count > 0 && + cm->frame_bufs[i].raw_frame_buffer.data != NULL) { + cm->release_fb_cb(cm->cb_priv, &cm->frame_bufs[i].raw_frame_buffer); + cm->frame_bufs[i].ref_count = 0; + } + } + vp9_free_frame_buffer(&cm->post_proc_buffer); vpx_free(cm->mip); @@ -85,7 +92,7 @@ int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) { int mi_size; if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, - VP9_DEC_BORDER_IN_PIXELS) < 0) + VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0) goto fail; set_mb_mi(cm, aligned_width, aligned_height); @@ -194,11 +201,12 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { } void vp9_create_common(VP9_COMMON *cm) { - vp9_machine_specific_config(cm); + vp9_rtcd(); } void vp9_remove_common(VP9_COMMON *cm) { vp9_free_frame_buffers(cm); + vp9_free_internal_frame_buffers(&cm->int_frame_buffers); } void vp9_initialize_common() { diff --git a/vp9/common/vp9_blockd.c b/vp9/common/vp9_blockd.c index 8cc657114..d918bedc6 100644 --- a/vp9/common/vp9_blockd.c +++ b/vp9/common/vp9_blockd.c @@ -98,16 +98,6 @@ void vp9_foreach_transformed_block(const MACROBLOCKD* const xd, vp9_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); } -void vp9_foreach_transformed_block_uv(const MACROBLOCKD* const xd, - BLOCK_SIZE bsize, - foreach_transformed_block_visitor visit, - void *arg) { - int plane; - - for (plane = 1; plane < MAX_MB_PLANE; plane++) - vp9_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); -} - void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, int aoff, int loff) { diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 70b8ffa4e..6086323f6 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -89,7 +89,6 @@ static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { #define INTER_OFFSET(mode) ((mode) - NEARESTMV) - /* For keyframes, intra block modes are predicted by the (already decoded) modes for the Y blocks to the left and above us; for interframes, there is a single probability table. */ @@ -129,7 +128,7 @@ typedef struct { uint8_t mode_context[MAX_REF_FRAMES]; - unsigned char skip_coeff; // 0=need to decode coeffs, 1=no coefficients + unsigned char skip; // 0=need to decode coeffs, 1=no coefficients unsigned char segment_id; // Segment id for this block. 
// Flags used for prediction status of various bit-stream signals @@ -182,7 +181,7 @@ struct macroblockd_plane { int subsampling_y; struct buf_2d dst; struct buf_2d pre[2]; - int16_t *dequant; + const int16_t *dequant; ENTROPY_CONTEXT *above_context; ENTROPY_CONTEXT *left_context; }; @@ -314,11 +313,6 @@ void vp9_foreach_transformed_block( const MACROBLOCKD* const xd, BLOCK_SIZE bsize, foreach_transformed_block_visitor visit, void *arg); - -void vp9_foreach_transformed_block_uv( - const MACROBLOCKD* const xd, BLOCK_SIZE bsize, - foreach_transformed_block_visitor visit, void *arg); - static INLINE void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int block, int *x, int *y) { diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c index 3807ccc87..d30e0b488 100644 --- a/vp9/common/vp9_convolve.c +++ b/vp9/common/vp9_convolve.c @@ -145,7 +145,7 @@ static const InterpKernel *get_filter_base(const int16_t *filter) { } static int get_filter_offset(const int16_t *f, const InterpKernel *base) { - return (const InterpKernel *)(intptr_t)f - base; + return (int)((const InterpKernel *)(intptr_t)f - base); } void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c index 355ac1a49..24c785f2a 100644 --- a/vp9/common/vp9_debugmodes.c +++ b/vp9/common/vp9_debugmodes.c @@ -58,7 +58,7 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) { print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type)); print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode)); - print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip_coeff)); + print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip)); print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0])); print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size)); print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode)); diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c index 13e954efe..bc12f9aa2 100644 --- a/vp9/common/vp9_entropy.c +++ b/vp9/common/vp9_entropy.c @@ -16,7 +16,7 @@ #include "vpx/vpx_integer.h" -DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_8x8plus[1024]) = { +const uint8_t vp9_coefband_trans_8x8plus[1024] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, // beyond MAXBAND_INDEX+1 all values are filled as 5 @@ -85,11 +85,11 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_8x8plus[1024]) = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, }; -DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_4x4[16]) = { +const uint8_t vp9_coefband_trans_4x4[16] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, }; -DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]) = { +const uint8_t vp9_pt_energy_class[ENTROPY_TOKENS] = { 0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5 }; diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index e030d92ec..aab8b5388 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -42,7 +42,7 @@ extern "C" { #define ENTROPY_NODES 11 -extern DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]); #define EOB_MODEL_TOKEN 3 extern const vp9_tree_index vp9_coefmodel_tree[]; @@ -116,10 +116,10 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { // This macro is currently unused but may be used by certain implementations #define MAXBAND_INDEX 21 
-extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_8x8plus[1024]); -extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_4x4[16]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_8x8plus[1024]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_4x4[16]); -static const uint8_t *get_band_translate(TX_SIZE tx_size) { +static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) { return tx_size == TX_4X4 ? vp9_coefband_trans_4x4 : vp9_coefband_trans_8x8plus; } @@ -146,8 +146,8 @@ typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS] void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full); -static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l) { +static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l) { ENTROPY_CONTEXT above_ec = 0, left_ec = 0; switch (tx_size) { @@ -174,8 +174,8 @@ static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, return combine_entropy_contexts(above_ec, left_ec); } -static const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, - PLANE_TYPE type, int block_idx) { +static const INLINE scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, + PLANE_TYPE type, int block_idx) { const MODE_INFO *const mi = xd->mi_8x8[0]; const MB_MODE_INFO *const mbmi = &mi->mbmi; diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index 6def3c869..892153936 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -345,7 +345,7 @@ static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) { static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, const unsigned int *counts, vp9_prob *probs) { - tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR, + vp9_tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR, probs); } @@ -465,8 +465,10 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { cm->frame_contexts[cm->frame_context_idx] = cm->fc; } - vpx_memset(cm->prev_mip, 0, - cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); + if (frame_is_intra_only(cm)) + vpx_memset(cm->prev_mip, 0, + cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); + vpx_memset(cm->mip, 0, cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c index 60ae79fdc..e1f5ef7b4 100644 --- a/vp9/common/vp9_entropymv.c +++ b/vp9/common/vp9_entropymv.c @@ -192,8 +192,8 @@ static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) { static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, const unsigned int *counts, vp9_prob *probs) { - tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR, - probs); + vp9_tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT, + MV_MAX_UPDATE_FACTOR, probs); } void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c index 546f603b6..7474a88bc 100644 --- a/vp9/common/vp9_filter.c +++ b/vp9/common/vp9_filter.c @@ -10,12 +10,9 @@ #include <assert.h> -#include "vpx_ports/mem.h" - #include "vp9/common/vp9_filter.h" -DECLARE_ALIGNED(256, const InterpKernel, - vp9_bilinear_filters[SUBPEL_SHIFTS]) = { +const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS] = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, { 0, 0, 0, 112, 16, 0, 0, 0 }, @@ -35,8 +32,7 @@ 
DECLARE_ALIGNED(256, const InterpKernel, }; // Lagrangian interpolation filter -DECLARE_ALIGNED(256, const InterpKernel, - vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = { +const InterpKernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS] = { { 0, 0, 0, 128, 0, 0, 0, 0}, { 0, 1, -5, 126, 8, -3, 1, 0}, { -1, 3, -10, 122, 18, -6, 2, 0}, @@ -56,8 +52,7 @@ DECLARE_ALIGNED(256, const InterpKernel, }; // DCT based filter -DECLARE_ALIGNED(256, const InterpKernel, - vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = { +const InterpKernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS] = { {0, 0, 0, 128, 0, 0, 0, 0}, {-1, 3, -7, 127, 8, -3, 1, 0}, {-2, 5, -13, 125, 17, -6, 3, -1}, @@ -77,8 +72,7 @@ DECLARE_ALIGNED(256, const InterpKernel, }; // freqmultiplier = 0.5 -DECLARE_ALIGNED(256, const InterpKernel, - vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = { +const InterpKernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS] = { { 0, 0, 0, 128, 0, 0, 0, 0}, {-3, -1, 32, 64, 38, 1, -3, 0}, {-2, -2, 29, 63, 41, 2, -3, 0}, diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h index 15610d781..29d3867c9 100644 --- a/vp9/common/vp9_filter.h +++ b/vp9/common/vp9_filter.h @@ -13,6 +13,8 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + #ifdef __cplusplus extern "C" { @@ -37,10 +39,14 @@ typedef int16_t InterpKernel[SUBPEL_TAPS]; const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter); -extern const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS]; -extern const InterpKernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS]; -extern const InterpKernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]; -extern const InterpKernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]; +DECLARE_ALIGNED(256, extern const InterpKernel, + vp9_bilinear_filters[SUBPEL_SHIFTS]); +DECLARE_ALIGNED(256, extern const InterpKernel, + vp9_sub_pel_filters_8[SUBPEL_SHIFTS]); +DECLARE_ALIGNED(256, extern const InterpKernel, + vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]); +DECLARE_ALIGNED(256, extern const InterpKernel, + vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]); // The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear // filter kernel as a 2 tap filter. diff --git a/vp9/common/vp9_frame_buffers.c b/vp9/common/vp9_frame_buffers.c new file mode 100644 index 000000000..dffeb8a22 --- /dev/null +++ b/vp9/common/vp9_frame_buffers.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "vp9/common/vp9_frame_buffers.h" +#include "vpx_mem/vpx_mem.h" + +int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list) { + assert(list != NULL); + vp9_free_internal_frame_buffers(list); + + list->num_internal_frame_buffers = + VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + list->int_fb = vpx_calloc(list->num_internal_frame_buffers, + sizeof(*list->int_fb)); + return (list->int_fb == NULL); +} + +void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) { + int i; + + assert(list != NULL); + + for (i = 0; i < list->num_internal_frame_buffers; ++i) { + vpx_free(list->int_fb[i].data); + list->int_fb[i].data = NULL; + } + vpx_free(list->int_fb); + list->int_fb = NULL; +} + +int vp9_get_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb) { + int i; + InternalFrameBufferList *const int_fb_list = + (InternalFrameBufferList *)cb_priv; + if (int_fb_list == NULL) + return -1; + + // Find a free frame buffer. + for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) { + if (!int_fb_list->int_fb[i].in_use) + break; + } + + if (i == int_fb_list->num_internal_frame_buffers) + return -1; + + if (int_fb_list->int_fb[i].size < min_size) { + int_fb_list->int_fb[i].data = + (uint8_t *)vpx_realloc(int_fb_list->int_fb[i].data, min_size); + if (!int_fb_list->int_fb[i].data) + return -1; + + int_fb_list->int_fb[i].size = min_size; + } + + fb->data = int_fb_list->int_fb[i].data; + fb->size = int_fb_list->int_fb[i].size; + int_fb_list->int_fb[i].in_use = 1; + + // Set the frame buffer's private data to point at the internal frame buffer. + fb->priv = &int_fb_list->int_fb[i]; + return 0; +} + +int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) { + InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv; + (void)cb_priv; + int_fb->in_use = 0; + return 0; +} diff --git a/vp9/common/vp9_frame_buffers.h b/vp9/common/vp9_frame_buffers.h new file mode 100644 index 000000000..e2cfe61b6 --- /dev/null +++ b/vp9/common/vp9_frame_buffers.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#define VP9_COMMON_VP9_FRAME_BUFFERS_H_ + +#include "vpx/vpx_frame_buffer.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct InternalFrameBuffer { + uint8_t *data; + size_t size; + int in_use; +} InternalFrameBuffer; + +typedef struct InternalFrameBufferList { + int num_internal_frame_buffers; + InternalFrameBuffer *int_fb; +} InternalFrameBufferList; + +// Initializes |list|. Returns 0 on success. +int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list); + +// Free any data allocated to the frame buffers. +void vp9_free_internal_frame_buffers(InternalFrameBufferList *list); + +// Callback used by libvpx to request an external frame buffer. |cb_priv| +// Callback private data, which points to an InternalFrameBufferList. +// |min_size| is the minimum size in bytes needed to decode the next frame. +// |fb| pointer to the frame buffer. 
+int vp9_get_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb); + +// Callback used by libvpx when there are no references to the frame buffer. +// |cb_priv| is not used. |fb| pointer to the frame buffer. +int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_FRAME_BUFFERS_H_ diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 07d7a92f6..868a66ae4 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -262,9 +262,9 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { int lvl_seg = default_filt_lvl; if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) { const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF); - lvl_seg = seg->abs_delta == SEGMENT_ABSDATA - ? data - : clamp(default_filt_lvl + data, 0, MAX_LOOP_FILTER); + lvl_seg = clamp(seg->abs_delta == SEGMENT_ABSDATA ? + data : default_filt_lvl + data, + 0, MAX_LOOP_FILTER); } if (!lf->mode_ref_delta_enabled) { @@ -496,7 +496,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n, const BLOCK_SIZE block_size = mi->mbmi.sb_type; const TX_SIZE tx_size_y = mi->mbmi.tx_size; const TX_SIZE tx_size_uv = get_uv_tx_size(&mi->mbmi); - const int skip = mi->mbmi.skip_coeff; + const int skip = mi->mbmi.skip; const int seg = mi->mbmi.segment_id; const int ref = mi->mbmi.ref_frame[0]; const int filter_level = lfi_n->lvl[seg][ref][mode_lf_lut[mi->mbmi.mode]]; @@ -577,7 +577,7 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n, LOOP_FILTER_MASK *lfm) { const BLOCK_SIZE block_size = mi->mbmi.sb_type; const TX_SIZE tx_size_y = mi->mbmi.tx_size; - const int skip = mi->mbmi.skip_coeff; + const int skip = mi->mbmi.skip; const int seg = mi->mbmi.segment_id; const int ref = mi->mbmi.ref_frame[0]; const int filter_level = lfi_n->lvl[seg][ref][mode_lf_lut[mi->mbmi.mode]]; @@ -868,7 +868,6 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16])); } -#if CONFIG_NON420 static uint8_t build_lfi(const loop_filter_info_n *lfi_n, const MB_MODE_INFO *mbmi) { const int seg = mbmi->segment_id; @@ -937,8 +936,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm, for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { const MODE_INFO *mi = mi_8x8[c]; const BLOCK_SIZE sb_type = mi[0].mbmi.sb_type; - const int skip_this = mi[0].mbmi.skip_coeff - && is_inter_block(&mi[0].mbmi); + const int skip_this = mi[0].mbmi.skip && is_inter_block(&mi[0].mbmi); // left edge of current unit is block/partition edge -> no skip const int block_edge_left = (num_4x4_blocks_wide_lookup[sb_type] > 1) ? !(c & (num_8x8_blocks_wide_lookup[sb_type] - 1)) : 1; @@ -1047,7 +1045,6 @@ static void filter_block_plane_non420(VP9_COMMON *cm, dst->buf += 8 * dst->stride; } } -#endif void vp9_filter_block_plane(VP9_COMMON *const cm, struct macroblockd_plane *const plane, @@ -1207,10 +1204,8 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, const int num_planes = y_only ? 
1 : MAX_MB_PLANE; int mi_row, mi_col; LOOP_FILTER_MASK lfm; -#if CONFIG_NON420 int use_420 = y_only || (xd->plane[1].subsampling_y == 1 && xd->plane[1].subsampling_x == 1); -#endif for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) { MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mode_info_stride; @@ -1221,22 +1216,16 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, setup_dst_planes(xd, frame_buffer, mi_row, mi_col); // TODO(JBB): Make setup_mask work for non 420. -#if CONFIG_NON420 if (use_420) -#endif vp9_setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mode_info_stride, &lfm); for (plane = 0; plane < num_planes; ++plane) { -#if CONFIG_NON420 if (use_420) -#endif vp9_filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm); -#if CONFIG_NON420 else filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col, mi_row, mi_col); -#endif } } } diff --git a/vp9/common/vp9_mv.h b/vp9/common/vp9_mv.h index 98fd1d82f..3eb7f9d61 100644 --- a/vp9/common/vp9_mv.h +++ b/vp9/common/vp9_mv.h @@ -34,8 +34,8 @@ typedef struct mv32 { int32_t col; } MV32; -static void clamp_mv(MV *mv, int min_col, int max_col, - int min_row, int max_row) { +static INLINE void clamp_mv(MV *mv, int min_col, int max_col, + int min_row, int max_row) { mv->col = clamp(mv->col, min_col, max_col); mv->row = clamp(mv->row, min_row, max_row); } diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c index ff0262210..e5f3fed45 100644 --- a/vp9/common/vp9_mvref_common.c +++ b/vp9/common/vp9_mvref_common.c @@ -186,17 +186,17 @@ static INLINE int is_inside(const TileInfo *const tile, // This function searches the neighbourhood of a given MB/SB // to try and find candidate reference vectors. -void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, - MODE_INFO *mi, const MODE_INFO *prev_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, - int block_idx, - int mi_row, int mi_col) { +static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, + MODE_INFO *mi, const MODE_INFO *prev_mi, + MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int block_idx, int mi_row, int mi_col) { const int *ref_sign_bias = cm->ref_frame_sign_bias; int i, refmv_count = 0; const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type]; - const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL; + const MB_MODE_INFO *const prev_mbmi = cm->coding_use_prev_mi && prev_mi ? 
+ &prev_mi->mbmi : NULL; int different_ref_found = 0; int context_counter = 0; @@ -290,6 +290,16 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, clamp_mv_ref(&mv_ref_list[i].as_mv, xd); } +void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, + MODE_INFO *mi, const MODE_INFO *prev_mi, + MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int mi_row, int mi_col) { + find_mv_refs_idx(cm, xd, tile, mi, prev_mi, ref_frame, mv_ref_list, -1, + mi_row, mi_col); +} + static void lower_mv_precision(MV *mv, int allow_hp) { const int use_hp = allow_hp && vp9_use_mv_hp(mv); if (!use_hp) { @@ -324,8 +334,8 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, assert(MAX_MV_REF_CANDIDATES == 2); - vp9_find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi, mi->mbmi.ref_frame[ref], - mv_list, block, mi_row, mi_col); + find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi, mi->mbmi.ref_frame[ref], + mv_list, block, mi_row, mi_col); near->as_int = 0; switch (block) { diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h index 0936abfcd..04cb000ef 100644 --- a/vp9/common/vp9_mvref_common.h +++ b/vp9/common/vp9_mvref_common.h @@ -17,29 +17,24 @@ extern "C" { #endif +#define LEFT_TOP_MARGIN ((VP9_ENC_BORDER_IN_PIXELS - VP9_INTERP_EXTEND) << 3) +#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS -\ + VP9_INTERP_EXTEND) << 3) -void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, - MODE_INFO *mi, const MODE_INFO *prev_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, - int block_idx, - int mi_row, int mi_col); - -static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, - MODE_INFO *mi, const MODE_INFO *prev_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, - int mi_row, int mi_col) { - vp9_find_mv_refs_idx(cm, xd, tile, mi, prev_mi, ref_frame, - mv_ref_list, -1, mi_row, mi_col); +// TODO(jingning): this mv clamping function should be block size dependent. +static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { + clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN, + xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); } -#define LEFT_TOP_MARGIN ((VP9_ENC_BORDER_IN_PIXELS \ - - VP9_INTERP_EXTEND) << 3) -#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS \ - - VP9_INTERP_EXTEND) << 3) +void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, + MODE_INFO *mi, const MODE_INFO *prev_mi, + MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int mi_row, int mi_col); // check a list of motion vectors by sad score using a number rows of pixels // above and a number cols of pixels in the left to select the one with best @@ -47,14 +42,6 @@ static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, int_mv *mvlist, int_mv *nearest, int_mv *near); -// TODO(jingning): this mv clamping function should be block size dependent. 
-static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { - clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN, - xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, - xd->mb_to_top_edge - LEFT_TOP_MARGIN, - xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); -} - void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, const TileInfo *const tile, int block, int ref, int mi_row, int mi_col, diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h index 564e4195f..222086886 100644 --- a/vp9/common/vp9_onyx.h +++ b/vp9/common/vp9_onyx.h @@ -112,7 +112,6 @@ extern "C" { int auto_key; // autodetect cut scenes and set the keyframes int key_freq; // maximum distance to key frame. - int allow_lag; // allow lagged compression (if 0 lagin frames is ignored) int lag_in_frames; // how many frames lag before we start encoding // ---------------------------------------------------------------- @@ -147,8 +146,14 @@ extern "C" { // END DATARATE CONTROL OPTIONS // ---------------------------------------------------------------- - // Spatial scalability - int ss_number_layers; + // Spatial and temporal scalability. + int ss_number_layers; // Number of spatial layers. + int ts_number_layers; // Number of temporal layers. + // Bitrate allocation for spatial layers. + int ss_target_bitrate[VPX_SS_MAX_LAYERS]; + // Bitrate allocation (CBR mode) and framerate factor, for temporal layers. + int ts_target_bitrate[VPX_TS_MAX_LAYERS]; + int ts_rate_decimator[VPX_TS_MAX_LAYERS]; // these parameters aren't to be used in final build don't use!!! int play_alternate; diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index d92a25b12..e6d6ea7f0 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -18,6 +18,7 @@ #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_frame_buffers.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_tile_common.h" @@ -94,6 +95,7 @@ typedef enum { typedef struct { int ref_count; + vpx_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; } RefCntBuffer; @@ -222,14 +224,27 @@ typedef struct VP9Common { int error_resilient_mode; int frame_parallel_decoding_mode; + // Flag indicates if prev_mi can be used in coding: + // 0: encoder assumes decoder does not have prev_mi + // 1: encoder assumes decoder has and uses prev_mi + unsigned int coding_use_prev_mi; + int log2_tile_cols, log2_tile_rows; + + // Private data associated with the frame buffer callbacks. + void *cb_priv; + vpx_get_frame_buffer_cb_fn_t get_fb_cb; + vpx_release_frame_buffer_cb_fn_t release_fb_cb; + + // Handles memory for the codec. 
+ InternalFrameBufferList int_frame_buffers; } VP9_COMMON; -static YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { +static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { return &cm->frame_bufs[cm->new_fb_idx].buf; } -static int get_free_fb(VP9_COMMON *cm) { +static INLINE int get_free_fb(VP9_COMMON *cm) { int i; for (i = 0; i < FRAME_BUFFERS; i++) if (cm->frame_bufs[i].ref_count == 0) @@ -240,7 +255,7 @@ static int get_free_fb(VP9_COMMON *cm) { return i; } -static void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) { +static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) { const int ref_index = *idx; if (ref_index >= 0 && bufs[ref_index].ref_count > 0) @@ -251,7 +266,7 @@ static void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) { bufs[new_idx].ref_count++; } -static int mi_cols_aligned_to_sb(int n_mis) { +static INLINE int mi_cols_aligned_to_sb(int n_mis) { return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2); } @@ -275,10 +290,10 @@ static INLINE void set_skip_context( } } -static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, - int mi_row, int bh, - int mi_col, int bw, - int mi_rows, int mi_cols) { +static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, + int mi_row, int bh, + int mi_col, int bw, + int mi_rows, int mi_cols) { xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8; xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); @@ -292,7 +307,6 @@ static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, static void set_prev_mi(VP9_COMMON *cm) { const int use_prev_in_find_mv_refs = cm->width == cm->last_width && cm->height == cm->last_height && - !cm->error_resilient_mode && !cm->intra_only && cm->last_show_frame; // Special case: set prev_mi to NULL when the previous mode info diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c index a172ba6a2..7baa9ee33 100644 --- a/vp9/common/vp9_postproc.c +++ b/vp9/common/vp9_postproc.c @@ -700,7 +700,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, char zz[4]; int dc_diff = !(mi[mb_index].mbmi.mode != I4X4_PRED && mi[mb_index].mbmi.mode != SPLITMV && - mi[mb_index].mbmi.skip_coeff); + mi[mb_index].mbmi.skip); if (cm->frame_type == KEY_FRAME) snprintf(zz, sizeof(zz) - 1, "a"); diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c index 11b6d93c1..197bcb643 100644 --- a/vp9/common/vp9_pred_common.c +++ b/vp9/common/vp9_pred_common.c @@ -218,27 +218,25 @@ int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { } else { // inter/inter const int above_has_second = has_second_ref(above_mbmi); const int left_has_second = has_second_ref(left_mbmi); + const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1]; if (above_has_second && left_has_second) { - pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME || - above_mbmi->ref_frame[1] == LAST_FRAME || - left_mbmi->ref_frame[0] == LAST_FRAME || - left_mbmi->ref_frame[1] == LAST_FRAME); + pred_context = 1 + (above0 == LAST_FRAME || above1 == LAST_FRAME || + left0 == LAST_FRAME || left1 == LAST_FRAME); } else if (above_has_second || left_has_second) { - const MV_REFERENCE_FRAME rfs = !above_has_second ? 
- above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf1 = above_has_second ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf2 = above_has_second ? - above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; if (rfs == LAST_FRAME) pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME); else pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME); } else { - pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) + - 2 * (left_mbmi->ref_frame[0] == LAST_FRAME); + pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME); } } } else if (has_above || has_left) { // one edge available @@ -291,23 +289,23 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { } else { // inter/inter const int above_has_second = has_second_ref(above_mbmi); const int left_has_second = has_second_ref(left_mbmi); + const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1]; if (above_has_second && left_has_second) { - if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] && - above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1]) - pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME || - above_mbmi->ref_frame[1] == GOLDEN_FRAME || - left_mbmi->ref_frame[0] == GOLDEN_FRAME || - left_mbmi->ref_frame[1] == GOLDEN_FRAME); + if (above0 == left0 && above1 == left1) + pred_context = 3 * (above0 == GOLDEN_FRAME || + above1 == GOLDEN_FRAME || + left0 == GOLDEN_FRAME || + left1 == GOLDEN_FRAME); else pred_context = 2; } else if (above_has_second || left_has_second) { - const MV_REFERENCE_FRAME rfs = !above_has_second ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf1 = above_has_second ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf2 = above_has_second ? - above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; if (rfs == GOLDEN_FRAME) pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); @@ -316,17 +314,15 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { else pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); } else { - if (above_mbmi->ref_frame[0] == LAST_FRAME && - left_mbmi->ref_frame[0] == LAST_FRAME) { + if (above0 == LAST_FRAME && left0 == LAST_FRAME) { pred_context = 3; - } else if (above_mbmi->ref_frame[0] == LAST_FRAME || - left_mbmi->ref_frame[0] == LAST_FRAME) { - const MB_MODE_INFO *edge_mbmi = - above_mbmi->ref_frame[0] == LAST_FRAME ? left_mbmi : above_mbmi; - pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME); + } else if (above0 == LAST_FRAME || left0 == LAST_FRAME) { + const MV_REFERENCE_FRAME edge0 = (above0 == LAST_FRAME) ? 
left0 + : above0; + pred_context = 4 * (edge0 == GOLDEN_FRAME); } else { - pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) + - 2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME); + pred_context = 2 * (above0 == GOLDEN_FRAME) + + 2 * (left0 == GOLDEN_FRAME); } } } @@ -357,10 +353,10 @@ int vp9_get_tx_size_context(const MACROBLOCKD *xd) { const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); const int has_above = above_mbmi != NULL; const int has_left = left_mbmi != NULL; - int above_ctx = (has_above && !above_mbmi->skip_coeff) ? above_mbmi->tx_size - : max_tx_size; - int left_ctx = (has_left && !left_mbmi->skip_coeff) ? left_mbmi->tx_size - : max_tx_size; + int above_ctx = (has_above && !above_mbmi->skip) ? above_mbmi->tx_size + : max_tx_size; + int left_ctx = (has_left && !left_mbmi->skip) ? left_mbmi->tx_size + : max_tx_size; if (!has_left) left_ctx = above_ctx; diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h index 0acee32f8..6c7a0d383 100644 --- a/vp9/common/vp9_pred_common.h +++ b/vp9/common/vp9_pred_common.h @@ -39,7 +39,7 @@ static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) { return above_sip + left_sip; } -static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg, +static INLINE vp9_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg, const MACROBLOCKD *xd) { return seg->pred_probs[vp9_get_pred_context_seg_id(xd)]; } @@ -47,8 +47,8 @@ static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg, static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) { const MODE_INFO *const above_mi = get_above_mi(xd); const MODE_INFO *const left_mi = get_left_mi(xd); - const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip_coeff : 0; - const int left_skip = (left_mi != NULL) ? left_mi->mbmi.skip_coeff : 0; + const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip : 0; + const int left_skip = (left_mi != NULL) ? 
left_mi->mbmi.skip : 0; return above_skip + left_skip; } @@ -98,8 +98,8 @@ static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm, int vp9_get_tx_size_context(const MACROBLOCKD *xd); -static const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, - const struct tx_probs *tx_probs) { +static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, + const struct tx_probs *tx_probs) { switch (max_tx_size) { case TX_8X8: return tx_probs->p8x8[ctx]; @@ -113,13 +113,14 @@ static const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, } } -static const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size, const MACROBLOCKD *xd, - const struct tx_probs *tx_probs) { +static INLINE const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size, + const MACROBLOCKD *xd, + const struct tx_probs *tx_probs) { return get_tx_probs(max_tx_size, vp9_get_tx_size_context(xd), tx_probs); } -static unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, - struct tx_counts *tx_counts) { +static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, + struct tx_counts *tx_counts) { switch (max_tx_size) { case TX_8X8: return tx_counts->p8x8[ctx]; diff --git a/vp9/common/vp9_prob.c b/vp9/common/vp9_prob.c index 884884e0b..a1befc63e 100644 --- a/vp9/common/vp9_prob.c +++ b/vp9/common/vp9_prob.c @@ -10,7 +10,7 @@ #include "vp9/common/vp9_prob.h" -DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = { +const uint8_t vp9_norm[256] = { 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, @@ -28,3 +28,34 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + +static unsigned int tree_merge_probs_impl(unsigned int i, + const vp9_tree_index *tree, + const vp9_prob *pre_probs, + const unsigned int *counts, + unsigned int count_sat, + unsigned int max_update, + vp9_prob *probs) { + const int l = tree[i]; + const unsigned int left_count = (l <= 0) + ? counts[-l] + : tree_merge_probs_impl(l, tree, pre_probs, counts, + count_sat, max_update, probs); + const int r = tree[i + 1]; + const unsigned int right_count = (r <= 0) + ? counts[-r] + : tree_merge_probs_impl(r, tree, pre_probs, counts, + count_sat, max_update, probs); + const unsigned int ct[2] = { left_count, right_count }; + probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct, + count_sat, max_update); + return left_count + right_count; +} + +void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, + const unsigned int *counts, unsigned int count_sat, + unsigned int max_update_factor, vp9_prob *probs) { + tree_merge_probs_impl(0, tree, pre_probs, counts, count_sat, + max_update_factor, probs); +} diff --git a/vp9/common/vp9_prob.h b/vp9/common/vp9_prob.h index cc8d8ab38..f36148035 100644 --- a/vp9/common/vp9_prob.h +++ b/vp9/common/vp9_prob.h @@ -79,37 +79,10 @@ static INLINE vp9_prob merge_probs(vp9_prob pre_prob, return weighted_prob(pre_prob, prob, factor); } -static unsigned int tree_merge_probs_impl(unsigned int i, - const vp9_tree_index *tree, - const vp9_prob *pre_probs, - const unsigned int *counts, - unsigned int count_sat, - unsigned int max_update_factor, - vp9_prob *probs) { - const int l = tree[i]; - const unsigned int left_count = (l <= 0) - ? 
counts[-l] - : tree_merge_probs_impl(l, tree, pre_probs, counts, - count_sat, max_update_factor, probs); - const int r = tree[i + 1]; - const unsigned int right_count = (r <= 0) - ? counts[-r] - : tree_merge_probs_impl(r, tree, pre_probs, counts, - count_sat, max_update_factor, probs); - const unsigned int ct[2] = { left_count, right_count }; - probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct, - count_sat, max_update_factor); - return left_count + right_count; -} +void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, + const unsigned int *counts, unsigned int count_sat, + unsigned int max_update_factor, vp9_prob *probs); -static void tree_merge_probs(const vp9_tree_index *tree, - const vp9_prob *pre_probs, - const unsigned int *counts, - unsigned int count_sat, - unsigned int max_update_factor, vp9_prob *probs) { - tree_merge_probs_impl(0, tree, pre_probs, counts, - count_sat, max_update_factor, probs); -} DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]); diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c index 6dbdb4216..def12554d 100644 --- a/vp9/common/vp9_quant_common.c +++ b/vp9/common/vp9_quant_common.c @@ -130,12 +130,13 @@ int16_t vp9_ac_quant(int qindex, int delta) { } -int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex) { +int vp9_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex) { if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q); - return seg->abs_delta == SEGMENT_ABSDATA ? - data : // Abs value - clamp(base_qindex + data, 0, MAXQ); // Delta value + const int seg_qindex = seg->abs_delta == SEGMENT_ABSDATA ? + data : base_qindex + data; + return clamp(seg_qindex, 0, MAXQ); } else { return base_qindex; } diff --git a/vp9/common/vp9_quant_common.h b/vp9/common/vp9_quant_common.h index af50e23cd..581104006 100644 --- a/vp9/common/vp9_quant_common.h +++ b/vp9/common/vp9_quant_common.h @@ -27,7 +27,8 @@ void vp9_init_quant_tables(); int16_t vp9_dc_quant(int qindex, int delta); int16_t vp9_ac_quant(int qindex, int delta); -int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex); +int vp9_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex); #ifdef __cplusplus } // extern "C" diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index db20f19d9..df603ad70 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -139,9 +139,6 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv, return clamped_mv; } -// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could -// calculate the subsampled BLOCK_SIZE, but that type isn't defined for -// sizes smaller than 16x16 yet. 
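The vp9_get_qindex() rewrite a few hunks above changes where the clamp is applied: with SEG_LVL_ALT_Q active, the segment value is first resolved (absolute value, or base_qindex plus delta) and the result is clamped to [0, MAXQ] in both cases, whereas the old code only clamped the delta path. A minimal scalar sketch of the resulting behaviour follows; MAXQ = 255 mirrors vp9_quant_common.h, and the function and parameter names here are illustrative only, not library API.

/* Sketch of the post-change vp9_get_qindex() logic: seg_qindex is clamped
 * regardless of whether the segment carries an absolute or a delta value.
 * 255 stands in for MAXQ; helper names are hypothetical. */
static int get_qindex_sketch(int seg_feature_active, int abs_delta_is_absdata,
                             int seg_data, int base_qindex) {
  if (seg_feature_active) {
    const int seg_qindex = abs_delta_is_absdata ? seg_data
                                                : base_qindex + seg_data;
    return seg_qindex < 0 ? 0 : (seg_qindex > 255 ? 255 : seg_qindex);
  }
  return base_qindex;
}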
static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, int bw, int bh, int x, int y, int w, int h, @@ -270,8 +267,8 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, : mi_mv_pred_q4(mi, ref)) : mi->mbmi.mv[ref].as_mv; MV32 scaled_mv; - int xs, ys, x0, y0, x0_16, y0_16, x1, y1, frame_width, - frame_height, subpel_x, subpel_y, buf_stride; + int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, buf_stride, + subpel_x, subpel_y; uint8_t *ref_frame, *buf_ptr; const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf; const MV mv_q4 = { @@ -321,10 +318,6 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, x0_16 += scaled_mv.col; y0_16 += scaled_mv.row; - // Get reference block bottom right coordinate. - x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1; - y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1; - // Get reference block pointer. buf_ptr = ref_frame + y0 * pre_buf->stride + x0; buf_stride = pre_buf->stride; @@ -333,6 +326,9 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, // width/height is not a multiple of 8 pixels. if (scaled_mv.col || scaled_mv.row || (frame_width & 0x7) || (frame_height & 0x7)) { + // Get reference block bottom right coordinate. + int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1; + int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1; int x_pad = 0, y_pad = 0; if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) { diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h index bf738c28b..dccd60938 100644 --- a/vp9/common/vp9_reconinter.h +++ b/vp9/common/vp9_reconinter.h @@ -39,18 +39,18 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride, enum mv_precision precision, int x, int y); -static int scaled_buffer_offset(int x_offset, int y_offset, int stride, - const struct scale_factors *sf) { +static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride, + const struct scale_factors *sf) { const int x = sf ? sf->scale_value_x(x_offset, sf) : x_offset; const int y = sf ? 
sf->scale_value_y(y_offset, sf) : y_offset; return y * stride + x; } -static void setup_pred_plane(struct buf_2d *dst, - uint8_t *src, int stride, - int mi_row, int mi_col, - const struct scale_factors *scale, - int subsampling_x, int subsampling_y) { +static INLINE void setup_pred_plane(struct buf_2d *dst, + uint8_t *src, int stride, + int mi_row, int mi_col, + const struct scale_factors *scale, + int subsampling_x, int subsampling_y) { const int x = (MI_SIZE * mi_col) >> subsampling_x; const int y = (MI_SIZE * mi_row) >> subsampling_y; dst->buf = src + scaled_buffer_offset(x, y, stride, scale); diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c index 96ba3e464..71a41a9de 100644 --- a/vp9/common/vp9_reconintra.c +++ b/vp9/common/vp9_reconintra.c @@ -382,34 +382,34 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, /* slower path if the block needs border extension */ if (x0 + 2 * bs <= frame_width) { if (right_available && bs == 4) { - vpx_memcpy(above_row - 1, above_ref - 1, 2 * bs + 1); + vpx_memcpy(above_row, above_ref, 2 * bs); } else { - vpx_memcpy(above_row - 1, above_ref - 1, bs + 1); + vpx_memcpy(above_row, above_ref, bs); vpx_memset(above_row + bs, above_row[bs - 1], bs); } } else if (x0 + bs <= frame_width) { const int r = frame_width - x0; if (right_available && bs == 4) { - vpx_memcpy(above_row - 1, above_ref - 1, r + 1); + vpx_memcpy(above_row, above_ref, r); vpx_memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width); } else { - vpx_memcpy(above_row - 1, above_ref - 1, bs + 1); + vpx_memcpy(above_row, above_ref, bs); vpx_memset(above_row + bs, above_row[bs - 1], bs); } } else if (x0 <= frame_width) { const int r = frame_width - x0; if (right_available && bs == 4) { - vpx_memcpy(above_row - 1, above_ref - 1, r + 1); + vpx_memcpy(above_row, above_ref, r); vpx_memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width); } else { - vpx_memcpy(above_row - 1, above_ref - 1, r + 1); + vpx_memcpy(above_row, above_ref, r); vpx_memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width); } - above_row[-1] = left_available ? above_ref[-1] : 129; } + above_row[-1] = left_available ? 
above_ref[-1] : 129; } else { /* faster path if the block does not need extension */ if (bs == 4 && right_available && left_available) { diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 04a40bd58..4031bda55 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -264,13 +264,13 @@ prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8 specialize vp9_convolve_avg $sse2_x86inc neon dspr2 prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8 sse2 ssse3 neon dspr2 +specialize vp9_convolve8 sse2 ssse3 avx2 neon dspr2 prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_horiz sse2 ssse3 neon dspr2 +specialize vp9_convolve8_horiz sse2 ssse3 avx2 neon dspr2 prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_vert sse2 ssse3 neon dspr2 +specialize vp9_convolve8_vert sse2 ssse3 avx2 neon dspr2 prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" specialize vp9_convolve8_avg sse2 ssse3 neon dspr2 @@ -386,7 +386,7 @@ prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_strid specialize vp9_variance4x4 mmx $sse2_x86inc prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc +specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc avx2 prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance64x64 $sse2_x86inc $ssse3_x86inc @@ -416,7 +416,7 @@ prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, specialize vp9_sub_pixel_avg_variance16x32 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc +specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc avx2 prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance32x32 $sse2_x86inc $ssse3_x86inc @@ -707,14 +707,14 @@ if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then fi # fdct functions -prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type" -specialize vp9_short_fht4x4 sse2 avx2 +prototype void vp9_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type" +specialize vp9_fht4x4 sse2 avx2 -prototype void 
vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type" -specialize vp9_short_fht8x8 sse2 avx2 +prototype void vp9_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type" +specialize vp9_fht8x8 sse2 avx2 -prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type" -specialize vp9_short_fht16x16 sse2 avx2 +prototype void vp9_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type" +specialize vp9_fht16x16 sse2 avx2 prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride" specialize vp9_fwht4x4 @@ -737,20 +737,20 @@ specialize vp9_fdct32x32_rd sse2 avx2 # # Motion search # -prototype int vp9_full_search_sad "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv, int n" +prototype int vp9_full_search_sad "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv, struct mv *best_mv" specialize vp9_full_search_sad sse3 sse4_1 vp9_full_search_sad_sse3=vp9_full_search_sadx3 vp9_full_search_sad_sse4_1=vp9_full_search_sadx8 -prototype int vp9_refining_search_sad "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" +prototype int vp9_refining_search_sad "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" specialize vp9_refining_search_sad sse3 vp9_refining_search_sad_sse3=vp9_refining_search_sadx4 -prototype int vp9_diamond_search_sad "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" +prototype int vp9_diamond_search_sad "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" specialize vp9_diamond_search_sad sse3 vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4 -prototype int vp9_full_range_search "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" +prototype int vp9_full_range_search "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" specialize vp9_full_range_search prototype void vp9_temporal_filter_apply "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count" diff --git a/vp9/common/vp9_scale.h b/vp9/common/vp9_scale.h index 90b0d0bf9..a9dda1889 100644 --- a/vp9/common/vp9_scale.h +++ b/vp9/common/vp9_scale.h @@ -40,12 +40,12 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h); -static int vp9_is_valid_scale(const struct scale_factors *sf) { +static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) { return sf->x_scale_fp != REF_INVALID_SCALE && sf->y_scale_fp != REF_INVALID_SCALE; } -static int vp9_is_scaled(const struct 
scale_factors *sf) { +static INLINE int vp9_is_scaled(const struct scale_factors *sf) { return sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE; } diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h index ee9a4823b..72edbca55 100644 --- a/vp9/common/vp9_systemdependent.h +++ b/vp9/common/vp9_systemdependent.h @@ -11,13 +11,17 @@ #ifndef VP9_COMMON_VP9_SYSTEMDEPENDENT_H_ #define VP9_COMMON_VP9_SYSTEMDEPENDENT_H_ -#ifdef __cplusplus -extern "C" { +#ifdef _MSC_VER +# if _MSC_VER > 1310 && (defined(_M_X64) || defined(_M_IX86)) +# include <intrin.h> +# define USE_MSC_INTRIN +# endif +# include <math.h> +# define snprintf _snprintf #endif -#ifdef _MSC_VER -#include <math.h> -#define snprintf _snprintf +#ifdef __cplusplus +extern "C" { #endif #include "./vpx_config.h" @@ -30,7 +34,7 @@ void vpx_reset_mmx_state(void); #if defined(_MSC_VER) && _MSC_VER < 1800 // round is not defined in MSVC before VS2013. -static int round(double x) { +static INLINE int round(double x) { if (x < 0) return (int)ceil(x - 0.5); else @@ -44,9 +48,7 @@ static int round(double x) { static INLINE int get_msb(unsigned int n) { return 31 ^ __builtin_clz(n); } -#elif defined(_MSC_VER) && _MSC_VER > 1310 && \ - (defined(_M_X64) || defined(_M_IX86)) -#include <intrin.h> +#elif defined(USE_MSC_INTRIN) #pragma intrinsic(_BitScanReverse) static INLINE int get_msb(unsigned int n) { @@ -54,6 +56,7 @@ static INLINE int get_msb(unsigned int n) { _BitScanReverse(&first_set_bit, n); return first_set_bit; } +#undef USE_MSC_INTRIN #else // Returns (int)floor(log2(n)). n must be > 0. static INLINE int get_msb(unsigned int n) { @@ -73,9 +76,6 @@ static INLINE int get_msb(unsigned int n) { } #endif -struct VP9Common; -void vp9_machine_specific_config(struct VP9Common *cm); - #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index 8ab5fb1bc..1b4904c39 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -16,15 +16,15 @@ typedef void filter8_1dfunction ( const unsigned char *src_ptr, - const unsigned int src_pitch, + const ptrdiff_t src_pitch, unsigned char *output_ptr, - unsigned int out_pitch, + ptrdiff_t out_pitch, unsigned int output_height, const short *filter ); -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt1, opt2) \ -void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ uint8_t *dst, ptrdiff_t dst_stride, \ const int16_t *filter_x, int x_step_q4, \ const int16_t *filter_y, int y_step_q4, \ @@ -32,50 +32,68 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ if (step_q4 == 16 && filter[3] != 128) { \ if (filter[0] || filter[1] || filter[2]) { \ while (w >= 16) { \ - vp9_filter_block1d16_##dir##8_##avg##opt1(src_start, src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 16; \ dst += 16; \ w -= 16; \ } \ while (w >= 8) { \ - vp9_filter_block1d8_##dir##8_##avg##opt1(src_start, src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 8; \ dst += 8; \ w -= 8; \ } \ while (w >= 4) { \ - vp9_filter_block1d4_##dir##8_##avg##opt1(src_start, 
src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 4; \ dst += 4; \ w -= 4; \ } \ } else { \ while (w >= 16) { \ - vp9_filter_block1d16_##dir##2_##avg##opt2(src, src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d16_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 16; \ dst += 16; \ w -= 16; \ } \ while (w >= 8) { \ - vp9_filter_block1d8_##dir##2_##avg##opt2(src, src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d8_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 8; \ dst += 8; \ w -= 8; \ } \ while (w >= 4) { \ - vp9_filter_block1d4_##dir##2_##avg##opt2(src, src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d4_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 4; \ dst += 4; \ w -= 4; \ @@ -121,14 +139,79 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ } \ } +#if HAVE_AVX2 +filter8_1dfunction vp9_filter_block1d16_v8_avx2; +filter8_1dfunction vp9_filter_block1d16_h8_avx2; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +#if (ARCH_X86_64) +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 +#else +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 +#endif +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 +#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 +#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 +#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 +#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 +#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 +#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 +// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); +// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t 
dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, avx2); +#endif #if HAVE_SSSE3 +#if (ARCH_X86_64) +filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 +#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 +#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 +#else filter8_1dfunction vp9_filter_block1d16_v8_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_ssse3; filter8_1dfunction vp9_filter_block1d8_h8_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#endif filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; @@ -136,18 +219,18 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d16_v2_sse2; -filter8_1dfunction vp9_filter_block1d16_h2_sse2; -filter8_1dfunction vp9_filter_block1d8_v2_sse2; -filter8_1dfunction vp9_filter_block1d8_h2_sse2; -filter8_1dfunction vp9_filter_block1d4_v2_sse2; -filter8_1dfunction vp9_filter_block1d4_h2_sse2; -filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2; -filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2; -filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -169,11 +252,11 @@ filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3, sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3, sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3, sse2); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src 
- src_stride * 3, avg_, - ssse3, sse2); + ssse3); // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -236,11 +319,10 @@ filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2, sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2, sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2, sse2); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2, - sse2); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); // void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vp9/common/x86/vp9_loopfilter_mmx.asm index a7f69307d..91055b9f9 100644 --- a/vp9/common/x86/vp9_loopfilter_mmx.asm +++ b/vp9/common/x86/vp9_loopfilter_mmx.asm @@ -527,7 +527,7 @@ sym(vp9_lpf_vertical_4_mmx): pxor mm7, [GLOBAL(t80)] ; unoffset ; mm7 = q1 - ; tranpose and write back + ; transpose and write back ; mm1 = 72 62 52 42 32 22 12 02 ; mm6 = 73 63 53 43 33 23 13 03 ; mm3 = 74 64 54 44 34 24 14 04 diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c new file mode 100644 index 000000000..efa960c66 --- /dev/null +++ b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c @@ -0,0 +1,545 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <immintrin.h> +#include "vpx_ports/mem.h" + +// filters for 16_h8 and 16_v8 +DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + unsigned int src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register +#if defined (__GNUC__) +#if ( __GNUC__ < 4 || (__GNUC__ == 4 && \ +(__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0)))) + filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg); +#elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0)) + filtersReg32 = _mm_broadcastsi128_si256(filtersReg); +#else + filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); +#endif +#else + filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); +#endif + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i-=2) { + // load the 2 strides of source + srcReg32b1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr-3))); + srcReg32b1 = 
_mm256_inserti128_si256(srcReg32b1, + _mm_loadu_si128((__m128i *) + (src_ptr+src_pixels_per_line-3)), 1); + + // filter the source buffer + srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, + _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+5))); + srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, + _mm_loadu_si128((__m128i *) + (src_ptr+src_pixels_per_line+5)), 1); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, + _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // filter the source buffer + srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, + _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, + _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); + + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, + srcRegFilt32b2_1); + + src_ptr+=src_stride; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, + _mm256_castsi256_si128(srcRegFilt32b1_1)); + + // save the next 16 bits + _mm_store_si128((__m128i*)(output_ptr+output_pitch), + _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr+=dst_stride; + } + + // if the 
number of strides is odd. + // process only 16 bytes + if (i > 0) { + __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt2Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(secondFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt4Reg)); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes + // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + // filter the source buffer + srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt2Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, + _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(secondFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt4Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(addFilterReg64)); + + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); + } +} + +void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned 
char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + unsigned int src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register +#if defined (__GNUC__) +#if ( __GNUC__ < 4 || (__GNUC__ == 4 && \ +(__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0)))) + filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg); +#elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0)) + filtersReg32 = _mm_broadcastsi128_si256(filtersReg); +#else + filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); +#endif +#else + filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); +#endif + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr))); + srcReg32b2 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch))); + srcReg32b3 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2))); + srcReg32b4 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3))); + srcReg32b5 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4))); + srcReg32b6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5))); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6))); + + // have each consecutive loads on the same 256 register + srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, + _mm256_castsi256_si128(srcReg32b2), 1); + srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, + _mm256_castsi256_si128(srcReg32b3), 1); + srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, + _mm256_castsi256_si128(srcReg32b4), 1); + srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, + _mm256_castsi256_si128(srcReg32b5), 1); + srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, + _mm256_castsi256_si128(srcReg32b6), 1); + srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, + _mm256_castsi256_si128(srcReg32b7), 1); + + // merge every two consecutive registers except the 
last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); + + // save + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + + // save + srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + + + for (i = output_height; i > 1; i-=2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8); + + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); + + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_min_epi16(srcReg32b6, srcReg32b13)); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_max_epi16(srcReg32b6, srcReg32b13)); + + + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + + src_ptr+=src_stride; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, + _mm256_castsi256_si128(srcReg32b1)); + + // save the next 16 bits + _mm_store_si128((__m128i*)(output_ptr+out_pitch), + _mm256_extractf128_si256(srcReg32b1, 1)); + + output_ptr+=dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b1 = srcReg32b3; + srcReg32b11 = srcReg32b2; + srcReg32b3 = srcReg32b5; + srcReg32b2 = srcReg32b4; + srcReg32b5 = srcReg32b7; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, 
srcRegFilt5; + __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); + + // merge the last 2 results together + srcRegFilt4 = _mm_unpacklo_epi8( + _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + srcRegFilt7 = _mm_unpackhi_epi8( + _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, + _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); + + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + _mm256_castsi256_si128(thirdFilters)); + srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_min_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm_min_epi16(srcRegFilt5, srcRegFilt7)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_max_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm_max_epi16(srcRegFilt5, srcRegFilt7)); + + + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm256_castsi256_si128(addFilterReg64)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); + } +} diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c new file mode 100644 index 000000000..cf28d8d2b --- /dev/null +++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c @@ -0,0 +1,490 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <tmmintrin.h> +#include "vpx_ports/mem.h" +#include "vpx_ports/emmintrin_compat.h" + +// filters only for the 4_h8 convolution +DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +// filters for 8_h8 and 16_h8 +DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, srcReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 =_mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter into the first lane + firstFilters = _mm_shufflelo_epi16(filtersReg, 0); + // duplicate only the third 16 bit in the filter into the first lane + secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); + // duplicate only the seconds 16 bits in the filter into the second lane + firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); + // duplicate only the forth 16 bits in the filter into the second lane + secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); + + // loading the local filters + thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8); + forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters); + srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // extract the higher half of the lane + srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); + srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); + + minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); + + // add and saturate all the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + src_ptr+=src_pixels_per_line; + + // save only 4 bytes + 
*((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); + + output_ptr+=output_pitch; + } +} + +void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; + __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 128 bit register + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 128 bit register + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + filt1Reg = _mm_load_si128((__m128i const *)filt1_global); + filt2Reg = _mm_load_si128((__m128i const *)filt2_global); + filt3Reg = _mm_load_si128((__m128i const *)filt3_global); + filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); + srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); + + // add and saturate all the results together + minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + + srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr+=src_pixels_per_line; + + // save only 8 bytes + _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + + output_ptr+=output_pitch; + } +} + +void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; + __m128i 
filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 128 bit register + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 128 bit register + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + filt1Reg = _mm_load_si128((__m128i const *)filt1_global); + filt2Reg = _mm_load_si128((__m128i const *)filt2_global); + filt3Reg = _mm_load_si128((__m128i const *)filt3_global); + filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + + for (i = 0; i < output_height; i++) { + srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes. 
+ // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + // filter the source buffer + srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + src_ptr+=src_pixels_per_line; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); + + output_ptr+=output_pitch; + } +} + +void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits in the filter + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits in the filter + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits in the filter + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + for (i = 0; i < output_height; i++) { + // load the first 8 bytes + srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); + // load the next 8 bytes in stride of src_pitch + srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]); + srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]); + srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]); + + // merge the result together + srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); + srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); + + // load the next 8 bytes in stride of src_pitch + srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]); + srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]); + srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]); + srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]); + + // merge the result together + srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4); + srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); + + // add and saturate the results together + minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); + srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr+=src_pitch; + + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + + output_ptr+=out_pitch; + } +} + +void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits in the filter + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits in the filter + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits in the filter + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + for (i = 0; i < output_height; i++) { + // load the first 16 bytes + srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr)); + // load the next 16 bytes in stride of src_pitch + srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)); + srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)); + srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); + + // merge the result together + srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); + srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); + srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2); + srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); + srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + + // add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); + + // load the next 16 bytes in stride of two/three src_pitch + srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)); + srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)); + + // merge the result together + srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); + srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters); + srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters); + + // load the next 16 bytes in stride of four/five src_pitch + srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)); + srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)); + + // merge the result together + srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); + srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters); + srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters); + + // add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, + _mm_min_epi16(srcRegFilt4, srcRegFilt7)); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_min_epi16(srcRegFilt6, srcRegFilt8)); + + // add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, + _mm_max_epi16(srcRegFilt4, srcRegFilt7)); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_max_epi16(srcRegFilt6, srcRegFilt8)); + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // 
convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1); + + src_ptr+=src_pitch; + + // save 16 bytes convolve result + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); + + output_ptr+=out_pitch; + } +} diff --git a/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm b/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm new file mode 100644 index 000000000..b5e18fe6d --- /dev/null +++ b/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm @@ -0,0 +1,422 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + psrldq xmm3, 6 + packsswb xmm3, xmm3 + pshuflw xmm3, xmm3, 0b ;k3_k4 + + movq xmm2, rcx ;rounding + pshufd xmm2, xmm2, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm3 + + paddsw xmm0, xmm2 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + psrldq xmm7, 6 + packsswb xmm7, xmm7 + pshuflw xmm7, xmm7, 0b ;k3_k4 + punpcklwd xmm7, xmm7 + + movq xmm6, rcx ;rounding + pshufd xmm6, xmm6, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm7 + + paddsw xmm0, xmm6 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte + +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, xmm7 + pmaddubsw xmm2, xmm7 + + paddsw xmm0, xmm6 ;rounding + paddsw xmm2, xmm6 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte + +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE +sym(vp9_filter_block1d4_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE +sym(vp9_filter_block1d8_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + 
jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE +sym(vp9_filter_block1d16_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE +sym(vp9_filter_block1d4_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE +sym(vp9_filter_block1d8_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE +sym(vp9_filter_block1d16_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; 
begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 93ef7503f..e52b3f759 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -15,6 +15,7 @@ #include "./vpx_scale_rtcd.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem_ops.h" #include "vpx_scale/vpx_scale.h" #include "vp9/common/vp9_alloccommon.h" @@ -39,20 +40,16 @@ #include "vp9/decoder/vp9_reader.h" #include "vp9/decoder/vp9_thread.h" -static int read_be32(const uint8_t *p) { - return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; -} - static int is_compound_reference_allowed(const VP9_COMMON *cm) { int i; for (i = 1; i < REFS_PER_FRAME; ++i) - if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) + if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) return 1; return 0; } -static void setup_compound_reference(VP9_COMMON *cm) { +static void setup_compound_reference_mode(VP9_COMMON *cm) { if (cm->ref_frame_sign_bias[LAST_FRAME] == cm->ref_frame_sign_bias[GOLDEN_FRAME]) { cm->comp_fixed_ref = ALTREF_FRAME; @@ -116,33 +113,34 @@ static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) { vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]); } -static REFERENCE_MODE read_reference_mode(VP9_COMMON *cm, vp9_reader *r) { +static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm, + vp9_reader *r) { if (is_compound_reference_allowed(cm)) { - REFERENCE_MODE mode = vp9_read_bit(r); - if (mode) - mode += vp9_read_bit(r); - setup_compound_reference(cm); - return mode; + return vp9_read_bit(r) ? (vp9_read_bit(r) ? 
REFERENCE_MODE_SELECT + : COMPOUND_REFERENCE) + : SINGLE_REFERENCE; } else { return SINGLE_REFERENCE; } } -static void read_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) { +static void read_frame_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) { + FRAME_CONTEXT *const fc = &cm->fc; int i; + if (cm->reference_mode == REFERENCE_MODE_SELECT) - for (i = 0; i < COMP_INTER_CONTEXTS; i++) - vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]); + for (i = 0; i < COMP_INTER_CONTEXTS; ++i) + vp9_diff_update_prob(r, &fc->comp_inter_prob[i]); if (cm->reference_mode != COMPOUND_REFERENCE) - for (i = 0; i < REF_CONTEXTS; i++) { - vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]); - vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]); + for (i = 0; i < REF_CONTEXTS; ++i) { + vp9_diff_update_prob(r, &fc->single_ref_prob[i][0]); + vp9_diff_update_prob(r, &fc->single_ref_prob[i][1]); } if (cm->reference_mode != SINGLE_REFERENCE) - for (i = 0; i < REF_CONTEXTS; i++) - vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]); + for (i = 0; i < REF_CONTEXTS; ++i) + vp9_diff_update_prob(r, &fc->comp_ref_prob[i]); } static void update_mv_probs(vp9_prob *p, int n, vp9_reader *r) { @@ -303,7 +301,7 @@ static void predict_and_reconstruct_intra_block(int plane, int block, dst, pd->dst.stride, dst, pd->dst.stride, x, y, plane); - if (!mi->mbmi.skip_coeff) { + if (!mi->mbmi.skip) { const int eob = vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, x, y, tx_size, args->r); @@ -350,9 +348,9 @@ static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, xd->mi_8x8 = cm->mi_grid_visible + offset; xd->prev_mi_8x8 = cm->prev_mi_grid_visible + offset; - // Special case: if prev_mi is NULL, the previous mode info context - // cannot be used. - xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL; + + xd->last_mi = cm->coding_use_prev_mi && cm->prev_mi ? 
+ xd->prev_mi_8x8[0] : NULL; xd->mi_8x8[0] = xd->mi_stream + offset - tile_offset; xd->mi_8x8[0]->mbmi.sb_type = bsize; @@ -397,7 +395,7 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd, // Has to be called after set_offsets mbmi = &xd->mi_8x8[0]->mbmi; - if (mbmi->skip_coeff) { + if (mbmi->skip) { reset_skip_context(xd, bsize); } else { if (cm->seg.enabled) @@ -421,12 +419,12 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); // Reconstruction - if (!mbmi->skip_coeff) { + if (!mbmi->skip) { int eobtotal = 0; struct inter_args arg = { cm, xd, r, &eobtotal }; vp9_foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg); if (!less8x8 && eobtotal == 0) - mbmi->skip_coeff = 1; // skip loopfilter + mbmi->skip = 1; // skip loopfilter } } @@ -691,9 +689,14 @@ static void apply_frame_size(VP9D_COMP *pbi, int width, int height) { vp9_update_frame_size(cm); } - vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - VP9_DEC_BORDER_IN_PIXELS); + if (vp9_realloc_frame_buffer( + get_frame_new_buffer(cm), cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS, + &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb, + cm->cb_priv)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } } static void setup_frame_size(VP9D_COMP *pbi, @@ -831,7 +834,7 @@ static size_t get_tile(const uint8_t *const data_end, vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile length"); - size = read_be32(*data); + size = mem_get_be32(*data); *data += 4; if (size > (size_t)(data_end - *data)) @@ -1114,7 +1117,13 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, cm->show_existing_frame = vp9_rb_read_bit(rb); if (cm->show_existing_frame) { // Show an existing frame directly. 
- int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)]; + const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)]; + + if (cm->frame_bufs[frame_to_show].ref_count < 1) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Buffer %d does not contain a decoded frame", + frame_to_show); + ref_cnt_fb(cm->frame_bufs, &cm->new_fb_idx, frame_to_show); pbi->refresh_frame_flags = 0; cm->lf.filter_level = 0; @@ -1198,9 +1207,11 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, } if (!cm->error_resilient_mode) { + cm->coding_use_prev_mi = 1; cm->refresh_frame_context = vp9_rb_read_bit(rb); cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb); } else { + cm->coding_use_prev_mi = 0; cm->refresh_frame_context = 0; cm->frame_parallel_decoding_mode = 1; } @@ -1258,8 +1269,10 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data, for (i = 0; i < INTRA_INTER_CONTEXTS; i++) vp9_diff_update_prob(&r, &fc->intra_inter_prob[i]); - cm->reference_mode = read_reference_mode(cm, &r); - read_reference_mode_probs(cm, &r); + cm->reference_mode = read_frame_reference_mode(cm, &r); + if (cm->reference_mode != SINGLE_REFERENCE) + setup_compound_reference_mode(cm); + read_frame_reference_mode_probs(cm, &r); for (j = 0; j < BLOCK_SIZE_GROUPS; j++) for (i = 0; i < INTRA_MODES - 1; ++i) @@ -1368,7 +1381,10 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { alloc_tile_storage(pbi, tile_rows, tile_cols); xd->mode_info_stride = cm->mode_info_stride; - set_prev_mi(cm); + if (cm->coding_use_prev_mi) + set_prev_mi(cm); + else + cm->prev_mi = NULL; setup_plane_dequants(cm, xd, cm->base_qindex); vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y); diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index c7fb71ddf..0fb7a1580 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -146,8 +146,8 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd, return segment_id; } -static int read_skip_coeff(VP9_COMMON *cm, const MACROBLOCKD *xd, - int segment_id, vp9_reader *r) { +static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, vp9_reader *r) { if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { @@ -169,7 +169,7 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm, const BLOCK_SIZE bsize = mbmi->sb_type; mbmi->segment_id = read_intra_segment_id(cm, xd, mi_row, mi_col, r); - mbmi->skip_coeff = read_skip_coeff(cm, xd, mbmi->segment_id, r); + mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r); mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, bsize, 1, r); mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE; @@ -257,13 +257,18 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref, mv->col = ref->col + diff.col; } -static REFERENCE_MODE read_reference_mode(VP9_COMMON *cm, const MACROBLOCKD *xd, - vp9_reader *r) { - const int ctx = vp9_get_reference_mode_context(cm, xd); - const int mode = vp9_read(r, cm->fc.comp_inter_prob[ctx]); - if (!cm->frame_parallel_decoding_mode) - ++cm->counts.comp_inter[ctx][mode]; - return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE +static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm, + const MACROBLOCKD *xd, + vp9_reader *r) { + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + const int ctx = vp9_get_reference_mode_context(cm, xd); + const int mode = vp9_read(r, cm->fc.comp_inter_prob[ctx]); + if (!cm->frame_parallel_decoding_mode) + 
++cm->counts.comp_inter[ctx][mode]; + return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE + } else { + return cm->reference_mode; + } } // Read the referncence frame @@ -277,10 +282,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, ref_frame[0] = vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); ref_frame[1] = NONE; } else { - const REFERENCE_MODE mode = (cm->reference_mode == REFERENCE_MODE_SELECT) - ? read_reference_mode(cm, xd, r) - : cm->reference_mode; - + const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r); // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding if (mode == COMPOUND_REFERENCE) { const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; @@ -356,6 +358,11 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi, mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode); } +static INLINE int is_mv_valid(const MV *mv) { + return mv->row > MV_LOW && mv->row < MV_UPP && + mv->col > MV_LOW && mv->col < MV_UPP; +} + static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode, int_mv mv[2], int_mv ref_mv[2], int_mv nearest_mv[2], int_mv near_mv[2], @@ -367,14 +374,10 @@ static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode, case NEWMV: { nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ? NULL : &cm->counts.mv; - read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, - &cm->fc.nmvc, mv_counts, allow_hp); - if (is_compound) - read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, - &cm->fc.nmvc, mv_counts, allow_hp); for (i = 0; i < 1 + is_compound; ++i) { - ret = ret && mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW; - ret = ret && mv[i].as_mv.col < MV_UPP && mv[i].as_mv.col > MV_LOW; + read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc.nmvc, mv_counts, + allow_hp); + ret = ret && is_mv_valid(&mv[i].as_mv); } break; } @@ -520,10 +523,10 @@ static void read_inter_frame_mode_info(VP9_COMMON *const cm, mbmi->mv[0].as_int = 0; mbmi->mv[1].as_int = 0; mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r); - mbmi->skip_coeff = read_skip_coeff(cm, xd, mbmi->segment_id, r); + mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r); inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r); mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, mbmi->sb_type, - !mbmi->skip_coeff || !inter_block, r); + !mbmi->skip || !inter_block, r); if (inter_block) read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r); diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c index 128b9f8af..542732aa0 100644 --- a/vp9/decoder/vp9_dthread.c +++ b/vp9/decoder/vp9_dthread.c @@ -220,11 +220,13 @@ void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows, CHECK_MEM_ERROR(cm, lf_sync->mutex_, vpx_malloc(sizeof(*lf_sync->mutex_) * rows)); + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->mutex_[i], NULL); + } + CHECK_MEM_ERROR(cm, lf_sync->cond_, vpx_malloc(sizeof(*lf_sync->cond_) * rows)); - for (i = 0; i < rows; ++i) { - pthread_mutex_init(&lf_sync->mutex_[i], NULL); pthread_cond_init(&lf_sync->cond_[i], NULL); } #endif // CONFIG_MULTITHREAD @@ -242,18 +244,29 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) { if (lf_sync != NULL) { int i; - for (i = 0; i < rows; ++i) { - pthread_mutex_destroy(&lf_sync->mutex_[i]); - pthread_cond_destroy(&lf_sync->cond_[i]); + if (lf_sync->mutex_ != NULL) { + for (i = 0; i < rows; ++i) { + pthread_mutex_destroy(&lf_sync->mutex_[i]); + } + vpx_free(lf_sync->mutex_); + } + if 
(lf_sync->cond_ != NULL) { + for (i = 0; i < rows; ++i) { + pthread_cond_destroy(&lf_sync->cond_[i]); + } + vpx_free(lf_sync->cond_); } - vpx_free(lf_sync->mutex_); - vpx_free(lf_sync->cond_); vpx_free(lf_sync->cur_sb_col); + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. + vpx_memset(lf_sync, 0, sizeof(*lf_sync)); } #else (void)rows; - if (lf_sync != NULL) + if (lf_sync != NULL) { vpx_free(lf_sync->cur_sb_col); + vpx_memset(lf_sync, 0, sizeof(*lf_sync)); + } #endif // CONFIG_MULTITHREAD } diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c index 803d536ba..1d3522e13 100644 --- a/vp9/decoder/vp9_onyxd_if.c +++ b/vp9/decoder/vp9_onyxd_if.c @@ -290,9 +290,14 @@ static void swap_frame_buffers(VP9D_COMP *pbi) { VP9_COMMON *const cm = &pbi->common; for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { - if (mask & 1) + if (mask & 1) { + const int old_idx = cm->ref_frame_map[ref_index]; ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[ref_index], cm->new_fb_idx); + if (old_idx >= 0 && cm->frame_bufs[old_idx].ref_count == 0) + cm->release_fb_cb(cm->cb_priv, + &cm->frame_bufs[old_idx].raw_frame_buffer); + } ++ref_index; } @@ -337,6 +342,10 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, cm->frame_refs[0].buf->corrupted = 1; } + // Check if the previous frame was a frame without any references to it. + if (cm->new_fb_idx >= 0 && cm->frame_bufs[cm->new_fb_idx].ref_count == 0) + cm->release_fb_cb(cm->cb_priv, + &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer); cm->new_fb_idx = get_free_fb(cm); if (setjmp(cm->error.jmp)) { diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index dc64a107c..34d1da7bd 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -14,6 +14,7 @@ #include "vpx/vpx_encoder.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem_ops.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_entropymv.h" @@ -33,13 +34,7 @@ #include "vp9/encoder/vp9_tokenize.h" #include "vp9/encoder/vp9_write_bit_buffer.h" - -#if defined(SECTIONBITS_OUTPUT) -unsigned __int64 Sectionbits[500]; -#endif - #ifdef ENTROPY_STATS -vp9_coeff_stats tree_update_hist[TX_SIZES][PLANE_TYPES]; extern unsigned int active_section; #endif @@ -67,13 +62,6 @@ static void write_inter_mode(vp9_writer *w, MB_PREDICTION_MODE mode, &inter_mode_encodings[INTER_OFFSET(mode)]); } -static INLINE void write_be32(uint8_t *p, int value) { - p[0] = value >> 24; - p[1] = value >> 16; - p[2] = value >> 8; - p[3] = value; -} - void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb, int data, int max) { vp9_wb_write_literal(wb, data, get_unsigned_bits(max)); @@ -109,13 +97,13 @@ static void write_selected_tx_size(const VP9_COMP *cpi, MODE_INFO *m, } } -static int write_skip_coeff(const VP9_COMP *cpi, int segment_id, MODE_INFO *m, - vp9_writer *w) { +static int write_skip(const VP9_COMP *cpi, int segment_id, MODE_INFO *m, + vp9_writer *w) { const MACROBLOCKD *const xd = &cpi->mb.e_mbd; if (vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { - const int skip = m->mbmi.skip_coeff; + const int skip = m->mbmi.skip; vp9_write(w, skip, vp9_get_skip_prob(&cpi->common, xd)); return skip; } @@ -252,15 +240,15 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { const nmv_context *nmvc = &cm->fc.nmvc; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - struct segmentation *seg = &cm->seg; + 
const struct segmentation *const seg = &cm->seg; MB_MODE_INFO *const mi = &m->mbmi; - const MV_REFERENCE_FRAME rf = mi->ref_frame[0]; - const MV_REFERENCE_FRAME sec_rf = mi->ref_frame[1]; + const MV_REFERENCE_FRAME ref0 = mi->ref_frame[0]; + const MV_REFERENCE_FRAME ref1 = mi->ref_frame[1]; const MB_PREDICTION_MODE mode = mi->mode; const int segment_id = mi->segment_id; - int skip_coeff; const BLOCK_SIZE bsize = mi->sb_type; const int allow_hp = cm->allow_high_precision_mv; + int skip; #ifdef ENTROPY_STATS active_section = 9; @@ -278,18 +266,18 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { } } - skip_coeff = write_skip_coeff(cpi, segment_id, m, bc); + skip = write_skip(cpi, segment_id, m, bc); if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) - vp9_write(bc, rf != INTRA_FRAME, vp9_get_intra_inter_prob(cm, xd)); + vp9_write(bc, ref0 != INTRA_FRAME, vp9_get_intra_inter_prob(cm, xd)); if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT && - !(rf != INTRA_FRAME && - (skip_coeff || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) { + !(ref0 != INTRA_FRAME && + (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) { write_selected_tx_size(cpi, m, mi->tx_size, bsize, bc); } - if (rf == INTRA_FRAME) { + if (ref0 == INTRA_FRAME) { #ifdef ENTROPY_STATS active_section = 6; #endif @@ -311,7 +299,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { } else { vp9_prob *mv_ref_p; encode_ref_frame(cpi, bc); - mv_ref_p = cpi->common.fc.inter_mode_probs[mi->mode_context[rf]]; + mv_ref_p = cm->fc.inter_mode_probs[mi->mode_context[ref0]]; #ifdef ENTROPY_STATS active_section = 3; @@ -321,7 +309,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { if (bsize >= BLOCK_8X8) { write_inter_mode(bc, mode, mv_ref_p); - ++cm->counts.inter_mode[mi->mode_context[rf]][INTER_OFFSET(mode)]; + ++cm->counts.inter_mode[mi->mode_context[ref0]][INTER_OFFSET(mode)]; } } @@ -341,21 +329,19 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { const int j = idy * 2 + idx; - const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode; - write_inter_mode(bc, blockmode, mv_ref_p); - ++cm->counts.inter_mode[mi->mode_context[rf]] - [INTER_OFFSET(blockmode)]; - - if (blockmode == NEWMV) { + const MB_PREDICTION_MODE b_mode = m->bmi[j].as_mode; + write_inter_mode(bc, b_mode, mv_ref_p); + ++cm->counts.inter_mode[mi->mode_context[ref0]][INTER_OFFSET(b_mode)]; + if (b_mode == NEWMV) { #ifdef ENTROPY_STATS active_section = 11; #endif vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[0].as_mv, - &mi->ref_mvs[rf][0].as_mv, nmvc, allow_hp); + &mi->ref_mvs[ref0][0].as_mv, nmvc, allow_hp); if (has_second_ref(mi)) vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[1].as_mv, - &mi->ref_mvs[sec_rf][0].as_mv, nmvc, allow_hp); + &mi->ref_mvs[ref1][0].as_mv, nmvc, allow_hp); } } } @@ -364,11 +350,11 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { active_section = 5; #endif vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv, - &mi->ref_mvs[rf][0].as_mv, nmvc, allow_hp); + &mi->ref_mvs[ref0][0].as_mv, nmvc, allow_hp); if (has_second_ref(mi)) vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv, - &mi->ref_mvs[sec_rf][0].as_mv, nmvc, allow_hp); + &mi->ref_mvs[ref1][0].as_mv, nmvc, allow_hp); } } } @@ -387,7 +373,7 @@ static void 
write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, if (seg->update_map) write_segment_id(bc, seg, m->mbmi.segment_id); - write_skip_coeff(cpi, segment_id, m, bc); + write_skip(cpi, segment_id, m, bc); if (m->mbmi.sb_type >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT) write_selected_tx_size(cpi, m, m->mbmi.tx_size, m->mbmi.sb_type, bc); @@ -555,16 +541,6 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) { coef_probs[i][j][k][l][m] = get_binary_prob( coef_branch_ct[i][j][k][l][m][0], coef_branch_ct[i][j][k][l][m][1]); -#ifdef ENTROPY_STATS - if (!cpi->dummy_packing) { - int t; - for (t = 0; t < ENTROPY_TOKENS; ++t) - context_counters[tx_size][i][j][k][l][t] += - coef_counts[i][j][k][l][t]; - context_counters[tx_size][i][j][k][l][ENTROPY_TOKENS] += - eob_branch_ct[i][j][k][l]; - } -#endif } } } @@ -643,10 +619,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, if (s > 0 && newp != *oldp) u = 1; vp9_write(bc, u, upd); -#ifdef ENTROPY_STATS - if (!cpi->dummy_packing) - ++tree_update_hist[tx_size][i][j][k][l][t][u]; -#endif if (u) { /* send/use new probability */ vp9_write_prob_diff_update(bc, newp, *oldp); @@ -698,10 +670,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, updates += u; if (u == 0 && updates == 0) { noupdates_before_first++; -#ifdef ENTROPY_STATS - if (!cpi->dummy_packing) - ++tree_update_hist[tx_size][i][j][k][l][t][u]; -#endif continue; } if (u == 1 && updates == 1) { @@ -712,10 +680,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, vp9_write(bc, 0, upd); } vp9_write(bc, u, upd); -#ifdef ENTROPY_STATS - if (!cpi->dummy_packing) - ++tree_update_hist[tx_size][i][j][k][l][t][u]; -#endif if (u) { /* send/use new probability */ vp9_write_prob_diff_update(bc, newp, *oldp); @@ -1037,7 +1001,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { vp9_stop_encode(&residual_bc); if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) { // size of this tile - write_be32(data_ptr + total_size, residual_bc.pos); + mem_put_be32(data_ptr + total_size, residual_bc.pos); total_size += 4; } @@ -1287,11 +1251,12 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) { active_section = 7; #endif - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); first_part_size = write_compressed_header(cpi, data); data += first_part_size; - vp9_wb_write_literal(&saved_wb, first_part_size, 16); + // TODO(jbb): Figure out what to do if first_part_size > 16 bits. + vp9_wb_write_literal(&saved_wb, (int)first_part_size, 16); data += encode_tiles(cpi, data); diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 713cc5132..85f6c97af 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -49,7 +49,6 @@ typedef struct { int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; int rate; int distortion; - int64_t intra_error; int best_mode_index; int rddiv; int rdmult; @@ -63,9 +62,6 @@ typedef struct { // search loop int_mv pred_mv[MAX_REF_FRAMES]; INTERP_FILTER pred_interp_filter; - - // Bit flag for each mode whether it has high error in comparison to others. - unsigned int modes_with_high_error; } PICK_MODE_CONTEXT; struct macroblock_plane { @@ -172,9 +168,7 @@ struct macroblock { int skip_encode; // Used to store sub partition's choices. - int fast_ms; int_mv pred_mv[MAX_REF_FRAMES]; - int subblock_ref; // TODO(jingning): Need to refactor the structure arrays that buffers the // coding mode decisions of each partition type. 
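Editor's note: both sides of this change drop their private byte-order helpers (the decoder's read_be32() removed above, the encoder's write_be32() removed in vp9_bitstream.c) in favor of the shared mem_get_be32()/mem_put_be32() from vpx_ports/mem_ops.h, which are used for the 4-byte tile-size fields. A minimal sketch of equivalent helpers is shown below to make the byte layout explicit; the names example_get_be32/example_put_be32 are illustrative stand-ins, not the mem_ops.h implementation, and the layout simply mirrors the removed read_be32() body quoted earlier in this diff.

/* Illustrative big-endian 32-bit accessors; the swap to mem_get_be32/
 * mem_put_be32 is behavior-preserving because all of these agree on
 * this byte order. */
#include <stdint.h>

static uint32_t example_get_be32(const uint8_t *p) {
  return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
         ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

static void example_put_be32(uint8_t *p, uint32_t v) {
  p[0] = (uint8_t)(v >> 24);  /* most significant byte first */
  p[1] = (uint8_t)(v >> 16);
  p[2] = (uint8_t)(v >> 8);
  p[3] = (uint8_t)v;
}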
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index a840b480a..d5232393f 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -18,8 +18,6 @@ #include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_systemdependent.h" -#include "vp9/encoder/vp9_dct.h" - static INLINE int fdct_round_shift(int input) { int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); assert(INT16_MIN <= rv && rv <= INT16_MAX); @@ -49,7 +47,7 @@ void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we tranpose the columns (that + // as the first pass results are transposed, we transpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). int pass; @@ -157,32 +155,36 @@ static const transform_2d FHT_4[] = { { fadst4, fadst4 } // ADST_ADST = 3 }; -void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, - int stride, int tx_type) { - int16_t out[4 * 4]; - int16_t *outptr = &out[0]; - int i, j; - int16_t temp_in[4], temp_out[4]; - const transform_2d ht = FHT_4[tx_type]; +void vp9_fht4x4_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { + if (tx_type == DCT_DCT) { + vp9_fdct4x4_c(input, output, stride); + } else { + int16_t out[4 * 4]; + int16_t *outptr = &out[0]; + int i, j; + int16_t temp_in[4], temp_out[4]; + const transform_2d ht = FHT_4[tx_type]; - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = input[j * stride + i] * 16; - if (i == 0 && temp_in[0]) - temp_in[0] += 1; - ht.cols(temp_in, temp_out); - for (j = 0; j < 4; ++j) - outptr[j * 4 + i] = temp_out[j]; - } + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = input[j * stride + i] * 16; + if (i == 0 && temp_in[0]) + temp_in[0] += 1; + ht.cols(temp_in, temp_out); + for (j = 0; j < 4; ++j) + outptr[j * 4 + i] = temp_out[j]; + } - // Rows - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j + i * 4]; - ht.rows(temp_in, temp_out); - for (j = 0; j < 4; ++j) - output[j + i * 4] = (temp_out[j] + 1) >> 2; + // Rows + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j + i * 4]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 4; ++j) + output[j + i * 4] = (temp_out[j] + 1) >> 2; + } } } @@ -313,7 +315,7 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we tranpose the columns (that + // as the first pass results are transposed, we transpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). 
int pass; @@ -565,30 +567,34 @@ static const transform_2d FHT_8[] = { { fadst8, fadst8 } // ADST_ADST = 3 }; -void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, - int stride, int tx_type) { - int16_t out[64]; - int16_t *outptr = &out[0]; - int i, j; - int16_t temp_in[8], temp_out[8]; - const transform_2d ht = FHT_8[tx_type]; - - // Columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = input[j * stride + i] * 4; - ht.cols(temp_in, temp_out); - for (j = 0; j < 8; ++j) - outptr[j * 8 + i] = temp_out[j]; - } +void vp9_fht8x8_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { + if (tx_type == DCT_DCT) { + vp9_fdct8x8_c(input, output, stride); + } else { + int16_t out[64]; + int16_t *outptr = &out[0]; + int i, j; + int16_t temp_in[8], temp_out[8]; + const transform_2d ht = FHT_8[tx_type]; + + // Columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = input[j * stride + i] * 4; + ht.cols(temp_in, temp_out); + for (j = 0; j < 8; ++j) + outptr[j * 8 + i] = temp_out[j]; + } - // Rows - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j + i * 8]; - ht.rows(temp_in, temp_out); - for (j = 0; j < 8; ++j) - output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; + // Rows + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j + i * 8]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 8; ++j) + output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; + } } } @@ -958,31 +964,34 @@ static const transform_2d FHT_16[] = { { fadst16, fadst16 } // ADST_ADST = 3 }; -void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, - int stride, int tx_type) { - int16_t out[256]; - int16_t *outptr = &out[0]; - int i, j; - int16_t temp_in[16], temp_out[16]; - const transform_2d ht = FHT_16[tx_type]; - - // Columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = input[j * stride + i] * 4; - ht.cols(temp_in, temp_out); - for (j = 0; j < 16; ++j) - outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; -// outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; - } +void vp9_fht16x16_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { + if (tx_type == DCT_DCT) { + vp9_fdct16x16_c(input, output, stride); + } else { + int16_t out[256]; + int16_t *outptr = &out[0]; + int i, j; + int16_t temp_in[16], temp_out[16]; + const transform_2d ht = FHT_16[tx_type]; + + // Columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = input[j * stride + i] * 4; + ht.cols(temp_in, temp_out); + for (j = 0; j < 16; ++j) + outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; + } - // Rows - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j + i * 16]; - ht.rows(temp_in, temp_out); - for (j = 0; j < 16; ++j) - output[j + i * 16] = temp_out[j]; + // Rows + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j + i * 16]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 16; ++j) + output[j + i * 16] = temp_out[j]; + } } } @@ -1375,27 +1384,3 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { out[j + i * 32] = temp_out[j]; } } - -void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride) { - if (tx_type == DCT_DCT) - vp9_fdct4x4(input, output, stride); - else - vp9_short_fht4x4(input, output, stride, tx_type); -} - -void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride) { - if (tx_type == DCT_DCT) - 
vp9_fdct8x8(input, output, stride); - else - vp9_short_fht8x8(input, output, stride, tx_type); -} - -void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride) { - if (tx_type == DCT_DCT) - vp9_fdct16x16(input, output, stride); - else - vp9_short_fht16x16(input, output, stride, tx_type); -} diff --git a/vp9/encoder/vp9_dct.h b/vp9/encoder/vp9_dct.h deleted file mode 100644 index cf5f001a9..000000000 --- a/vp9/encoder/vp9_dct.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_ENCODER_VP9_DCT_H_ -#define VP9_ENCODER_VP9_DCT_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride); - -void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride); - -void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP9_ENCODER_VP9_DCT_H_ diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 7fb5a03ba..57865138d 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -40,8 +40,6 @@ #include "vp9/encoder/vp9_tokenize.h" #include "vp9/encoder/vp9_vaq.h" -#define DBG_PRNT_SEGMAP 0 - static INLINE uint8_t *get_sb_index(MACROBLOCK *x, BLOCK_SIZE subsize) { switch (subsize) { case BLOCK_64X64: @@ -96,7 +94,8 @@ static const uint8_t VP9_VAR_OFFS[64] = { 128, 128, 128, 128, 128, 128, 128, 128 }; -static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, MACROBLOCK *x, +static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, + MACROBLOCK *x, BLOCK_SIZE bs) { unsigned int var, sse; var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, @@ -104,6 +103,52 @@ static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, MACROBLOCK *x, return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } +static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi, + MACROBLOCK *x, + int mi_row, + int mi_col, + BLOCK_SIZE bs) { + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + int offset = (mi_row * MI_SIZE) * yv12->y_stride + (mi_col * MI_SIZE); + unsigned int var, sse; + var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, + x->plane[0].src.stride, + yv12->y_buffer + offset, + yv12->y_stride, + &sse); + return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); +} + +static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, + int mi_row, + int mi_col) { + unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb, + mi_row, mi_col, + BLOCK_64X64); + if (var < 8) + return BLOCK_64X64; + else if (var < 128) + return BLOCK_32X32; + else if (var < 2048) + return BLOCK_16X16; + else + return BLOCK_8X8; +} + +static BLOCK_SIZE get_nonrd_var_based_fixed_partition(VP9_COMP *cpi, + int mi_row, + int mi_col) { + unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb, + mi_row, mi_col, + BLOCK_64X64); + if (var < 4) + return BLOCK_64X64; + else if (var < 10) + return BLOCK_32X32; + else + return BLOCK_16X16; +} + // Original activity measure from Tim T's code. 
static unsigned int tt_activity_measure(MACROBLOCK *x) { unsigned int sse; @@ -321,7 +366,7 @@ static void build_activity_map(VP9_COMP *cpi) { } // Macroblock activity masking -void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) { +static void activity_masking(VP9_COMP *cpi, MACROBLOCK *x) { #if USE_ACT_INDEX x->rdmult += *(x->mb_activity_ptr) * (x->rdmult >> 2); x->errorperbit = x->rdmult * 100 / (110 * x->rddiv); @@ -347,7 +392,6 @@ static void select_in_frame_q_segment(VP9_COMP *cpi, int mi_row, int mi_col, int output_enabled, int projected_rate) { VP9_COMMON *const cm = &cpi->common; - int target_rate = cpi->rc.sb64_target_rate << 8; // convert to bits << 8 const int mi_offset = mi_row * cm->mi_cols + mi_col; const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; @@ -364,11 +408,10 @@ static void select_in_frame_q_segment(VP9_COMP *cpi, } else { // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh). // It is converted to bits * 256 units - target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) / (bw * bh); + const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) / + (bw * bh); if (projected_rate < (target_rate / 4)) { - segment = 2; - } else if (projected_rate < (target_rate / 2)) { segment = 1; } else { segment = 0; @@ -402,7 +445,6 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; MODE_INFO *mi_addr = xd->mi_8x8[0]; - const int mb_mode_index = ctx->best_mode_index; const int mis = cm->mode_info_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; @@ -474,8 +516,8 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, cpi->rd_tx_select_diff[i] += ctx->tx_rd_diff[i]; } - if (frame_is_intra_only(cm)) { #if CONFIG_INTERNAL_STATS + if (frame_is_intra_only(cm)) { static const int kf_mode_index[] = { THR_DC /*DC_PRED*/, THR_V_PRED /*V_PRED*/, @@ -488,29 +530,32 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, THR_D63_PRED /*D63_PRED*/, THR_TM /*TM_PRED*/, }; - cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]]++; -#endif + ++cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]]; } else { // Note how often each mode chosen as best - cpi->mode_chosen_counts[mb_mode_index]++; - if (is_inter_block(mbmi) && - (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) { - int_mv best_mv[2]; - for (i = 0; i < 1 + has_second_ref(mbmi); ++i) - best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int; - vp9_update_mv_count(cpi, x, best_mv); - } + ++cpi->mode_chosen_counts[ctx->best_mode_index]; + } +#endif + if (!frame_is_intra_only(cm)) { + if (is_inter_block(mbmi)) { + if (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV) { + int_mv best_mv[2]; + for (i = 0; i < 1 + has_second_ref(mbmi); ++i) + best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int; + vp9_update_mv_count(cpi, x, best_mv); + } - if (cm->interp_filter == SWITCHABLE && is_inter_mode(mbmi->mode)) { - const int ctx = vp9_get_pred_context_switchable_interp(xd); - ++cm->counts.switchable_interp[ctx][mbmi->interp_filter]; + if (cm->interp_filter == SWITCHABLE) { + const int ctx = vp9_get_pred_context_switchable_interp(xd); + ++cm->counts.switchable_interp[ctx][mbmi->interp_filter]; + } } cpi->rd_comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff; cpi->rd_comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff; cpi->rd_comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff; - for (i = 0; i < 
SWITCHABLE_FILTER_CONTEXTS; i++) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) cpi->rd_filter_diff[i] += ctx->best_filter_diff[i]; } } @@ -555,8 +600,6 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, xd->mi_8x8 = cm->mi_grid_visible + idx_str; xd->prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str; - // Special case: if prev_mi is NULL, the previous mode info context - // cannot be used. xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL; xd->mi_8x8[0] = cm->mi + idx_str; @@ -613,7 +656,7 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, x->encode_breakout = cpi->segment_encode_breakout[mbmi->segment_id]; } else { mbmi->segment_id = 0; - x->encode_breakout = cpi->oxcf.encode_breakout; + x->encode_breakout = cpi->encode_breakout; } } @@ -631,7 +674,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, int orig_rdmult = x->rdmult; double rdmult_ratio; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); rdmult_ratio = 1.0; // avoid uninitialized warnings // Use the lower precision, but faster, 32x32 fdct for mode selection. @@ -660,32 +703,44 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, x->skip_recode = 0; // Set to zero to make sure we do not use the previous encoded frame stats - xd->mi_8x8[0]->mbmi.skip_coeff = 0; + xd->mi_8x8[0]->mbmi.skip = 0; x->source_variance = get_sby_perpixel_variance(cpi, x, bsize); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { const int energy = bsize <= BLOCK_16X16 ? x->mb_energy : vp9_block_energy(cpi, x, bsize); - xd->mi_8x8[0]->mbmi.segment_id = vp9_vaq_segment_id(energy); + + if (cm->frame_type == KEY_FRAME || + cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + xd->mi_8x8[0]->mbmi.segment_id = vp9_vaq_segment_id(energy); + } else { + const uint8_t *const map = cm->seg.update_map ? 
cpi->segmentation_map + : cm->last_frame_seg_map; + xd->mi_8x8[0]->mbmi.segment_id = + vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); + } + rdmult_ratio = vp9_vaq_rdmult_ratio(energy); vp9_mb_init_quantizer(cpi, x); } if (cpi->oxcf.tuning == VP8_TUNE_SSIM) - vp9_activity_masking(cpi, x); + activity_masking(cpi, x); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); // __asm emms; - x->rdmult = round(x->rdmult * rdmult_ratio); + vp9_clear_system_state(); + x->rdmult = (int)round(x->rdmult * rdmult_ratio); } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { const int mi_offset = mi_row * cm->mi_cols + mi_col; unsigned char complexity = cpi->complexity_map[mi_offset]; - const int is_edge = (mi_row == 0) || (mi_row == (cm->mi_rows - 1)) || - (mi_col == 0) || (mi_col == (cm->mi_cols - 1)); + const int is_edge = (mi_row <= 1) || (mi_row >= (cm->mi_rows - 2)) || + (mi_col <= 1) || (mi_col >= (cm->mi_cols - 2)); - if (!is_edge && (complexity > 128)) + if (!is_edge && (complexity > 128)) { x->rdmult = x->rdmult + ((x->rdmult * (complexity - 128)) / 256); + } } // Find best coding mode & reconstruct the MB so it is available @@ -705,44 +760,51 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->oxcf.aq_mode == VARIANCE_AQ) { x->rdmult = orig_rdmult; if (*totalrate != INT_MAX) { - vp9_clear_system_state(); // __asm emms; - *totalrate = round(*totalrate * rdmult_ratio); + vp9_clear_system_state(); + *totalrate = (int)round(*totalrate * rdmult_ratio); } } + else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + x->rdmult = orig_rdmult; + } } static void update_stats(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *mi = xd->mi_8x8[0]; - MB_MODE_INFO *const mbmi = &mi->mbmi; + const MACROBLOCK *const x = &cpi->mb; + const MACROBLOCKD *const xd = &x->e_mbd; + const MODE_INFO *const mi = xd->mi_8x8[0]; + const MB_MODE_INFO *const mbmi = &mi->mbmi; if (!frame_is_intra_only(cm)) { const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + if (!seg_ref_active) { + FRAME_COUNTS *const counts = &cm->counts; + const int inter_block = is_inter_block(mbmi); - if (!seg_ref_active) - cm->counts.intra_inter[vp9_get_intra_inter_context(xd)] - [is_inter_block(mbmi)]++; - - // If the segment reference feature is enabled we have only a single - // reference frame allowed for the segment so exclude it from - // the reference frame counts used to work out probabilities. - if (is_inter_block(mbmi) && !seg_ref_active) { - if (cm->reference_mode == REFERENCE_MODE_SELECT) - cm->counts.comp_inter[vp9_get_reference_mode_context(cm, xd)] - [has_second_ref(mbmi)]++; - - if (has_second_ref(mbmi)) { - cm->counts.comp_ref[vp9_get_pred_context_comp_ref_p(cm, xd)] - [mbmi->ref_frame[0] == GOLDEN_FRAME]++; - } else { - cm->counts.single_ref[vp9_get_pred_context_single_ref_p1(xd)][0] - [mbmi->ref_frame[0] != LAST_FRAME]++; - if (mbmi->ref_frame[0] != LAST_FRAME) - cm->counts.single_ref[vp9_get_pred_context_single_ref_p2(xd)][1] - [mbmi->ref_frame[0] != GOLDEN_FRAME]++; + counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++; + + // If the segment reference feature is enabled we have only a single + // reference frame allowed for the segment so exclude it from + // the reference frame counts used to work out probabilities. 
+ if (inter_block) { + const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0]; + + if (cm->reference_mode == REFERENCE_MODE_SELECT) + counts->comp_inter[vp9_get_reference_mode_context(cm, xd)] + [has_second_ref(mbmi)]++; + + if (has_second_ref(mbmi)) { + counts->comp_ref[vp9_get_pred_context_comp_ref_p(cm, xd)] + [ref0 == GOLDEN_FRAME]++; + } else { + counts->single_ref[vp9_get_pred_context_single_ref_p1(xd)][0] + [ref0 != LAST_FRAME]++; + if (ref0 != LAST_FRAME) + counts->single_ref[vp9_get_pred_context_single_ref_p2(xd)][1] + [ref0 != GOLDEN_FRAME]++; + } } } } @@ -950,9 +1012,9 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, // may not be allowed in which case this code attempts to choose the largest // allowable partition. static void set_partitioning(VP9_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mi_8x8, int mi_row, int mi_col) { + MODE_INFO **mi_8x8, int mi_row, int mi_col, + BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; - BLOCK_SIZE bsize = cpi->sf.always_this_block_size; const int mis = cm->mode_info_stride; int row8x8_remaining = tile->mi_row_end - mi_row; int col8x8_remaining = tile->mi_col_end - mi_col; @@ -979,7 +1041,7 @@ static void set_partitioning(VP9_COMP *cpi, const TileInfo *const tile, for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) { int index = block_row * mis + block_col; // Find a partition size that fits - bsize = find_partition_size(cpi->sf.always_this_block_size, + bsize = find_partition_size(bsize, (row8x8_remaining - block_row), (col8x8_remaining - block_col), &bh, &bw); mi_8x8[index] = mi_upper_left + index; @@ -1025,38 +1087,19 @@ static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8) { } return 0; } + static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize, int output_enabled) { int i; VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - struct macroblock_plane *const p = x->plane; - struct macroblockd_plane *const pd = xd->plane; MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - const int mb_mode_index = ctx->best_mode_index; - int max_plane; - - max_plane = is_inter_block(mbmi) ? 
MAX_MB_PLANE : 1; - for (i = 0; i < max_plane; ++i) { - p[i].coeff = ctx->coeff_pbuf[i][1]; - p[i].qcoeff = ctx->qcoeff_pbuf[i][1]; - pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; - p[i].eobs = ctx->eobs_pbuf[i][1]; - } - - for (i = max_plane; i < MAX_MB_PLANE; ++i) { - p[i].coeff = ctx->coeff_pbuf[i][2]; - p[i].qcoeff = ctx->qcoeff_pbuf[i][2]; - pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2]; - p[i].eobs = ctx->eobs_pbuf[i][2]; - } - x->skip = ctx->skip; - if (frame_is_intra_only(cm)) { #if CONFIG_INTERNAL_STATS + if (frame_is_intra_only(cm)) { static const int kf_mode_index[] = { THR_DC /*DC_PRED*/, THR_V_PRED /*V_PRED*/, @@ -1070,21 +1113,24 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, THR_TM /*TM_PRED*/, }; ++cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]]; -#endif } else { // Note how often each mode chosen as best - cpi->mode_chosen_counts[mb_mode_index]++; - if (is_inter_block(mbmi) && - (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) { - int_mv best_mv[2]; - for (i = 0; i < 1 + has_second_ref(mbmi); ++i) - best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int; - vp9_update_mv_count(cpi, x, best_mv); - } + ++cpi->mode_chosen_counts[ctx->best_mode_index]; + } +#endif + if (!frame_is_intra_only(cm)) { + if (is_inter_block(mbmi)) { + if (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV) { + int_mv best_mv[2]; + for (i = 0; i < 1 + has_second_ref(mbmi); ++i) + best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int; + vp9_update_mv_count(cpi, x, best_mv); + } - if (cm->interp_filter == SWITCHABLE && is_inter_mode(mbmi->mode)) { - const int ctx = vp9_get_pred_context_switchable_interp(xd); - ++cm->counts.switchable_interp[ctx][mbmi->interp_filter]; + if (cm->interp_filter == SWITCHABLE) { + const int ctx = vp9_get_pred_context_switchable_interp(xd); + ++cm->counts.switchable_interp[ctx][mbmi->interp_filter]; + } } } } @@ -1111,8 +1157,8 @@ static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile, } static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile, - TOKENEXTRA **tp, int mi_row, int mi_col, - int output_enabled, BLOCK_SIZE bsize) { + TOKENEXTRA **tp, int mi_row, int mi_col, + int output_enabled, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4; @@ -1130,7 +1176,6 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile, ctx = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, mi_row, mi_col, bsize); subsize = mi_8x8[0]->mbmi.sb_type; - } else { ctx = 0; subsize = BLOCK_4X4; @@ -1181,7 +1226,7 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile, subsize); *get_sb_index(x, subsize) = 3; encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled, - subsize); + subsize); break; default: assert("Invalid partition type."); @@ -1213,13 +1258,14 @@ static void rd_use_partition(VP9_COMP *cpi, ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; int last_part_rate = INT_MAX; - int64_t last_part_dist = INT_MAX; - int split_rate = INT_MAX; - int64_t split_dist = INT_MAX; + int64_t last_part_dist = INT64_MAX; + int64_t last_part_rd = INT64_MAX; int none_rate = INT_MAX; - int64_t none_dist = INT_MAX; + int64_t none_dist = INT64_MAX; + int64_t none_rd = INT64_MAX; int chosen_rate = INT_MAX; - int64_t chosen_dist = INT_MAX; + int64_t chosen_dist = INT64_MAX; + int64_t chosen_rd = INT64_MAX; BLOCK_SIZE sub_subsize = BLOCK_4X4; int splits_below = 
0; BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type; @@ -1248,10 +1294,8 @@ static void rd_use_partition(VP9_COMP *cpi, x->mb_energy = vp9_block_energy(cpi, x, bsize); } - x->fast_ms = 0; - x->subblock_ref = 0; - - if (cpi->sf.adjust_partitioning_from_last_frame) { + if (cpi->sf.partition_search_type == SEARCH_PARTITION && + cpi->sf.adjust_partitioning_from_last_frame) { // Check if any of the sub blocks are further split. if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) { sub_subsize = get_subsize(subsize, PARTITION_SPLIT); @@ -1277,7 +1321,11 @@ static void rd_use_partition(VP9_COMP *cpi, pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, mi_row, mi_col, bsize); - none_rate += x->partition_cost[pl][PARTITION_NONE]; + + if (none_rate < INT_MAX) { + none_rate += x->partition_cost[pl][PARTITION_NONE]; + none_rd = RDCOST(x->rdmult, x->rddiv, none_rate, none_dist); + } restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); mi_8x8[0]->mbmi.sb_type = bs_type; @@ -1305,9 +1353,9 @@ static void rd_use_partition(VP9_COMP *cpi, *get_sb_index(x, subsize) = 1; rd_pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); - if (rt == INT_MAX || dt == INT_MAX) { + if (rt == INT_MAX || dt == INT64_MAX) { last_part_rate = INT_MAX; - last_part_dist = INT_MAX; + last_part_dist = INT64_MAX; break; } @@ -1329,9 +1377,9 @@ static void rd_use_partition(VP9_COMP *cpi, *get_sb_index(x, subsize) = 1; rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); - if (rt == INT_MAX || dt == INT_MAX) { + if (rt == INT_MAX || dt == INT64_MAX) { last_part_rate = INT_MAX; - last_part_dist = INT_MAX; + last_part_dist = INT64_MAX; break; } last_part_rate += rt; @@ -1357,9 +1405,9 @@ static void rd_use_partition(VP9_COMP *cpi, rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp, mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt, i != 3); - if (rt == INT_MAX || dt == INT_MAX) { + if (rt == INT_MAX || dt == INT64_MAX) { last_part_rate = INT_MAX; - last_part_dist = INT_MAX; + last_part_dist = INT64_MAX; break; } last_part_rate += rt; @@ -1372,16 +1420,19 @@ static void rd_use_partition(VP9_COMP *cpi, pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, mi_row, mi_col, bsize); - if (last_part_rate < INT_MAX) + if (last_part_rate < INT_MAX) { last_part_rate += x->partition_cost[pl][partition]; + last_part_rd = RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist); + } if (cpi->sf.adjust_partitioning_from_last_frame + && cpi->sf.partition_search_type == SEARCH_PARTITION && partition != PARTITION_SPLIT && bsize > BLOCK_8X8 && (mi_row + ms < cm->mi_rows || mi_row + (ms >> 1) == cm->mi_rows) && (mi_col + ms < cm->mi_cols || mi_col + (ms >> 1) == cm->mi_cols)) { BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT); - split_rate = 0; - split_dist = 0; + chosen_rate = 0; + chosen_dist = 0; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); // Split partition. 
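rd_use_partition() in the hunks above and below now carries precomputed 64-bit rate-distortion totals (none_rd, last_part_rd, chosen_rd) instead of re-evaluating RDCOST(...) inline at every comparison, and its distortion sentinels move from INT_MAX to INT64_MAX. A minimal sketch of that selection pattern, assuming a stand-in cost combiner (the real RDCOST macro in vp9_rdopt.h weights rate and distortion by the encoder's rdmult/rddiv; its exact arithmetic is not reproduced here):

#include <stdint.h>
#include <stdio.h>

/* Illustrative rate/distortion combiner; stands in for RDCOST(). */
static int64_t rd_total(int rdmult, int rate, int64_t dist) {
  return (int64_t)rdmult * rate + dist;
}

int main(void) {
  /* INT64_MAX marks a candidate partitioning that was never evaluated. */
  int64_t none_rd      = INT64_MAX;                 /* PARTITION_NONE skipped   */
  int64_t last_part_rd = rd_total(100, 350, 9000);  /* reuse previous partition */
  int64_t chosen_rd    = rd_total(100, 500, 2500);  /* fresh split of the block */

  /* Same ordering as the function: prefer last_part, then none, if cheaper. */
  if (last_part_rd < chosen_rd)
    chosen_rd = last_part_rd;
  if (none_rd < chosen_rd)
    chosen_rd = none_rd;

  printf("best rd cost = %lld\n", (long long)chosen_rd);
  return 0;
}

Using INT64_MAX as the sentinel matches the int64_t type of the distortion totals, which is the substance of the INT_MAX to INT64_MAX changes in these hunks.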
@@ -1408,46 +1459,44 @@ static void rd_use_partition(VP9_COMP *cpi, restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - if (rt == INT_MAX || dt == INT_MAX) { - split_rate = INT_MAX; - split_dist = INT_MAX; + if (rt == INT_MAX || dt == INT64_MAX) { + chosen_rate = INT_MAX; + chosen_dist = INT64_MAX; break; } + chosen_rate += rt; + chosen_dist += dt; + if (i != 3) encode_sb(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, 0, split_subsize); - split_rate += rt; - split_dist += dt; pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, mi_row + y_idx, mi_col + x_idx, split_subsize); - split_rate += x->partition_cost[pl][PARTITION_NONE]; + chosen_rate += x->partition_cost[pl][PARTITION_NONE]; } pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, mi_row, mi_col, bsize); - if (split_rate < INT_MAX) { - split_rate += x->partition_cost[pl][PARTITION_SPLIT]; - - chosen_rate = split_rate; - chosen_dist = split_dist; + if (chosen_rate < INT_MAX) { + chosen_rate += x->partition_cost[pl][PARTITION_SPLIT]; + chosen_rd = RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist); } } // If last_part is better set the partitioning to that... - if (RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist) - < RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)) { + if (last_part_rd < chosen_rd) { mi_8x8[0]->mbmi.sb_type = bsize; if (bsize >= BLOCK_8X8) *(get_sb_partitioning(x, bsize)) = subsize; chosen_rate = last_part_rate; chosen_dist = last_part_dist; + chosen_rd = last_part_rd; } // If none was better set the partitioning to that... - if (RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist) - > RDCOST(x->rdmult, x->rddiv, none_rate, none_dist)) { + if (none_rd < chosen_rd) { if (bsize >= BLOCK_8X8) *(get_sb_partitioning(x, bsize)) = bsize; chosen_rate = none_rate; @@ -1459,7 +1508,7 @@ static void rd_use_partition(VP9_COMP *cpi, // We must have chosen a partitioning and encoding or we'll fail later on. // No other opportunities for success. if ( bsize == BLOCK_64X64) - assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX); + assert(chosen_rate < INT_MAX && chosen_dist < INT64_MAX); if (do_recon) { int output_enabled = (bsize == BLOCK_64X64); @@ -1523,6 +1572,15 @@ static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8, } } +// Next square block size less or equal than current block size. +static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = { + BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, + BLOCK_8X8, BLOCK_8X8, BLOCK_8X8, + BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, + BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, + BLOCK_64X64 +}; + // Look at neighboring blocks and set a min and max partition size based on // what they chose. static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, @@ -1589,95 +1647,13 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, row8x8_remaining, col8x8_remaining, &bh, &bw); *min_block_size = MIN(*min_block_size, *max_block_size); -} -static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - - // Only use 8x8 result for non HD videos. - // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 
1 : 0; - int use_8x8 = 1; - - if (cm->frame_type && !cpi->rc.is_src_frame_alt_ref && - ((use_8x8 && bsize == BLOCK_16X16) || - bsize == BLOCK_32X32 || bsize == BLOCK_64X64)) { - int ref0 = 0, ref1 = 0, ref2 = 0, ref3 = 0; - PICK_MODE_CONTEXT *block_context = NULL; - - if (bsize == BLOCK_16X16) { - block_context = x->sb8x8_context[x->sb_index][x->mb_index]; - } else if (bsize == BLOCK_32X32) { - block_context = x->mb_context[x->sb_index]; - } else if (bsize == BLOCK_64X64) { - block_context = x->sb32_context; - } - - if (block_context) { - ref0 = block_context[0].mic.mbmi.ref_frame[0]; - ref1 = block_context[1].mic.mbmi.ref_frame[0]; - ref2 = block_context[2].mic.mbmi.ref_frame[0]; - ref3 = block_context[3].mic.mbmi.ref_frame[0]; - } - - // Currently, only consider 4 inter reference frames. - if (ref0 && ref1 && ref2 && ref3) { - int d01, d23, d02, d13; - - // Motion vectors for the four subblocks. - int16_t mvr0 = block_context[0].mic.mbmi.mv[0].as_mv.row; - int16_t mvc0 = block_context[0].mic.mbmi.mv[0].as_mv.col; - int16_t mvr1 = block_context[1].mic.mbmi.mv[0].as_mv.row; - int16_t mvc1 = block_context[1].mic.mbmi.mv[0].as_mv.col; - int16_t mvr2 = block_context[2].mic.mbmi.mv[0].as_mv.row; - int16_t mvc2 = block_context[2].mic.mbmi.mv[0].as_mv.col; - int16_t mvr3 = block_context[3].mic.mbmi.mv[0].as_mv.row; - int16_t mvc3 = block_context[3].mic.mbmi.mv[0].as_mv.col; - - // Adjust sign if ref is alt_ref. - if (cm->ref_frame_sign_bias[ref0]) { - mvr0 *= -1; - mvc0 *= -1; - } - - if (cm->ref_frame_sign_bias[ref1]) { - mvr1 *= -1; - mvc1 *= -1; - } - - if (cm->ref_frame_sign_bias[ref2]) { - mvr2 *= -1; - mvc2 *= -1; - } - - if (cm->ref_frame_sign_bias[ref3]) { - mvr3 *= -1; - mvc3 *= -1; - } - - // Calculate mv distances. - d01 = MAX(abs(mvr0 - mvr1), abs(mvc0 - mvc1)); - d23 = MAX(abs(mvr2 - mvr3), abs(mvc2 - mvc3)); - d02 = MAX(abs(mvr0 - mvr2), abs(mvc0 - mvc2)); - d13 = MAX(abs(mvr1 - mvr3), abs(mvc1 - mvc3)); - - if (d01 < FAST_MOTION_MV_THRESH && d23 < FAST_MOTION_MV_THRESH && - d02 < FAST_MOTION_MV_THRESH && d13 < FAST_MOTION_MV_THRESH) { - // Set fast motion search level. - x->fast_ms = 1; - - if (ref0 == ref1 && ref1 == ref2 && ref2 == ref3 && - d01 < 2 && d23 < 2 && d02 < 2 && d13 < 2) { - // Set fast motion search level. - x->fast_ms = 2; - - if (!d01 && !d23 && !d02 && !d13) { - x->fast_ms = 3; - x->subblock_ref = ref0; - } - } - } - } + // When use_square_partition_only is true, make sure at least one square + // partition is allowed by selecting the next smaller square size as + // *min_block_size. 
+ if (cpi->sf.use_square_partition_only && + (*max_block_size - *min_block_size) < 2) { + *min_block_size = next_square_size[*min_block_size]; } } @@ -1720,8 +1696,6 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, bsize >= BLOCK_8X8; int partition_vert_allowed = !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; - - int partition_split_done = 0; (void) *tp_orig; if (bsize < BLOCK_8X8) { @@ -1863,18 +1837,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->sf.less_rectangular_check) do_rect &= !partition_none_allowed; } - partition_split_done = 1; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - x->fast_ms = 0; - x->subblock_ref = 0; - - if (partition_split_done && - cpi->sf.using_small_partition_info) { - compute_fast_motion_search_level(cpi, bsize); - } - // PARTITION_HORZ if (partition_horz_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_HORZ); @@ -1979,7 +1944,11 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - + // TODO(jbb): This code added so that we avoid static analysis + // warning related to the fact that best_rd isn't used after this + // point. This code should be refactored so that the duplicate + // checks occur in some sub function and thus are used... + (void) best_rd; *rate = best_rate; *dist = best_dist; @@ -1997,49 +1966,14 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (bsize == BLOCK_64X64) { assert(tp_orig < *tp); assert(best_rate < INT_MAX); - assert(best_dist < INT_MAX); + assert(best_dist < INT64_MAX); } else { assert(tp_orig == *tp); } } -// Examines 64x64 block and chooses a best reference frame -static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, int mi_col) { - VP9_COMMON * const cm = &cpi->common; - MACROBLOCK * const x = &cpi->mb; - int bsl = b_width_log2(BLOCK_64X64), bs = 1 << bsl; - int ms = bs / 2; - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; - PARTITION_CONTEXT sl[8], sa[8]; - int pl; - int r; - int64_t d; - - save_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64); - - // Default is non mask (all reference frames allowed. - cpi->ref_frame_mask = 0; - - // Do RD search for 64x64. 
- if ((mi_row + (ms >> 1) < cm->mi_rows) && - (mi_col + (ms >> 1) < cm->mi_cols)) { - cpi->set_ref_frame_mask = 1; - rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64, - get_block_context(x, BLOCK_64X64), INT64_MAX); - pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, - mi_row, mi_col, BLOCK_64X64); - r += x->partition_cost[pl][PARTITION_NONE]; - - *(get_sb_partitioning(x, BLOCK_64X64)) = BLOCK_64X64; - cpi->set_ref_frame_mask = 0; - } - - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64); -} - -static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, TOKENEXTRA **tp) { +static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, TOKENEXTRA **tp) { VP9_COMMON *const cm = &cpi->common; int mi_col; @@ -2055,28 +1989,45 @@ static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile, BLOCK_SIZE i; MACROBLOCK *x = &cpi->mb; - for (i = BLOCK_4X4; i < BLOCK_8X8; ++i) { - const int num_4x4_w = num_4x4_blocks_wide_lookup[i]; - const int num_4x4_h = num_4x4_blocks_high_lookup[i]; - const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h); - for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) - for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) - for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) - get_block_context(x, i)->pred_interp_filter = SWITCHABLE; + + if (cpi->sf.adaptive_pred_interp_filter) { + for (i = BLOCK_4X4; i < BLOCK_8X8; ++i) { + const int num_4x4_w = num_4x4_blocks_wide_lookup[i]; + const int num_4x4_h = num_4x4_blocks_high_lookup[i]; + const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h); + for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) + for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) + for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) + get_block_context(x, i)->pred_interp_filter = SWITCHABLE; + } } vp9_zero(cpi->mb.pred_mv); - if (cpi->sf.use_lastframe_partitioning || - cpi->sf.use_one_partition_size_always ) { + if ((cpi->sf.partition_search_type == SEARCH_PARTITION && + cpi->sf.use_lastframe_partitioning) || + cpi->sf.partition_search_type == FIXED_PARTITION || + cpi->sf.partition_search_type == VAR_BASED_FIXED_PARTITION) { const int idx_str = cm->mode_info_stride * mi_row + mi_col; MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str; MODE_INFO **prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str; cpi->mb.source_variance = UINT_MAX; - if (cpi->sf.use_one_partition_size_always) { + if (cpi->sf.partition_search_type == FIXED_PARTITION) { + set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); + set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, + cpi->sf.always_this_block_size); + rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, + &dummy_rate, &dummy_dist, 1); + } else if (cpi->sf.partition_search_type == VAR_BASED_FIXED_PARTITION || + cpi->sf.partition_search_type == VAR_BASED_PARTITION) { + // TODO(debargha): Implement VAR_BASED_PARTITION as a separate case. + // Currently both VAR_BASED_FIXED_PARTITION/VAR_BASED_PARTITION + // map to the same thing. 
+ BLOCK_SIZE bsize; set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); - set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col); + bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col); + set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, bsize); rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1); } else { @@ -2183,118 +2134,6 @@ static void switch_tx_mode(VP9_COMP *cpi) { cpi->common.tx_mode = ALLOW_32X32; } -static void encode_frame_internal(VP9_COMP *cpi) { - int mi_row; - MACROBLOCK *const x = &cpi->mb; - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - -// fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n", -// cpi->common.current_video_frame, cpi->common.show_frame, -// cm->frame_type); - -// debug output -#if DBG_PRNT_SEGMAP - { - FILE *statsfile; - statsfile = fopen("segmap2.stt", "a"); - fprintf(statsfile, "\n"); - fclose(statsfile); - } -#endif - - vp9_zero(cm->counts.switchable_interp); - vp9_zero(cpi->tx_stepdown_count); - - xd->mi_8x8 = cm->mi_grid_visible; - // required for vp9_frame_init_quantizer - xd->mi_8x8[0] = cm->mi; - - xd->last_mi = cm->prev_mi; - - vp9_zero(cm->counts.mv); - vp9_zero(cpi->coef_counts); - vp9_zero(cm->counts.eob_branch); - - cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && cm->y_dc_delta_q == 0 - && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; - switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless); - - vp9_frame_init_quantizer(cpi); - - vp9_initialize_rd_consts(cpi); - vp9_initialize_me_consts(cpi, cm->base_qindex); - switch_tx_mode(cpi); - - if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { - // Initialize encode frame context. - init_encode_frame_mb_context(cpi); - - // Build a frame level activity map - build_activity_map(cpi); - } - - // Re-initialize encode frame context. 
- init_encode_frame_mb_context(cpi); - - vp9_zero(cpi->rd_comp_pred_diff); - vp9_zero(cpi->rd_filter_diff); - vp9_zero(cpi->rd_tx_select_diff); - vp9_zero(cpi->rd_tx_select_threshes); - - set_prev_mi(cm); - - { - struct vpx_usec_timer emr_timer; - vpx_usec_timer_start(&emr_timer); - - { - // Take tiles into account and give start/end MB - int tile_col, tile_row; - TOKENEXTRA *tp = cpi->tok; - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - for (tile_col = 0; tile_col < tile_cols; tile_col++) { - TileInfo tile; - TOKENEXTRA *tp_old = tp; - - // For each row of SBs in the frame - vp9_tile_init(&tile, cm, tile_row, tile_col); - for (mi_row = tile.mi_row_start; - mi_row < tile.mi_row_end; mi_row += 8) - encode_sb_row(cpi, &tile, mi_row, &tp); - - cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old); - assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); - } - } - } - - vpx_usec_timer_mark(&emr_timer); - cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer); - } - - if (cpi->sf.skip_encode_sb) { - int j; - unsigned int intra_count = 0, inter_count = 0; - for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) { - intra_count += cm->counts.intra_inter[j][0]; - inter_count += cm->counts.intra_inter[j][1]; - } - cpi->sf.skip_encode_frame = ((intra_count << 2) < inter_count); - cpi->sf.skip_encode_frame &= (cm->frame_type != KEY_FRAME); - cpi->sf.skip_encode_frame &= cm->show_frame; - } else { - cpi->sf.skip_encode_frame = 0; - } - -#if 0 - // Keep record of the total distortion this time around for future use - cpi->last_frame_distortion = cpi->frame_distortion; -#endif -} static int check_dual_ref_flags(VP9_COMP *cpi) { const int ref_flags = cpi->ref_frame_flags; @@ -2312,7 +2151,7 @@ static int get_skip_flag(MODE_INFO **mi_8x8, int mis, int ymbs, int xmbs) { for (y = 0; y < ymbs; y++) { for (x = 0; x < xmbs; x++) { - if (!mi_8x8[y * mis + x]->mbmi.skip_coeff) + if (!mi_8x8[y * mis + x]->mbmi.skip) return 0; } } @@ -2443,6 +2282,7 @@ static void select_tx_mode(VP9_COMP *cpi) { } } } + // Start RTC Exploration typedef enum { BOTH_ZERO = 0, @@ -2470,98 +2310,75 @@ static void set_mode_info(MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, mbmi->ref_frame[1] = INTRA_FRAME; mbmi->tx_size = max_txsize_lookup[bsize]; mbmi->uv_mode = mode; - mbmi->skip_coeff = 0; + mbmi->skip = 0; mbmi->sb_type = bsize; mbmi->segment_id = 0; } + static INLINE int get_block_row(int b32i, int b16i, int b8i) { return ((b32i >> 1) << 2) + ((b16i >> 1) << 1) + (b8i >> 1); } + static INLINE int get_block_col(int b32i, int b16i, int b8i) { return ((b32i & 1) << 2) + ((b16i & 1) << 1) + (b8i & 1); } -static void rtc_use_partition(VP9_COMP *cpi, - const TileInfo *const tile, - MODE_INFO **mi_8x8, - TOKENEXTRA **tp, int mi_row, int mi_col, - BLOCK_SIZE bsize, int *rate, int64_t *dist, - int do_recon) { + +static void nonrd_use_partition(VP9_COMP *cpi, const TileInfo *const tile, + TOKENEXTRA **tp, int mi_row, int mi_col, + BLOCK_SIZE bsize, int *rate, int64_t *dist) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &cpi->mb.e_mbd; - const int mis = cm->mode_info_stride; - int mi_width = num_8x8_blocks_wide_lookup[cpi->sf.always_this_block_size]; - int mi_height = num_8x8_blocks_high_lookup[cpi->sf.always_this_block_size]; + int mis = cm->mode_info_stride; + int br, bc; int i, j; int chosen_rate = INT_MAX; - int64_t chosen_dist = INT_MAX; + int64_t chosen_dist = 
INT64_MAX; MB_PREDICTION_MODE mode = DC_PRED; - int row8x8_remaining = tile->mi_row_end - mi_row; - int col8x8_remaining = tile->mi_col_end - mi_col; - int b32i; - x->fast_ms = 0; - x->subblock_ref = 0; - for (b32i = 0; b32i < 4; b32i++) { - int b16i; - for (b16i = 0; b16i < 4; b16i++) { - int b8i; - int block_row = get_block_row(b32i, b16i, 0); - int block_col = get_block_col(b32i, b16i, 0); - int index = block_row * mis + block_col; - int rate; - int64_t dist; - - int_mv frame_nearest_mv[MAX_REF_FRAMES]; - int_mv frame_near_mv[MAX_REF_FRAMES]; - struct buf_2d yv12_mb[MAX_REF_FRAMES][MAX_MB_PLANE]; - - // Find a partition size that fits - bsize = find_partition_size(cpi->sf.always_this_block_size, - (row8x8_remaining - block_row), - (col8x8_remaining - block_col), - &mi_height, &mi_width); - mi_8x8[index] = mi_8x8[0] + index; - - set_mi_row_col(xd, tile, mi_row + block_row, mi_height, - mi_col + block_col, mi_width, cm->mi_rows, cm->mi_cols); - - xd->mi_8x8 = mi_8x8 + index; - - if (cm->frame_type != KEY_FRAME) { - set_offsets(cpi, tile, mi_row + block_row, mi_col + block_col, bsize); - - vp9_pick_inter_mode(cpi, x, tile, - mi_row + block_row, mi_col + block_col, - &rate, &dist, bsize); - } else { - set_mode_info(&mi_8x8[index]->mbmi, bsize, mode, - mi_row + block_row, mi_col + block_col); - vp9_setup_buffer_inter(cpi, x, tile, - LAST_FRAME, cpi->sf.always_this_block_size, - mi_row + block_row, mi_col + block_col, - frame_nearest_mv, frame_near_mv, yv12_mb); - } + int rows = MIN(MI_BLOCK_SIZE, tile->mi_row_end - mi_row); + int cols = MIN(MI_BLOCK_SIZE, tile->mi_col_end - mi_col); - for (j = 0; j < mi_height; j++) - for (i = 0; i < mi_width; i++) - if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > i - && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > j) { - mi_8x8[index+ i + j * mis] = mi_8x8[index]; - } + int bw = num_8x8_blocks_wide_lookup[bsize]; + int bh = num_8x8_blocks_high_lookup[bsize]; - for (b8i = 0; b8i < 4; b8i++) { - } + int brate = 0; + int64_t bdist = 0; + *rate = 0; + *dist = 0; + + // find prediction mode for each 8x8 block + for (br = 0; br < rows; br += bh) { + for (bc = 0; bc < cols; bc += bw) { + int row = mi_row + br; + int col = mi_col + bc; + + BLOCK_SIZE bs = find_partition_size(bsize, rows - br, cols - bc, + &bh, &bw); + set_offsets(cpi, tile, row, col, bs); + + if (cm->frame_type != KEY_FRAME) + vp9_pick_inter_mode(cpi, x, tile, row, col, &brate, &bdist, bs); + else + set_mode_info(&xd->mi_8x8[0]->mbmi, bs, mode, row, col); + + *rate += brate; + *dist += bdist; + + for (j = 0; j < bh; ++j) + for (i = 0; i < bw; ++i) { + xd->mi_8x8[j * mis + i] = xd->mi_8x8[0]; + } } } - encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, BLOCK_64X64); *rate = chosen_rate; *dist = chosen_dist; + + encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, BLOCK_64X64); } -static void encode_rtc_sb_row(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, TOKENEXTRA **tp) { - VP9_COMMON * const cm = &cpi->common; +static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, TOKENEXTRA **tp) { int mi_col; // Initialize the left context for the new SB row @@ -2574,38 +2391,39 @@ static void encode_rtc_sb_row(VP9_COMP *cpi, const TileInfo *const tile, int dummy_rate; int64_t dummy_dist; - const int idx_str = cm->mode_info_stride * mi_row + mi_col; - MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str; - cpi->mb.source_variance = UINT_MAX; - set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); - set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col); 
- rtc_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1); + + if (cpi->sf.partition_search_type == FIXED_PARTITION) { + nonrd_use_partition(cpi, tile, tp, mi_row, mi_col, + cpi->sf.always_this_block_size, + &dummy_rate, &dummy_dist); + } else if (cpi->sf.partition_search_type == VAR_BASED_FIXED_PARTITION || + cpi->sf.partition_search_type == VAR_BASED_PARTITION) { + // TODO(debargha): Implement VAR_BASED_PARTITION as a separate case. + // Currently both VAR_BASED_FIXED_PARTITION/VAR_BASED_PARTITION + // map to the same thing. + BLOCK_SIZE bsize = get_nonrd_var_based_fixed_partition(cpi, + mi_row, + mi_col); + nonrd_use_partition(cpi, tile, tp, mi_row, mi_col, + bsize, &dummy_rate, &dummy_dist); + } else { + assert(0); + } } } +// end RTC play code - -static void encode_rtc_frame_internal(VP9_COMP *cpi) { +static void encode_frame_internal(VP9_COMP *cpi) { int mi_row; - MACROBLOCK * const x = &cpi->mb; - VP9_COMMON * const cm = &cpi->common; - MACROBLOCKD * const xd = &x->e_mbd; + MACROBLOCK *const x = &cpi->mb; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; // fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n", // cpi->common.current_video_frame, cpi->common.show_frame, // cm->frame_type); -// debug output -#if DBG_PRNT_SEGMAP - { - FILE *statsfile; - statsfile = fopen("segmap2.stt", "a"); - fprintf(statsfile, "\n"); - fclose(statsfile); - } -#endif - vp9_zero(cm->counts.switchable_interp); vp9_zero(cpi->tx_stepdown_count); @@ -2615,7 +2433,7 @@ static void encode_rtc_frame_internal(VP9_COMP *cpi) { xd->last_mi = cm->prev_mi; - vp9_zero(cpi->common.counts.mv); + vp9_zero(cm->counts.mv); vp9_zero(cpi->coef_counts); vp9_zero(cm->counts.eob_branch); @@ -2628,7 +2446,6 @@ static void encode_rtc_frame_internal(VP9_COMP *cpi) { vp9_initialize_rd_consts(cpi); vp9_initialize_me_consts(cpi, cm->base_qindex); switch_tx_mode(cpi); - cpi->sf.always_this_block_size = BLOCK_16X16; if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { // Initialize encode frame context. @@ -2648,6 +2465,22 @@ static void encode_rtc_frame_internal(VP9_COMP *cpi) { set_prev_mi(cm); + if (cpi->sf.use_nonrd_pick_mode) { + // Initialize internal buffer pointers for rtc coding, where non-RD + // mode decision is used and hence no buffer pointer swap needed. 
+ int i; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + PICK_MODE_CONTEXT *ctx = &cpi->mb.sb64_context; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][0]; + p[i].qcoeff = ctx->qcoeff_pbuf[i][0]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0]; + p[i].eobs = ctx->eobs_pbuf[i][0]; + } + } + { struct vpx_usec_timer emr_timer; vpx_usec_timer_start(&emr_timer); @@ -2667,9 +2500,12 @@ static void encode_rtc_frame_internal(VP9_COMP *cpi) { // For each row of SBs in the frame vp9_tile_init(&tile, cm, tile_row, tile_col); for (mi_row = tile.mi_row_start; - mi_row < tile.mi_row_end; mi_row += 8) - encode_rtc_sb_row(cpi, &tile, mi_row, &tp); - + mi_row < tile.mi_row_end; mi_row += MI_BLOCK_SIZE) { + if (cpi->sf.use_nonrd_pick_mode) + encode_nonrd_sb_row(cpi, &tile, mi_row, &tp); + else + encode_rd_sb_row(cpi, &tile, mi_row, &tp); + } cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old); assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); } @@ -2699,8 +2535,6 @@ static void encode_rtc_frame_internal(VP9_COMP *cpi) { cpi->last_frame_distortion = cpi->frame_distortion; #endif } -// end RTC play code - void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; @@ -2725,7 +2559,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { } } - if (cpi->sf.RD) { + if (cpi->sf.frame_parameter_update) { int i; REFERENCE_MODE reference_mode; /* @@ -2775,10 +2609,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { select_tx_mode(cpi); cm->reference_mode = reference_mode; - if (cpi->sf.super_fast_rtc) - encode_rtc_frame_internal(cpi); - else - encode_frame_internal(cpi); + encode_frame_internal(cpi); for (i = 0; i < REFERENCE_MODES; ++i) { const int diff = (int) (cpi->rd_comp_pred_diff[i] / cm->MBs); @@ -2858,10 +2689,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { } else { // Force the usage of the BILINEAR interp_filter. 
cm->interp_filter = BILINEAR; - if (cpi->sf.super_fast_rtc) - encode_rtc_frame_internal(cpi); - else - encode_frame_internal(cpi); + encode_frame_internal(cpi); } } @@ -2936,9 +2764,10 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, const int mis = cm->mode_info_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; + x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 && (cpi->oxcf.aq_mode != COMPLEXITY_AQ) && - !cpi->sf.super_fast_rtc; + !cpi->sf.use_nonrd_pick_mode; x->skip_optimize = ctx->is_coded; ctx->is_coded = 1; x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct; @@ -2969,11 +2798,13 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, } if (!is_inter_block(mbmi)) { - mbmi->skip_coeff = 1; - vp9_encode_intra_block_y(x, MAX(bsize, BLOCK_8X8)); - vp9_encode_intra_block_uv(x, MAX(bsize, BLOCK_8X8)); + int plane; + mbmi->skip = 1; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) + vp9_encode_intra_block_plane(x, MAX(bsize, BLOCK_8X8), plane); if (output_enabled) sum_intra_stats(&cm->counts, mi); + vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); } else { int ref; const int is_compound = has_second_ref(mbmi); @@ -2983,26 +2814,24 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, setup_pre_planes(xd, ref, cfg, mi_row, mi_col, &xd->block_refs[ref]->sf); } vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); - } - if (!is_inter_block(mbmi)) { - vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); - } else if (!x->skip) { - mbmi->skip_coeff = 1; - vp9_encode_sb(x, MAX(bsize, BLOCK_8X8)); - vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); - } else { - mbmi->skip_coeff = 1; - if (output_enabled) - cm->counts.skip[vp9_get_skip_context(xd)][1]++; - reset_skip_context(xd, MAX(bsize, BLOCK_8X8)); + if (!x->skip) { + mbmi->skip = 1; + vp9_encode_sb(x, MAX(bsize, BLOCK_8X8)); + vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); + } else { + mbmi->skip = 1; + if (output_enabled) + cm->counts.skip[vp9_get_skip_context(xd)][1]++; + reset_skip_context(xd, MAX(bsize, BLOCK_8X8)); + } } if (output_enabled) { if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 && !(is_inter_block(mbmi) && - (mbmi->skip_coeff || + (mbmi->skip || vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) { ++get_tx_counts(max_txsize_lookup[bsize], vp9_get_tx_size_context(xd), &cm->counts.tx)[mbmi->tx_size]; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 376a899e0..13eabe05d 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -19,29 +19,39 @@ #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_systemdependent.h" -#include "vp9/encoder/vp9_dct.h" #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_tokenize.h" +struct optimize_ctx { + ENTROPY_CONTEXT ta[MAX_MB_PLANE][16]; + ENTROPY_CONTEXT tl[MAX_MB_PLANE][16]; +}; + +struct encode_b_args { + MACROBLOCK *x; + struct optimize_ctx *ctx; + unsigned char *skip; +}; + void vp9_subtract_block_c(int rows, int cols, - int16_t *diff_ptr, ptrdiff_t diff_stride, - const uint8_t *src_ptr, ptrdiff_t src_stride, - const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int16_t *diff, ptrdiff_t diff_stride, + const uint8_t *src, ptrdiff_t src_stride, + const uint8_t *pred, ptrdiff_t 
pred_stride) { int r, c; for (r = 0; r < rows; r++) { for (c = 0; c < cols; c++) - diff_ptr[c] = src_ptr[c] - pred_ptr[c]; + diff[c] = src[c] - pred[c]; - diff_ptr += diff_stride; - pred_ptr += pred_stride; - src_ptr += src_stride; + diff += diff_stride; + pred += pred_stride; + src += src_stride; } } -static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { +void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); @@ -52,22 +62,6 @@ static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { pd->dst.buf, pd->dst.stride); } -void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { - subtract_plane(x, bsize, 0); -} - -void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize) { - int i; - - for (i = 1; i < MAX_MB_PLANE; i++) - subtract_plane(x, bsize, i); -} - -void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { - vp9_subtract_sby(x, bsize); - vp9_subtract_sbuv(x, bsize); -} - #define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF) typedef struct vp9_token_state vp9_token_state; @@ -111,19 +105,18 @@ static int trellis_get_coeff_context(const int16_t *scan, return pt; } -static void optimize_b(MACROBLOCK *mb, - int plane, int block, BLOCK_SIZE plane_bsize, - ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, - TX_SIZE tx_size) { +static void optimize_b(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, MACROBLOCK *mb, + struct optimize_ctx *ctx) { MACROBLOCKD *const xd = &mb->e_mbd; struct macroblock_plane *p = &mb->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi); vp9_token_state tokens[1025][2]; unsigned best_index[1025][2]; - const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, block); - int16_t *qcoeff_ptr; - int16_t *dqcoeff_ptr; + const int16_t *coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block); + int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); + int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); int eob = p->eobs[block], final_eob, sz = 0; const int i0 = 0; int rc, x, next, i; @@ -133,7 +126,6 @@ static void optimize_b(MACROBLOCK *mb, PLANE_TYPE type = pd->plane_type; int err_mult = plane_rd_mult[type]; const int default_eob = 16 << (tx_size << 1); - const int mul = 1 + (tx_size == TX_32X32); uint8_t token_cache[1024]; const int16_t *dequant_ptr = pd->dequant; @@ -141,10 +133,13 @@ static void optimize_b(MACROBLOCK *mb, const scan_order *so = get_scan(xd, tx_size, type, block); const int16_t *scan = so->scan; const int16_t *nb = so->neighbors; + ENTROPY_CONTEXT *a, *l; + int tx_x, tx_y; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &tx_x, &tx_y); + a = &ctx->ta[plane][tx_x]; + l = &ctx->tl[plane][tx_y]; assert((!type && !plane) || (type && plane)); - dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block); - qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block); assert(eob <= default_eob); /* Now set up a Viterbi trellis to evaluate alternative roundings. */ @@ -162,13 +157,13 @@ static void optimize_b(MACROBLOCK *mb, next = eob; for (i = 0; i < eob; i++) token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[ - qcoeff_ptr[scan[i]]].token]; + qcoeff[scan[i]]].token]; for (i = eob; i-- > i0;) { int base_bits, d2, dx; rc = scan[i]; - x = qcoeff_ptr[rc]; + x = qcoeff[rc]; /* Only add a trellis state for non-zero coefficients. 
*/ if (x) { int shortcut = 0; @@ -193,7 +188,7 @@ static void optimize_b(MACROBLOCK *mb, /* And pick the best. */ best = rd_cost1 < rd_cost0; base_bits = *(vp9_dct_value_cost_ptr + x); - dx = mul * (dqcoeff_ptr[rc] - coeff_ptr[rc]); + dx = mul * (dqcoeff[rc] - coeff[rc]); d2 = dx * dx; tokens[i][0].rate = base_bits + (best ? rate1 : rate0); tokens[i][0].error = d2 + (best ? error1 : error0); @@ -206,8 +201,8 @@ static void optimize_b(MACROBLOCK *mb, rate0 = tokens[next][0].rate; rate1 = tokens[next][1].rate; - if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc]) * mul) && - (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) * mul + + if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) && + (abs(x)*dequant_ptr[rc != 0] < abs(coeff[rc]) * mul + dequant_ptr[rc != 0])) shortcut = 1; else @@ -296,16 +291,16 @@ static void optimize_b(MACROBLOCK *mb, UPDATE_RD_COST(); best = rd_cost1 < rd_cost0; final_eob = i0 - 1; - vpx_memset(qcoeff_ptr, 0, sizeof(*qcoeff_ptr) * (16 << (tx_size * 2))); - vpx_memset(dqcoeff_ptr, 0, sizeof(*dqcoeff_ptr) * (16 << (tx_size * 2))); + vpx_memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2))); + vpx_memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2))); for (i = next; i < eob; i = next) { x = tokens[i][best].qc; if (x) { final_eob = i; } rc = scan[i]; - qcoeff_ptr[rc] = x; - dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]) / mul; + qcoeff[rc] = x; + dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul; next = tokens[i][best].next; best = best_index[i][best]; @@ -316,60 +311,39 @@ static void optimize_b(MACROBLOCK *mb, *a = *l = (final_eob > 0); } -void vp9_optimize_b(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, MACROBLOCK *mb, struct optimize_ctx *ctx) { - int x, y; - txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); - optimize_b(mb, plane, block, plane_bsize, - &ctx->ta[plane][x], &ctx->tl[plane][y], tx_size); +static INLINE void fdct32x32(int rd_transform, + const int16_t *src, int16_t *dst, int src_stride) { + if (rd_transform) + vp9_fdct32x32_rd(src, dst, src_stride); + else + vp9_fdct32x32(src, dst, src_stride); } -static void optimize_init_b(int plane, BLOCK_SIZE bsize, - struct encode_b_args *args) { - const MACROBLOCKD *xd = &args->x->e_mbd; - const struct macroblockd_plane* const pd = &xd->plane[plane]; - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); - const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; - const MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - const TX_SIZE tx_size = plane ? 
get_uv_tx_size(mbmi) : mbmi->tx_size; - - vp9_get_entropy_contexts(tx_size, args->ctx->ta[plane], args->ctx->tl[plane], - pd->above_context, pd->left_context, - num_4x4_w, num_4x4_h); -} -void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - struct encode_b_args* const args = arg; - MACROBLOCK* const x = args->x; - MACROBLOCKD* const xd = &x->e_mbd; - struct macroblock_plane *const p = &x->plane[plane]; - struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t *coeff = BLOCK_OFFSET(p->coeff, block); - int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); - int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const scan_order *scan_order; - uint16_t *eob = &p->eobs[block]; +void vp9_xform_quant(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); + int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; int i, j; - int16_t *src_diff; + const int16_t *src_diff; txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); src_diff = &p->src_diff[4 * (j * diff_stride + i)]; switch (tx_size) { case TX_32X32: - scan_order = &vp9_default_scan_orders[TX_32X32]; - if (x->use_lp32x32fdct) - vp9_fdct32x32_rd(src_diff, coeff, diff_stride); - else - vp9_fdct32x32(src_diff, coeff, diff_stride); + fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: - scan_order = &vp9_default_scan_orders[TX_16X16]; vp9_fdct16x16(src_diff, coeff, diff_stride); vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, @@ -377,7 +351,6 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, scan_order->scan, scan_order->iscan); break; case TX_8X8: - scan_order = &vp9_default_scan_orders[TX_8X8]; vp9_fdct8x8(src_diff, coeff, diff_stride); vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, @@ -385,7 +358,6 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, scan_order->scan, scan_order->iscan); break; case TX_4X4: - scan_order = &vp9_default_scan_orders[TX_4X4]; x->fwd_txm4x4(src_diff, coeff, diff_stride); vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, @@ -421,17 +393,17 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, } if (!x->skip_recode) - vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { - vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx); + optimize_b(plane, block, plane_bsize, tx_size, x, ctx); } else { ctx->ta[plane][i] = p->eobs[block] > 0; ctx->tl[plane][j] = p->eobs[block] > 0; } if (p->eobs[block]) - *(args->skip_coeff) = 0; + *(args->skip) = 0; if (x->skip_encode || p->eobs[block] == 0) return; @@ -458,8 +430,7 @@ static void 
encode_block(int plane, int block, BLOCK_SIZE plane_bsize, } static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { - struct encode_b_args *const args = arg; - MACROBLOCK *const x = args->x; + MACROBLOCK *const x = (MACROBLOCK *)arg; MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; @@ -469,48 +440,43 @@ static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize, txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i]; - vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); - if (p->eobs[block] == 0) - return; - - xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + if (p->eobs[block] > 0) + xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); } -void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { - MACROBLOCKD *const xd = &x->e_mbd; - struct optimize_ctx ctx; - MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff}; - - vp9_subtract_sby(x, bsize); - if (x->optimize) - optimize_init_b(0, bsize, &arg); - - vp9_foreach_transformed_block_in_plane(xd, bsize, 0, encode_block_pass1, - &arg); +void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) { + vp9_subtract_plane(x, bsize, 0); + vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, + encode_block_pass1, x); } void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff}; - - if (!x->skip_recode) - vp9_subtract_sb(x, bsize); + struct encode_b_args arg = {x, &ctx, &mbmi->skip}; + int plane; + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + if (!x->skip_recode) + vp9_subtract_plane(x, bsize, plane); + + if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { + const struct macroblockd_plane* const pd = &xd->plane[plane]; + const TX_SIZE tx_size = plane ? 
get_uv_tx_size(mbmi) : mbmi->tx_size; + vp9_get_entropy_contexts(bsize, tx_size, pd, + ctx.ta[plane], ctx.tl[plane]); + } - if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { - int i; - for (i = 0; i < MAX_MB_PLANE; ++i) - optimize_init_b(i, bsize, &arg); + vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block, + &arg); } - - vp9_foreach_transformed_block(xd, bsize, encode_block, &arg); } -void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { +static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { struct encode_b_args* const args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; @@ -528,14 +494,16 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, uint8_t *src, *dst; int16_t *src_diff; uint16_t *eob = &p->eobs[block]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; int i, j; txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); - dst = &pd->dst.buf[4 * (j * pd->dst.stride + i)]; - src = &p->src.buf[4 * (j * p->src.stride + i)]; + dst = &pd->dst.buf[4 * (j * dst_stride + i)]; + src = &p->src.buf[4 * (j * src_stride + i)]; src_diff = &p->src_diff[4 * (j * diff_stride + i)]; // if (x->optimize) - // vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx); + // optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx); switch (tx_size) { case TX_32X32: @@ -543,22 +511,19 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode, x->skip_encode ? src : dst, - x->skip_encode ? p->src.stride : pd->dst.stride, - dst, pd->dst.stride, i, j, plane); + x->skip_encode ? src_stride : dst_stride, + dst, dst_stride, i, j, plane); if (!x->skip_recode) { vp9_subtract_block(32, 32, src_diff, diff_stride, - src, p->src.stride, dst, pd->dst.stride); - if (x->use_lp32x32fdct) - vp9_fdct32x32_rd(src_diff, coeff, diff_stride); - else - vp9_fdct32x32(src_diff, coeff, diff_stride); + src, src_stride, dst, dst_stride); + fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan_order->scan, scan_order->iscan); } if (!x->skip_encode && *eob) - vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob); + vp9_idct32x32_add(dqcoeff, dst, dst_stride, *eob); break; case TX_16X16: tx_type = get_tx_type_16x16(pd->plane_type, xd); @@ -566,19 +531,19 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode, x->skip_encode ? src : dst, - x->skip_encode ? p->src.stride : pd->dst.stride, - dst, pd->dst.stride, i, j, plane); + x->skip_encode ? 
src_stride : dst_stride, + dst, dst_stride, i, j, plane); if (!x->skip_recode) { vp9_subtract_block(16, 16, src_diff, diff_stride, - src, p->src.stride, dst, pd->dst.stride); - vp9_fht16x16(tx_type, src_diff, coeff, diff_stride); + src, src_stride, dst, dst_stride); + vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan_order->scan, scan_order->iscan); } if (!x->skip_encode && *eob) - vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob); + vp9_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob); break; case TX_8X8: tx_type = get_tx_type_8x8(pd->plane_type, xd); @@ -586,19 +551,19 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode, x->skip_encode ? src : dst, - x->skip_encode ? p->src.stride : pd->dst.stride, - dst, pd->dst.stride, i, j, plane); + x->skip_encode ? src_stride : dst_stride, + dst, dst_stride, i, j, plane); if (!x->skip_recode) { vp9_subtract_block(8, 8, src_diff, diff_stride, - src, p->src.stride, dst, pd->dst.stride); - vp9_fht8x8(tx_type, src_diff, coeff, diff_stride); + src, src_stride, dst, dst_stride); + vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan_order->scan, scan_order->iscan); } if (!x->skip_encode && *eob) - vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob); + vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob); break; case TX_4X4: tx_type = get_tx_type_4x4(pd->plane_type, xd, block); @@ -610,14 +575,14 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode, x->skip_encode ? src : dst, - x->skip_encode ? p->src.stride : pd->dst.stride, - dst, pd->dst.stride, i, j, plane); + x->skip_encode ? src_stride : dst_stride, + dst, dst_stride, i, j, plane); if (!x->skip_recode) { vp9_subtract_block(4, 4, src_diff, diff_stride, - src, p->src.stride, dst, pd->dst.stride); + src, src_stride, dst, dst_stride); if (tx_type != DCT_DCT) - vp9_short_fht4x4(src_diff, coeff, diff_stride, tx_type); + vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); else x->fwd_txm4x4(src_diff, coeff, diff_stride); vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, @@ -631,33 +596,32 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. 
- xd->itxm_add(dqcoeff, dst, pd->dst.stride, *eob); + xd->itxm_add(dqcoeff, dst, dst_stride, *eob); else - vp9_iht4x4_16_add(dqcoeff, dst, pd->dst.stride, tx_type); + vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); } break; default: assert(0); } if (*eob) - *(args->skip_coeff) = 0; + *(args->skip) = 0; } -void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize) { - MACROBLOCKD* const xd = &x->e_mbd; - struct optimize_ctx ctx; - MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff}; - - vp9_foreach_transformed_block_in_plane(xd, bsize, 0, vp9_encode_block_intra, - &arg); +void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + unsigned char *skip) { + struct encode_b_args arg = {x, NULL, skip}; + encode_block_intra(plane, block, plane_bsize, tx_size, &arg); } -void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize) { - MACROBLOCKD* const xd = &x->e_mbd; - struct optimize_ctx ctx; - MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff}; - vp9_foreach_transformed_block_uv(xd, bsize, vp9_encode_block_intra, &arg); + + +void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { + const MACROBLOCKD *const xd = &x->e_mbd; + struct encode_b_args arg = {x, NULL, &xd->mi_8x8[0]->mbmi.skip}; + + vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block_intra, + &arg); } int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) { @@ -668,6 +632,6 @@ int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) { mbmi->tx_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4; - vp9_encode_intra_block_y(x, mbmi->sb_type); + vp9_encode_intra_block_plane(x, mbmi->sb_type, 0); return vp9_get_mb_ss(x->plane[0].src_diff); } diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h index 9f6c9f069..dcf6e8759 100644 --- a/vp9/encoder/vp9_encodemb.h +++ b/vp9/encoder/vp9_encodemb.h @@ -20,32 +20,19 @@ extern "C" { #endif -struct optimize_ctx { - ENTROPY_CONTEXT ta[MAX_MB_PLANE][16]; - ENTROPY_CONTEXT tl[MAX_MB_PLANE][16]; -}; - -struct encode_b_args { - MACROBLOCK *x; - struct optimize_ctx *ctx; - unsigned char *skip_coeff; -}; - void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize); -void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize); -void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg); +void vp9_xform_quant(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size); -void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize); -void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize); -void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); -void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg); +void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + unsigned char *skip); -void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize); -void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred); diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c index af710a8f4..be6abc2a1 100644 --- a/vp9/encoder/vp9_encodemv.c +++ 
b/vp9/encoder/vp9_encodemv.c @@ -224,18 +224,11 @@ void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w, } } -void vp9_build_nmv_cost_table(int *mvjoint, - int *mvcost[2], - const nmv_context* const mvctx, - int usehp, - int mvc_flag_v, - int mvc_flag_h) { - vp9_clear_system_state(); - vp9_cost_tokens(mvjoint, mvctx->joints, vp9_mv_joint_tree); - if (mvc_flag_v) - build_nmv_component_cost_table(mvcost[0], &mvctx->comps[0], usehp); - if (mvc_flag_h) - build_nmv_component_cost_table(mvcost[1], &mvctx->comps[1], usehp); +void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context* ctx, int usehp) { + vp9_cost_tokens(mvjoint, ctx->joints, vp9_mv_joint_tree); + build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], usehp); + build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp); } static void inc_mvs(int_mv mv[2], int_mv ref[2], int is_compound, diff --git a/vp9/encoder/vp9_encodemv.h b/vp9/encoder/vp9_encodemv.h index f0463bbd3..7f997ff37 100644 --- a/vp9/encoder/vp9_encodemv.h +++ b/vp9/encoder/vp9_encodemv.h @@ -25,12 +25,8 @@ void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w); void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref, const nmv_context* mvctx, int usehp); -void vp9_build_nmv_cost_table(int *mvjoint, - int *mvcost[2], - const nmv_context* const mvctx, - int usehp, - int mvc_flag_v, - int mvc_flag_h); +void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context* mvctx, int usehp); void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 153046440..32ed96999 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -49,8 +49,9 @@ #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001) -#define MIN_BOOST 300 -#define KEY_FRAME_BOOST 2000 +#define MIN_KF_BOOST 300 + +#define DISABLE_RC_LONG_TERM_MEM 0 static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) { YV12_BUFFER_CONFIG temp = *a; @@ -64,7 +65,7 @@ static int select_cq_level(int qindex) { double target_q = (vp9_convert_qindex_to_q(qindex) * 0.5847) + 1.0; - for (i = 0; i < QINDEX_RANGE; i++) { + for (i = 0; i < QINDEX_RANGE; ++i) { if (target_q <= vp9_convert_qindex_to_q(i)) { ret_val = i; break; @@ -105,12 +106,12 @@ static int lookup_next_frame_stats(const struct twopass_rc *p, } -// Read frame stats at an offset from the current position +// Read frame stats at an offset from the current position. static int read_frame_stats(const struct twopass_rc *p, FIRSTPASS_STATS *frame_stats, int offset) { const FIRSTPASS_STATS *fps_ptr = p->stats_in; - // Check legality of offset + // Check legality of offset. if (offset >= 0) { if (&fps_ptr[offset] >= p->stats_in_end) return EOF; @@ -132,9 +133,9 @@ static int input_stats(struct twopass_rc *p, FIRSTPASS_STATS *fps) { return 1; } -static void output_stats(const VP9_COMP *cpi, +static void output_stats(const VP9_COMP *cpi, struct vpx_codec_pkt_list *pktlist, - FIRSTPASS_STATS *stats) { + FIRSTPASS_STATS *stats) { struct vpx_codec_cx_pkt pkt; pkt.kind = VPX_CODEC_STATS_PKT; pkt.data.twopass_stats.buf = stats; @@ -143,7 +144,6 @@ static void output_stats(const VP9_COMP *cpi, // TEMP debug code #if OUTPUT_FPF - { FILE *fpfile; fpfile = fopen("firstpass.stt", "a"); @@ -265,9 +265,9 @@ static void avg_stats(FIRSTPASS_STATS *section) { // Calculate a modified Error used in distributing bits between easier and // harder frames. 
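/*
 * Illustrative sketch (not part of the patch) of the rescaling performed by
 * calculate_modified_err() below: each frame's ssim-weighted prediction error
 * is re-expressed as a power of its ratio to the section average, which pulls
 * very easy and very hard frames back towards the mean before bits are
 * distributed.  The exponent (assumed here to derive from the two-pass VBR
 * bias setting) and the final clamp to the modified_error_min/max limits set
 * up in vp9_init_second_pass() are assumptions; only the shape is shown.
 */
#include <math.h>

static double sketch_modified_err(double frame_err, double section_avg_err,
                                  double vbr_bias_pow,
                                  double err_min, double err_max) {
  /* Same guard against division by zero as DOUBLE_DIVIDE_CHECK(). */
  const double safe_avg = (section_avg_err < 0.0) ? section_avg_err - 0.000001
                                                  : section_avg_err + 0.000001;
  double modified = section_avg_err * pow(frame_err / safe_avg, vbr_bias_pow);
  if (modified < err_min) modified = err_min;
  if (modified > err_max) modified = err_max;
  return modified;
}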
-static double calculate_modified_err(VP9_COMP *cpi, - FIRSTPASS_STATS *this_frame) { - struct twopass_rc *const twopass = &cpi->twopass; +static double calculate_modified_err(const VP9_COMP *cpi, + const FIRSTPASS_STATS *this_frame) { + const struct twopass_rc *const twopass = &cpi->twopass; const FIRSTPASS_STATS *const stats = &twopass->total_stats; const double av_err = stats->ssim_weighted_pred_err / stats->count; double modified_error = av_err * pow(this_frame->ssim_weighted_pred_err / @@ -336,7 +336,7 @@ static double simple_weight(const YV12_BUFFER_CONFIG *buf) { } // This function returns the maximum target rate per frame. -static int frame_max_bits(VP9_COMP *cpi) { +static int frame_max_bits(const VP9_COMP *cpi) { int64_t max_bits = ((int64_t)cpi->rc.av_per_frame_bandwidth * (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100; @@ -376,7 +376,6 @@ static unsigned int zz_motion_search(const VP9_COMP *cpi, const MACROBLOCK *x) { const int src_stride = x->plane[0].src.stride; const uint8_t *const ref = xd->plane[0].pre[0].buf; const int ref_stride = xd->plane[0].pre[0].stride; - unsigned int sse; vp9_variance_fn_t fn = get_block_variance_fn(xd->mi_8x8[0]->mbmi.sb_type); fn(src, src_stride, ref, ref_stride, &sse); @@ -397,18 +396,18 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int new_mv_mode_penalty = 256; const int quart_frm = MIN(cpi->common.width, cpi->common.height); - // refine the motion search range accroding to the frame dimension - // for first pass test + // Refine the motion search range according to the frame dimension + // for first pass test. while ((quart_frm << sr) < MAX_FULL_PEL_VAL) - sr++; + ++sr; step_param += sr; further_steps -= sr; - // override the default variance function to use MSE + // Override the default variance function to use MSE. v_fn_ptr.vf = get_block_variance_fn(bsize); - // Initial step/diamond search centred on best mv + // Center the initial step/diamond search on best mv. tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv, step_param, x->sadperbit16, &num00, &v_fn_ptr, @@ -423,15 +422,15 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, best_mv->col = tmp_mv.col; } - // Further step/diamond searches as necessary + // Carry out further step/diamond searches as necessary. n = num00; num00 = 0; while (n < further_steps) { - n++; + ++n; if (num00) { - num00--; + --num00; } else { tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv, step_param + n, x->sadperbit16, @@ -468,7 +467,7 @@ void vp9_first_pass(VP9_COMP *cpi) { TileInfo tile; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; - PICK_MODE_CONTEXT *ctx = &x->sb64_context; + const PICK_MODE_CONTEXT *ctx = &x->sb64_context; int i; int recon_yoffset, recon_uvoffset; @@ -496,14 +495,14 @@ void vp9_first_pass(VP9_COMP *cpi) { struct twopass_rc *const twopass = &cpi->twopass; const MV zero_mv = {0, 0}; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); vp9_setup_src_planes(x, cpi->Source, 0, 0); setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL); setup_dst_planes(xd, new_yv12, 0, 0); xd->mi_8x8 = cm->mi_grid_visible; - xd->mi_8x8[0] = cm->mi; // required for vp9_frame_init_quantizer + xd->mi_8x8[0] = cm->mi; vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); @@ -520,34 +519,32 @@ void vp9_first_pass(VP9_COMP *cpi) { vp9_init_mv_probs(cm); vp9_initialize_rd_consts(cpi); - // tiling is ignored in the first pass + // Tiling is ignored in the first pass. 
vp9_tile_init(&tile, cm, 0, 0); - // for each macroblock row in image - for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { int_mv best_ref_mv; best_ref_mv.as_int = 0; - // reset above block coeffs + // Reset above block coeffs. xd->up_available = (mb_row != 0); recon_yoffset = (mb_row * recon_y_stride * 16); recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height); // Set up limit values for motion vectors to prevent them extending - // outside the UMV borders + // outside the UMV borders. x->mv_row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16); x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16; - // for each macroblock col in image - for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { int this_error; const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); double error_weight = 1.0; const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset; @@ -565,15 +562,15 @@ void vp9_first_pass(VP9_COMP *cpi) { error_weight = vp9_vaq_inv_q_ratio(energy); } - // do intra 16x16 prediction + // Do intra 16x16 prediction. this_error = vp9_encode_intra(x, use_dc_pred); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); // __asm emms; - this_error *= error_weight; + vp9_clear_system_state(); + this_error = (int)(this_error * error_weight); } - // intrapenalty below deals with situations where the intra and inter - // error scores are very low (eg a plain black frame). + // Intrapenalty below deals with situations where the intra and inter + // error scores are very low (e.g. a plain black frame). // We do not have special cases in first pass for 0,0 and nearest etc so // all inter modes carry an overhead cost estimate for the mv. // When the error score is very low this causes us to pick all or lots of @@ -581,7 +578,7 @@ void vp9_first_pass(VP9_COMP *cpi) { // This penalty adds a cost matching that of a 0,0 mv to the intra case. this_error += intrapenalty; - // Cumulative intra error total + // Accumulate the intra error. intra_error += (int64_t)this_error; // Set up limit values for motion vectors to prevent them extending @@ -589,23 +586,23 @@ void vp9_first_pass(VP9_COMP *cpi) { x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; - // Other than for the first frame do a motion search + // Other than for the first frame do a motion search. if (cm->current_video_frame > 0) { int tmp_err, motion_error; int_mv mv, tmp_mv; xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset; motion_error = zz_motion_search(cpi, x); - // Simple 0,0 motion with no mv overhead + // Assume 0,0 motion with no mv overhead. mv.as_int = tmp_mv.as_int = 0; // Test last reference frame using the previous best mv as the - // starting point (best reference) for the search + // starting point (best reference) for the search. 
first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv, &motion_error); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); // __asm emms; - motion_error *= error_weight; + vp9_clear_system_state(); + motion_error = (int)(motion_error * error_weight); } // If the current best reference mv is not centered on 0,0 then do a 0,0 @@ -615,8 +612,8 @@ void vp9_first_pass(VP9_COMP *cpi) { first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, &tmp_err); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); // __asm emms; - tmp_err *= error_weight; + vp9_clear_system_state(); + tmp_err = (int)(tmp_err * error_weight); } if (tmp_err < motion_error) { @@ -625,9 +622,9 @@ void vp9_first_pass(VP9_COMP *cpi) { } } - // Experimental search in an older reference frame + // Search in an older reference frame. if (cm->current_video_frame > 1) { - // Simple 0,0 motion with no mv overhead + // Assume 0,0 motion with no mv overhead. int gf_motion_error; xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; @@ -636,22 +633,22 @@ void vp9_first_pass(VP9_COMP *cpi) { first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, &gf_motion_error); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); // __asm emms; - gf_motion_error *= error_weight; + vp9_clear_system_state(); + gf_motion_error = (int)(gf_motion_error * error_weight); } if (gf_motion_error < motion_error && gf_motion_error < this_error) - second_ref_count++; + ++second_ref_count; - // Reset to last frame as reference buffer + // Reset to last frame as reference buffer. xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset; xd->plane[1].pre[0].buf = lst_yv12->u_buffer + recon_uvoffset; xd->plane[2].pre[0].buf = lst_yv12->v_buffer + recon_uvoffset; - // In accumulating a score for the older reference frame - // take the best of the motion predicted score and - // the intra coded error (just as will be done for) - // accumulation of "coded_error" for the last frame. + // In accumulating a score for the older reference frame take the + // best of the motion predicted score and the intra coded error + // (just as will be done for) accumulation of "coded_error" for + // the last frame. if (gf_motion_error < this_error) sr_coded_error += gf_motion_error; else @@ -659,17 +656,16 @@ void vp9_first_pass(VP9_COMP *cpi) { } else { sr_coded_error += motion_error; } - /* Intra assumed best */ + // Start by assuming that intra mode is best. best_ref_mv.as_int = 0; if (motion_error <= this_error) { - // Keep a count of cases where the inter and intra were - // very close and very low. This helps with scene cut - // detection for example in cropped clips with black bars - // at the sides or top and bottom. + // Keep a count of cases where the inter and intra were very close + // and very low. This helps with scene cut detection for example in + // cropped clips with black bars at the sides or top and bottom. 
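/*
 * Illustrative sketch (not part of the patch): the scene-cut helper described
 * in the comment above, and applied inline just below, counts a block as
 * "neutral" when its inter error is within roughly 10% of the intra error
 * (after removing the mv overhead penalty) and the intra error itself is
 * very small.
 */
static int sketch_is_neutral_block(int this_error, int motion_error,
                                   int intrapenalty) {
  return ((this_error - intrapenalty) * 9 <= motion_error * 10) &&
         (this_error < 2 * intrapenalty);
}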
if (((this_error - intrapenalty) * 9 <= motion_error * 10) && this_error < 2 * intrapenalty) - neutral_count++; + ++neutral_count; mv.as_mv.row *= 8; mv.as_mv.col *= 8; @@ -679,50 +675,49 @@ void vp9_first_pass(VP9_COMP *cpi) { xd->mi_8x8[0]->mbmi.ref_frame[0] = LAST_FRAME; xd->mi_8x8[0]->mbmi.ref_frame[1] = NONE; vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize); - vp9_encode_sby(x, bsize); + vp9_encode_sby_pass1(x, bsize); sum_mvr += mv.as_mv.row; sum_mvr_abs += abs(mv.as_mv.row); sum_mvc += mv.as_mv.col; sum_mvc_abs += abs(mv.as_mv.col); sum_mvrs += mv.as_mv.row * mv.as_mv.row; sum_mvcs += mv.as_mv.col * mv.as_mv.col; - intercount++; + ++intercount; best_ref_mv.as_int = mv.as_int; - // Was the vector non-zero if (mv.as_int) { - mvcount++; + ++mvcount; - // Was it different from the last non zero vector + // Non-zero vector, was it different from the last non zero vector? if (mv.as_int != lastmv_as_int) - new_mv_count++; + ++new_mv_count; lastmv_as_int = mv.as_int; - // Does the Row vector point inwards or outwards + // Does the row vector point inwards or outwards? if (mb_row < cm->mb_rows / 2) { if (mv.as_mv.row > 0) - sum_in_vectors--; + --sum_in_vectors; else if (mv.as_mv.row < 0) - sum_in_vectors++; + ++sum_in_vectors; } else if (mb_row > cm->mb_rows / 2) { if (mv.as_mv.row > 0) - sum_in_vectors++; + ++sum_in_vectors; else if (mv.as_mv.row < 0) - sum_in_vectors--; + --sum_in_vectors; } - // Does the Row vector point inwards or outwards + // Does the col vector point inwards or outwards? if (mb_col < cm->mb_cols / 2) { if (mv.as_mv.col > 0) - sum_in_vectors--; + --sum_in_vectors; else if (mv.as_mv.col < 0) - sum_in_vectors++; + ++sum_in_vectors; } else if (mb_col > cm->mb_cols / 2) { if (mv.as_mv.col > 0) - sum_in_vectors++; + ++sum_in_vectors; else if (mv.as_mv.col < 0) - sum_in_vectors--; + --sum_in_vectors; } } } @@ -731,7 +726,7 @@ void vp9_first_pass(VP9_COMP *cpi) { } coded_error += (int64_t)this_error; - // adjust to the next column of macroblocks + // Adjust to the next column of MBs. x->plane[0].src.buf += 16; x->plane[1].src.buf += uv_mb_height; x->plane[2].src.buf += uv_mb_height; @@ -740,24 +735,24 @@ void vp9_first_pass(VP9_COMP *cpi) { recon_uvoffset += uv_mb_height; } - // adjust to the next row of mbs + // Adjust to the next row of MBs. x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols; x->plane[1].src.buf += uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols; x->plane[2].src.buf += uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); } - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); { FIRSTPASS_STATS fps; fps.frame = cm->current_video_frame; - fps.intra_error = intra_error >> 8; - fps.coded_error = coded_error >> 8; - fps.sr_coded_error = sr_coded_error >> 8; + fps.intra_error = (double)(intra_error >> 8); + fps.coded_error = (double)(coded_error >> 8); + fps.sr_coded_error = (double)(sr_coded_error >> 8); fps.ssim_weighted_pred_err = fps.coded_error * simple_weight(cpi->Source); fps.count = 1.0; fps.pcnt_inter = (double)intercount / cm->MBs; @@ -791,14 +786,14 @@ void vp9_first_pass(VP9_COMP *cpi) { // cpi->source_time_stamp. fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start); - // don't want to do output stats with a stack variable! + // Don't want to do output stats with a stack variable! 
twopass->this_frame_stats = fps; output_stats(cpi, cpi->output_pkt_list, &twopass->this_frame_stats); accumulate_stats(&twopass->total_stats, &fps); } // Copy the previous Last Frame back into gf and and arf buffers if - // the prediction is good enough... but also dont allow it to lag too far + // the prediction is good enough... but also don't allow it to lag too far. if ((twopass->sr_update_lag > 3) || ((cm->current_video_frame > 0) && (twopass->this_frame_stats.pcnt_inter > 0.20) && @@ -807,9 +802,9 @@ void vp9_first_pass(VP9_COMP *cpi) { vp8_yv12_copy_frame(lst_yv12, gld_yv12); twopass->sr_update_lag = 1; } else { - twopass->sr_update_lag++; + ++twopass->sr_update_lag; } - // swap frame pointers so last frame refers to the frame we just compressed + // Swap frame pointers so last frame refers to the frame we just compressed. swap_yv12(lst_yv12, new_yv12); vp9_extend_frame_borders(lst_yv12, cm->subsampling_x, cm->subsampling_y); @@ -819,7 +814,7 @@ void vp9_first_pass(VP9_COMP *cpi) { if (cm->current_video_frame == 0) vp8_yv12_copy_frame(lst_yv12, gld_yv12); - // use this to see what the first pass reconstruction looks like + // Use this to see what the first pass reconstruction looks like. if (0) { char filename[512]; FILE *recon_file; @@ -835,15 +830,11 @@ void vp9_first_pass(VP9_COMP *cpi) { fclose(recon_file); } - cm->current_video_frame++; + ++cm->current_video_frame; } -// Estimate a cost per mb attributable to overheads such as the coding of -// modes and motion vectors. -// Currently simplistic in its assumptions for testing. -// - - +// Estimate a cost per mb attributable to overheads such as the coding of modes +// and motion vectors. This currently makes simplistic assumptions for testing. static double bitcost(double prob) { return -(log(prob) / log(2.0)); } @@ -866,18 +857,17 @@ static int64_t estimate_modemvcost(VP9_COMP *cpi, motion_cost = bitcost(av_pct_motion); intra_cost = bitcost(av_intra); - // Estimate of extra bits per mv overhead for mbs - // << 9 is the normalization to the (bits * 512) used in vp9_rc_bits_per_mb + // Estimate the number of extra bits per mv overhead for mbs. We shift (<< 9) + // to match the scaling of number of bits by 512. mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9; - // Crude estimate of overhead cost from modes - // << 9 is the normalization to (bits * 512) used in vp9_rc_bits_per_mb + // Produce a crude estimate of the overhead cost from modes. We shift (<< 9) + // to match the scaling of number of bits by 512. mode_cost = (int)((((av_pct_inter - av_pct_motion) * zz_cost) + (av_pct_motion * motion_cost) + (av_intra * intra_cost)) * cpi->common.MBs) << 9; - // return mv_cost + mode_cost; // TODO(paulwilkins): Fix overhead costs for extended Q range. #endif return 0; @@ -894,19 +884,19 @@ static double calc_correction_factor(double err_per_mb, const double power_term = MIN(vp9_convert_qindex_to_q(q) * 0.0125 + pt_low, pt_high); - // Calculate correction factor + // Calculate correction factor. 
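/*
 * Illustrative sketch (not part of the patch) of how the correction factor
 * computed here drives the max-Q search in vp9_twopass_worst_quality() below:
 * q is walked up from the best allowed quality until the predicted bits per
 * macroblock at that q fit the target.  bits_per_mb stands in for
 * vp9_rc_bits_per_mb(), real_q stands in for vp9_convert_qindex_to_q(), and
 * the err_per_mb / err_divisor relationship is assumed from the call that
 * passes ERR_DIVISOR alongside err_per_mb.
 */
#include <math.h>

typedef int (*sketch_bits_per_mb_fn)(int q_index, double correction_factor);
typedef double (*sketch_qindex_to_q_fn)(int q_index);

static int sketch_pick_worst_quality(int best_q, int worst_q,
                                     int target_norm_bits_per_mb,
                                     double err_per_mb, double err_divisor,
                                     sketch_qindex_to_q_fn real_q,
                                     sketch_bits_per_mb_fn bits_per_mb) {
  int q;
  for (q = best_q; q < worst_q; ++q) {
    /* Error-based correction factor, clamped as in calc_correction_factor. */
    const double error_term = err_per_mb / err_divisor;
    double power_term = real_q(q) * 0.0125 + 0.5;
    double factor;
    if (power_term > 0.90) power_term = 0.90;
    factor = pow(error_term, power_term);
    if (factor < 0.05) factor = 0.05;
    if (factor > 5.0) factor = 5.0;

    if (bits_per_mb(q, factor) <= target_norm_bits_per_mb)
      break;
  }
  return q;
}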
if (power_term < 1.0) assert(error_term >= 0.0); return fclamp(pow(error_term, power_term), 0.05, 5.0); } -static int estimate_max_q(VP9_COMP *cpi, FIRSTPASS_STATS *fpstats, - int section_target_bandwitdh) { +int vp9_twopass_worst_quality(VP9_COMP *cpi, FIRSTPASS_STATS *fpstats, + int section_target_bandwitdh) { int q; const int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb; - RATE_CONTROL *const rc = &cpi->rc; + const RATE_CONTROL *const rc = &cpi->rc; const double section_err = fpstats->coded_error / fpstats->count; const double err_per_mb = section_err / num_mbs; @@ -920,7 +910,7 @@ static int estimate_max_q(VP9_COMP *cpi, FIRSTPASS_STATS *fpstats, // Try and pick a max Q that will be high enough to encode the // content at the given rate. - for (q = rc->best_quality; q < rc->worst_quality; q++) { + for (q = rc->best_quality; q < rc->worst_quality; ++q) { const double err_correction_factor = calc_correction_factor(err_per_mb, ERR_DIVISOR, 0.5, 0.90, q); const int bits_per_mb_at_this_q = vp9_rc_bits_per_mb(INTER_FRAME, q, @@ -936,58 +926,6 @@ static int estimate_max_q(VP9_COMP *cpi, FIRSTPASS_STATS *fpstats, return q; } -// For cq mode estimate a cq level that matches the observed -// complexity and data rate. -static int estimate_cq(VP9_COMP *cpi, - FIRSTPASS_STATS *fpstats, - int section_target_bandwitdh) { - int q; - int num_mbs = cpi->common.MBs; - int target_norm_bits_per_mb; - - double section_err = (fpstats->coded_error / fpstats->count); - double err_per_mb = section_err / num_mbs; - double err_correction_factor; - double clip_iiratio; - double clip_iifactor; - - target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) - ? (512 * section_target_bandwitdh) / num_mbs - : 512 * (section_target_bandwitdh / num_mbs); - - - // II ratio correction factor for clip as a whole - clip_iiratio = cpi->twopass.total_stats.intra_error / - DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error); - clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025); - if (clip_iifactor < 0.80) - clip_iifactor = 0.80; - - // Try and pick a Q that can encode the content at the given rate. - for (q = 0; q < MAXQ; q++) { - int bits_per_mb_at_this_q; - - // Error per MB based correction factor - err_correction_factor = - calc_correction_factor(err_per_mb, 100.0, 0.5, 0.90, q) * clip_iifactor; - - bits_per_mb_at_this_q = - vp9_rc_bits_per_mb(INTER_FRAME, q, err_correction_factor); - - if (bits_per_mb_at_this_q <= target_norm_bits_per_mb) - break; - } - - // Clip value to range "best allowed to (worst allowed - 1)" - q = select_cq_level(q); - if (q >= cpi->rc.worst_quality) - q = cpi->rc.worst_quality - 1; - if (q < cpi->rc.best_quality) - q = cpi->rc.best_quality; - - return q; -} - extern void vp9_new_framerate(VP9_COMP *cpi, double framerate); void vp9_init_second_pass(VP9_COMP *cpi) { @@ -1005,11 +943,11 @@ void vp9_init_second_pass(VP9_COMP *cpi) { twopass->total_stats = *twopass->stats_in_end; twopass->total_left_stats = twopass->total_stats; - // each frame can have a different duration, as the frame rate in the source - // isn't guaranteed to be constant. The frame rate prior to the first frame - // encoded in the second pass is a guess. However the sum duration is not. - // Its calculated based on the actual durations of all frames from the first - // pass. + // Each frame can have a different duration, as the frame rate in the source + // isn't guaranteed to be constant. The frame rate prior to the first frame + // encoded in the second pass is a guess. However, the sum duration is not. 
+ // It is calculated based on the actual durations of all frames from the + // first pass. vp9_new_framerate(cpi, 10000000.0 * twopass->total_stats.count / twopass->total_stats.duration); @@ -1020,18 +958,18 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // Calculate a minimum intra value to be used in determining the IIratio // scores used in the second pass. We have this minimum to make sure // that clips that are static but "low complexity" in the intra domain - // are still boosted appropriately for KF/GF/ARF + // are still boosted appropriately for KF/GF/ARF. twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs; twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs; - // This variable monitors how far behind the second ref update is lagging + // This variable monitors how far behind the second ref update is lagging. twopass->sr_update_lag = 1; // Scan the first pass file and calculate an average Intra / Inter error score // ratio for the sequence. { double sum_iiratio = 0.0; - start_pos = twopass->stats_in; // Note the starting "file" position. + start_pos = twopass->stats_in; while (input_stats(twopass, &this_frame) != EOF) { const double iiratio = this_frame.intra_error / @@ -1042,7 +980,6 @@ void vp9_init_second_pass(VP9_COMP *cpi) { twopass->avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)twopass->total_stats.count); - // Reset file position reset_fpf_position(twopass, start_pos); } @@ -1052,7 +989,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) { double av_error = twopass->total_stats.ssim_weighted_pred_err / DOUBLE_DIVIDE_CHECK(twopass->total_stats.count); - start_pos = twopass->stats_in; // Note starting "file" position + start_pos = twopass->stats_in; twopass->modified_error_total = 0.0; twopass->modified_error_min = @@ -1073,8 +1010,8 @@ void vp9_init_second_pass(VP9_COMP *cpi) { void vp9_end_second_pass(VP9_COMP *cpi) { } -// This function gives and estimate of how badly we believe -// the prediction quality is decaying from frame to frame. +// This function gives an estimate of how badly we believe the prediction +// quality is decaying from frame to frame. static double get_prediction_decay_rate(const VP9_COMMON *cm, const FIRSTPASS_STATS *next_frame) { // Look at the observed drop in prediction quality between the last frame @@ -1091,12 +1028,10 @@ static double get_prediction_decay_rate(const VP9_COMMON *cm, // Function to test for a condition where a complex transition is followed // by a static section. For example in slide shows where there is a fade // between slides. This is to help with more optimal kf and gf positioning. -static int detect_transition_to_still( - VP9_COMP *cpi, - int frame_interval, - int still_interval, - double loop_decay_rate, - double last_decay_rate) { +static int detect_transition_to_still(VP9_COMP *cpi, int frame_interval, + int still_interval, + double loop_decay_rate, + double last_decay_rate) { int trans_to_still = 0; // Break clause to detect very still sections after motion @@ -1109,9 +1044,8 @@ static int detect_transition_to_still( FIRSTPASS_STATS *position = cpi->twopass.stats_in; FIRSTPASS_STATS tmp_next_frame; - // Look ahead a few frames to see if static condition - // persists... - for (j = 0; j < still_interval; j++) { + // Look ahead a few frames to see if static condition persists... 
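/*
 * Illustrative sketch (not part of the patch) of the look-ahead below: the
 * next still_interval frames are scanned and a transition to a still section
 * is only reported if every one of them keeps a very high prediction decay
 * rate (i.e. almost no motion).  The threshold parameter is a hypothetical
 * stand-in; its actual value is not shown in this hunk.
 */
static int sketch_is_still_section(const double *next_decay_rates,
                                   int still_interval,
                                   double still_threshold) {
  int j;
  for (j = 0; j < still_interval; ++j) {
    if (next_decay_rates[j] < still_threshold)
      return 0;  /* Motion resumes, so this is not a static section. */
  }
  return 1;  /* The static condition persisted for the whole window. */
}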
+ for (j = 0; j < still_interval; ++j) { if (EOF == input_stats(&cpi->twopass, &tmp_next_frame)) break; @@ -1121,7 +1055,7 @@ static int detect_transition_to_still( reset_fpf_position(&cpi->twopass, position); - // Only if it does do we signal a transition to still + // Only if it does do we signal a transition to still. if (j == still_interval) trans_to_still = 1; } @@ -1131,7 +1065,7 @@ static int detect_transition_to_still( // This function detects a flash through the high relative pcnt_second_ref // score in the frame following a flash frame. The offset passed in should -// reflect this +// reflect this. static int detect_flash(const struct twopass_rc *twopass, int offset) { FIRSTPASS_STATS next_frame; @@ -1144,7 +1078,7 @@ static int detect_flash(const struct twopass_rc *twopass, int offset) { // brief break in prediction (such as a flash) but subsequent frames // are reasonably well predicted by an earlier (pre flash) frame. // The recovery after a flash is indicated by a high pcnt_second_ref - // comapred to pcnt_inter. + // compared to pcnt_inter. if (next_frame.pcnt_second_ref > next_frame.pcnt_inter && next_frame.pcnt_second_ref >= 0.5) flash_detected = 1; @@ -1153,7 +1087,7 @@ static int detect_flash(const struct twopass_rc *twopass, int offset) { return flash_detected; } -// Update the motion related elements to the GF arf boost calculation +// Update the motion related elements to the GF arf boost calculation. static void accumulate_frame_motion_stats( FIRSTPASS_STATS *this_frame, double *this_frame_mv_in_out, @@ -1165,13 +1099,13 @@ static void accumulate_frame_motion_stats( // Accumulate motion stats. motion_pct = this_frame->pcnt_motion; - // Accumulate Motion In/Out of frame stats + // Accumulate Motion In/Out of frame stats. *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct; *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct; *abs_mv_in_out_accumulator += fabs(this_frame->mv_in_out_count * motion_pct); // Accumulate a measure of how uniform (or conversely how random) - // the motion field is. (A ratio of absmv / mv) + // the motion field is (a ratio of absmv / mv). if (motion_pct > 0.05) { const double this_frame_mvr_ratio = fabs(this_frame->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr)); @@ -1194,7 +1128,7 @@ static double calc_frame_boost(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame, double this_frame_mv_in_out) { double frame_boost; - // Underlying boost factor is based on inter intra error ratio + // Underlying boost factor is based on inter intra error ratio. if (this_frame->intra_error > cpi->twopass.gf_intra_err_min) frame_boost = (IIFACTOR * this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)); @@ -1202,13 +1136,12 @@ static double calc_frame_boost(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame, frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)); - // Increase boost for frames where new data coming into frame - // (eg zoom out). Slightly reduce boost if there is a net balance - // of motion out of the frame (zoom in). - // The range for this_frame_mv_in_out is -1.0 to +1.0 + // Increase boost for frames where new data coming into frame (e.g. zoom out). + // Slightly reduce boost if there is a net balance of motion out of the frame + // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0. 
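/*
 * Illustrative sketch (not part of the patch) pulling together the boost
 * computed above and the motion-in/out adjustment applied just below: the
 * base term is the (floored) intra/inter error ratio, boost is then raised
 * when motion flows into the frame (zoom out) and trimmed, at half strength,
 * when it flows out (zoom in).  ii_factor stands in for IIFACTOR, whose value
 * is not shown in this hunk, and any final cap applied by the caller is
 * omitted.
 */
static double sketch_frame_boost(double intra_error, double coded_error,
                                 double gf_intra_err_min,
                                 double mv_in_out, double ii_factor) {
  const double intra = (intra_error > gf_intra_err_min) ? intra_error
                                                        : gf_intra_err_min;
  double boost = ii_factor * intra / (coded_error + 0.000001);
  if (mv_in_out > 0.0)
    boost += boost * (mv_in_out * 2.0);  /* New content entering the frame. */
  else
    boost += boost * (mv_in_out / 2.0);  /* Halved effect when zooming in. */
  return boost;
}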
if (this_frame_mv_in_out > 0.0) frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); - // In extreme case boost is halved + // In the extreme case the boost is halved. else frame_boost += frame_boost * (this_frame_mv_in_out / 2.0); @@ -1230,12 +1163,12 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int arf_boost; int flash_detected = 0; - // Search forward from the proposed arf/next gf position - for (i = 0; i < f_frames; i++) { + // Search forward from the proposed arf/next gf position. + for (i = 0; i < f_frames; ++i) { if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF) break; - // Update the motion related elements to the boost calculation + // Update the motion related elements to the boost calculation. accumulate_frame_motion_stats(&this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, @@ -1246,7 +1179,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, flash_detected = detect_flash(twopass, i + offset) || detect_flash(twopass, i + offset + 1); - // Cumulative effect of prediction quality decay + // Accumulate the effect of prediction quality decay. if (!flash_detected) { decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR @@ -1259,7 +1192,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, *f_boost = (int)boost_score; - // Reset for backward looking loop + // Reset for backward looking loop. boost_score = 0.0; mv_ratio_accumulator = 0.0; decay_accumulator = 1.0; @@ -1267,12 +1200,12 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, mv_in_out_accumulator = 0.0; abs_mv_in_out_accumulator = 0.0; - // Search backward towards last gf position - for (i = -1; i >= -b_frames; i--) { + // Search backward towards last gf position. + for (i = -1; i >= -b_frames; --i) { if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF) break; - // Update the motion related elements to the boost calculation + // Update the motion related elements to the boost calculation. accumulate_frame_motion_stats(&this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, @@ -1283,7 +1216,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, flash_detected = detect_flash(twopass, i + offset) || detect_flash(twopass, i + offset + 1); - // Cumulative effect of prediction quality decay + // Cumulative effect of prediction quality decay. if (!flash_detected) { decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR @@ -1333,8 +1266,7 @@ static void schedule_frames(VP9_COMP *cpi, const int start, const int end, return; } - // ARF Group: work out the ARF schedule. - // Mark ARF frames as negative. + // ARF Group: Work out the ARF schedule and mark ARF frames as negative. if (end < 0) { // printf("start:%d end:%d\n", -end, -end); // ARF frame is at the end of the range. @@ -1457,14 +1389,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double decay_accumulator = 1.0; double zero_motion_accumulator = 1.0; - double loop_decay_rate = 1.00; // Starting decay rate + double loop_decay_rate = 1.00; double last_loop_decay_rate = 1.00; double this_frame_mv_in_out = 0.0; double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; double mv_ratio_accumulator_thresh; - int max_bits = frame_max_bits(cpi); // Max for a single frame + const int max_bits = frame_max_bits(cpi); // Max bits for a single frame. 
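/*
 * Illustrative sketch (not part of the patch): frame_max_bits(), used on the
 * line above, limits any single frame to a percentage of the average
 * per-frame bandwidth given by the two_pass_vbrmax_section setting.  The
 * clamp of the result to [0, INT_MAX] is an assumption; only the scaled
 * division is visible in this hunk.
 */
#include <limits.h>
#include <stdint.h>

static int sketch_frame_max_bits(int av_per_frame_bandwidth,
                                 int two_pass_vbrmax_section) {
  int64_t max_bits = ((int64_t)av_per_frame_bandwidth *
                      (int64_t)two_pass_vbrmax_section) / 100;
  if (max_bits < 0)            /* Assumed guard, see note above. */
    max_bits = 0;
  else if (max_bits > INT_MAX)
    max_bits = INT_MAX;
  return (int)max_bits;
}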
unsigned int allow_alt_ref = cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames; @@ -1477,19 +1409,19 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->gf_group_bits = 0; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); start_pos = twopass->stats_in; // Load stats for the current frame. mod_frame_err = calculate_modified_err(cpi, this_frame); - // Note the error of the frame at the start of the group (this will be - // the GF frame error if we code a normal gf + // Note the error of the frame at the start of the group. This will be + // the GF frame error if we code a normal gf. gf_first_frame_err = mod_frame_err; // If this is a key frame or the overlay from a previous arf then - // The error score / cost of this frame has already been accounted for. + // the error score / cost of this frame has already been accounted for. if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active) gf_group_err -= gf_first_frame_err; @@ -1511,9 +1443,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { i = 0; while (i < twopass->static_scene_max_gf_interval && i < rc->frames_to_key) { - i++; // Increment the loop counter + ++i; - // Accumulate error score of frames in this gf group + // Accumulate error score of frames in this gf group. mod_frame_err = calculate_modified_err(cpi, this_frame); gf_group_err += mod_frame_err; @@ -1524,13 +1456,13 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // quality back to an earlier frame is then restored. flash_detected = detect_flash(twopass, 0); - // Update the motion related elements to the boost calculation + // Update the motion related elements to the boost calculation. accumulate_frame_motion_stats(&next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); - // Cumulative effect of prediction quality decay + // Accumulate the effect of prediction quality decay. if (!flash_detected) { last_loop_decay_rate = loop_decay_rate; loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame); @@ -1543,8 +1475,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { next_frame.pcnt_motion; } - // Break clause to detect very still sections after motion - // (for example a static image after a fade or other transition). + // Break clause to detect very still sections after motion. For example, + // a static image after a fade or other transition. if (detect_transition_to_still(cpi, i, 5, loop_decay_rate, last_loop_decay_rate)) { allow_alt_ref = 0; @@ -1552,16 +1484,16 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } } - // Calculate a boost number for this frame + // Calculate a boost number for this frame. boost_score += (decay_accumulator * calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out)); // Break out conditions. if ( - // Break at cpi->max_gf_interval unless almost totally static + // Break at cpi->max_gf_interval unless almost totally static. (i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) || ( - // Don't break out with a very short interval + // Don't break out with a very short interval. 
(i > MIN_GF_INTERVAL) && ((boost_score > 125.0) || (next_frame.pcnt_inter < 0.75)) && (!flash_detected) && @@ -1580,10 +1512,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0); - // Don't allow a gf too near the next kf + // Don't allow a gf too near the next kf. if ((rc->frames_to_key - i) < MIN_GF_INTERVAL) { while (i < (rc->frames_to_key + !rc->next_key_frame_forced)) { - i++; + ++i; if (EOF == input_stats(twopass, this_frame)) break; @@ -1613,20 +1545,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { else rc->baseline_gf_interval = i; - // Should we use the alternate reference frame + // Should we use the alternate reference frame. if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && (i >= MIN_GF_INTERVAL) && - // for real scene cuts (not forced kfs) dont allow arf very near kf. + // For real scene cuts (not forced kfs) don't allow arf very near kf. (rc->next_key_frame_forced || - (i <= (rc->frames_to_key - MIN_GF_INTERVAL))) && - ((next_frame.pcnt_inter > 0.75) || - (next_frame.pcnt_second_ref > 0.5)) && - ((mv_in_out_accumulator / (double)i > -0.2) || - (mv_in_out_accumulator > -2.0)) && - (boost_score > 100)) { - - // Alternative boost calculation for alt ref + (i <= (rc->frames_to_key - MIN_GF_INTERVAL)))) { + // Calculate the boost for alt ref. rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); rc->source_alt_ref_pending = 1; @@ -1688,28 +1614,24 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { #endif #endif - // Calculate the bits to be allocated to the group as a whole - if ((cpi->twopass.kf_group_bits > 0) && - (cpi->twopass.kf_group_error_left > 0)) { - cpi->twopass.gf_group_bits = - (int64_t)(cpi->twopass.kf_group_bits * + // Calculate the bits to be allocated to the group as a whole. + if (twopass->kf_group_bits > 0 && twopass->kf_group_error_left > 0) { + twopass->gf_group_bits = (int64_t)(cpi->twopass.kf_group_bits * (gf_group_err / cpi->twopass.kf_group_error_left)); } else { - cpi->twopass.gf_group_bits = 0; + twopass->gf_group_bits = 0; } - cpi->twopass.gf_group_bits = - (cpi->twopass.gf_group_bits < 0) - ? 0 - : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits) - ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits; + twopass->gf_group_bits = (twopass->gf_group_bits < 0) ? + 0 : (twopass->gf_group_bits > twopass->kf_group_bits) ? + twopass->kf_group_bits : twopass->gf_group_bits; // Clip cpi->twopass.gf_group_bits based on user supplied data rate - // variability limit (cpi->oxcf.two_pass_vbrmax_section) - if (cpi->twopass.gf_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) - cpi->twopass.gf_group_bits = (int64_t)max_bits * rc->baseline_gf_interval; + // variability limit, cpi->oxcf.two_pass_vbrmax_section. + if (twopass->gf_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) + twopass->gf_group_bits = (int64_t)max_bits * rc->baseline_gf_interval; - // Reset the file position - reset_fpf_position(&cpi->twopass, start_pos); + // Reset the file position. + reset_fpf_position(twopass, start_pos); // Assign bits to the arf or gf. for (i = 0; i <= (rc->source_alt_ref_pending && @@ -1720,7 +1642,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int boost = (rc->gfu_boost * gfboost_qadjust(q)) / 100; - // Set max and minimum boost and hence minimum allocation + // Set max and minimum boost and hence minimum allocation. 
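/*
 * Illustrative sketch (not part of the patch) of the allocation performed
 * just below: the gf/arf boost is clamped to a minimum and to a multiple of
 * the group length, the group is expressed as "allocation chunks" of 100 per
 * frame plus the extra boost, and the gf/arf then receives its boost-sized
 * share of the group bits.  The extra chunk counted for an ARF group is an
 * assumption; only the non-ARF branch is visible in this hunk.
 */
#include <stdint.h>

static int sketch_gf_bits(int boost, int baseline_gf_interval, int is_arf,
                          int64_t gf_group_bits) {
  int allocation_chunks;

  /* Clamp boost so the gf/arf always receives at least a minimum share. */
  if (boost < 125) boost = 125;
  if (boost > (baseline_gf_interval + 1) * 200)
    boost = (baseline_gf_interval + 1) * 200;

  allocation_chunks = is_arf
      ? ((baseline_gf_interval + 1) * 100) + (boost - 100)  /* Assumed. */
      : (baseline_gf_interval * 100) + (boost - 100);

  /* Keep boost below 1024 to avoid overflow in the division below. */
  if (boost > 1023) {
    const int divisor = boost >> 10;
    boost /= divisor;
    allocation_chunks /= divisor;
  }

  return (int)((double)boost * (gf_group_bits / (double)allocation_chunks));
}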
boost = clamp(boost, 125, (rc->baseline_gf_interval + 1) * 200); if (rc->source_alt_ref_pending && i == 0) @@ -1728,7 +1650,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { else allocation_chunks = (rc->baseline_gf_interval * 100) + (boost - 100); - // Prevent overflow + // Prevent overflow. if (boost > 1023) { int divisor = boost >> 10; boost /= divisor; @@ -1736,18 +1658,18 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Calculate the number of bits to be spent on the gf or arf based on - // the boost number - gf_bits = (int)((double)boost * (cpi->twopass.gf_group_bits / - (double)allocation_chunks)); + // the boost number. + gf_bits = (int)((double)boost * (twopass->gf_group_bits / + (double)allocation_chunks)); // If the frame that is to be boosted is simpler than the average for // the gf/arf group then use an alternative calculation - // based on the error score of the frame itself + // based on the error score of the frame itself. if (rc->baseline_gf_interval < 1 || mod_frame_err < gf_group_err / (double)rc->baseline_gf_interval) { - double alt_gf_grp_bits = (double)cpi->twopass.kf_group_bits * + double alt_gf_grp_bits = (double)twopass->kf_group_bits * (mod_frame_err * (double)rc->baseline_gf_interval) / - DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left); + DOUBLE_DIVIDE_CHECK(twopass->kf_group_error_left); int alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits / (double)allocation_chunks)); @@ -1758,70 +1680,68 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // If it is harder than other frames in the group make sure it at // least receives an allocation in keeping with its relative error // score, otherwise it may be worse off than an "un-boosted" frame. - int alt_gf_bits = (int)((double)cpi->twopass.kf_group_bits * + int alt_gf_bits = (int)((double)twopass->kf_group_bits * mod_frame_err / - DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left)); + DOUBLE_DIVIDE_CHECK(twopass->kf_group_error_left)); if (alt_gf_bits > gf_bits) gf_bits = alt_gf_bits; } - // Dont allow a negative value for gf_bits + // Don't allow a negative value for gf_bits. if (gf_bits < 0) gf_bits = 0; if (i == 0) { - cpi->twopass.gf_bits = gf_bits; + twopass->gf_bits = gf_bits; } if (i == 1 || (!rc->source_alt_ref_pending && - (cpi->common.frame_type != KEY_FRAME))) { - // Per frame bit target for this frame - rc->per_frame_bandwidth = gf_bits; + cpi->common.frame_type != KEY_FRAME)) { + // Calculate the per frame bit target for this frame. + vp9_rc_set_frame_target(cpi, gf_bits); } } { - // Adjust KF group bits and error remaining - cpi->twopass.kf_group_error_left -= (int64_t)gf_group_err; - cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits; + // Adjust KF group bits and error remaining. + twopass->kf_group_error_left -= (int64_t)gf_group_err; + twopass->kf_group_bits -= twopass->gf_group_bits; - if (cpi->twopass.kf_group_bits < 0) - cpi->twopass.kf_group_bits = 0; + if (twopass->kf_group_bits < 0) + twopass->kf_group_bits = 0; - // If this is an arf update we want to remove the score for the - // overlay frame at the end which will usually be very cheap to code. - // The overlay frame has already in effect been coded so we want to spread - // the remaining bits amoung the other frames/ + // If this is an arf update we want to remove the score for the overlay + // frame at the end which will usually be very cheap to code. 
+ // The overlay frame has already, in effect, been coded so we want to spread + // the remaining bits among the other frames. // For normal GFs remove the score for the GF itself unless this is // also a key frame in which case it has already been accounted for. if (rc->source_alt_ref_pending) { - cpi->twopass.gf_group_error_left = (int64_t)gf_group_err - mod_frame_err; + twopass->gf_group_error_left = (int64_t)(gf_group_err - mod_frame_err); } else if (cpi->common.frame_type != KEY_FRAME) { - cpi->twopass.gf_group_error_left = (int64_t)(gf_group_err + twopass->gf_group_error_left = (int64_t)(gf_group_err - gf_first_frame_err); } else { - cpi->twopass.gf_group_error_left = (int64_t)gf_group_err; + twopass->gf_group_error_left = (int64_t)gf_group_err; } - cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits; + twopass->gf_group_bits -= twopass->gf_bits; - if (cpi->twopass.gf_group_bits < 0) - cpi->twopass.gf_group_bits = 0; + if (twopass->gf_group_bits < 0) + twopass->gf_group_bits = 0; // This condition could fail if there are two kfs very close together - // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the + // despite MIN_GF_INTERVAL and would cause a divide by 0 in the // calculation of alt_extra_bits. if (rc->baseline_gf_interval >= 3) { const int boost = rc->source_alt_ref_pending ? b_boost : rc->gfu_boost; if (boost >= 150) { - int alt_extra_bits; - int pct_extra = (boost - 100) / 50; - pct_extra = (pct_extra > 20) ? 20 : pct_extra; - - alt_extra_bits = (int)((cpi->twopass.gf_group_bits * pct_extra) / 100); - cpi->twopass.gf_group_bits -= alt_extra_bits; + const int pct_extra = MIN(20, (boost - 100) / 50); + const int alt_extra_bits = (int)((twopass->gf_group_bits * pct_extra) / + 100); + twopass->gf_group_bits -= alt_extra_bits; } } } @@ -1830,20 +1750,20 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { FIRSTPASS_STATS sectionstats; zero_stats(§ionstats); - reset_fpf_position(&cpi->twopass, start_pos); + reset_fpf_position(twopass, start_pos); - for (i = 0; i < rc->baseline_gf_interval; i++) { - input_stats(&cpi->twopass, &next_frame); + for (i = 0; i < rc->baseline_gf_interval; ++i) { + input_stats(twopass, &next_frame); accumulate_stats(§ionstats, &next_frame); } avg_stats(§ionstats); - cpi->twopass.section_intra_rating = (int) + twopass->section_intra_rating = (int) (sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error)); - reset_fpf_position(&cpi->twopass, start_pos); + reset_fpf_position(twopass, start_pos); } } @@ -1879,34 +1799,27 @@ static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { cpi->twopass.gf_group_bits = 0; // Per frame bit target for this frame. - cpi->rc.per_frame_bandwidth = target_frame_size; -} - -static int test_for_kf_one_pass(VP9_COMP *cpi) { - // Placeholder function for auto key frame - return 0; + vp9_rc_set_frame_target(cpi, target_frame_size); } static int test_candidate_kf(VP9_COMP *cpi, - FIRSTPASS_STATS *last_frame, - FIRSTPASS_STATS *this_frame, - FIRSTPASS_STATS *next_frame) { + const FIRSTPASS_STATS *last_frame, + const FIRSTPASS_STATS *this_frame, + const FIRSTPASS_STATS *next_frame) { int is_viable_kf = 0; - // Does the frame satisfy the primary criteria of a key frame - // If so, then examine how well it predicts subsequent frames + // Does the frame satisfy the primary criteria of a key frame? + // If so, then examine how well it predicts subsequent frames. 
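/*
 * Illustrative sketch (not part of the patch) of the second stage of the key
 * frame test below: walk up to 16 following frames, accumulate a boost score
 * from the decayed intra/inter error ratio, and bail out as soon as
 * prediction holds up too well or the score stops growing, in which case the
 * candidate is not a convincing key frame.  ii_kfactor and r_max stand in
 * for IIKFACTOR1 and RMAX, whose values are not shown in this hunk.
 */
typedef struct {
  double intra_error;
  double coded_error;
  double pcnt_inter;
  double pcnt_neutral;
} sketch_frame_stats;

static double sketch_kf_boost_score(const sketch_frame_stats *next, int n,
                                    double ii_kfactor, double r_max) {
  double boost_score = 0.0, old_boost_score = 0.0, decay = 1.0;
  int i;
  for (i = 0; i < n && i < 16; ++i) {
    double iiratio = ii_kfactor * next[i].intra_error /
                     (next[i].coded_error + 0.000001);
    if (iiratio > r_max) iiratio = r_max;

    /* Cumulative effect of decay in prediction quality. */
    decay *= (next[i].pcnt_inter > 0.85)
                 ? next[i].pcnt_inter
                 : (0.85 + next[i].pcnt_inter) / 2.0;

    boost_score += decay * iiratio;

    /* Breakout clauses: prediction recovers or the boost stops growing. */
    if (next[i].pcnt_inter < 0.05 || iiratio < 1.5 ||
        ((next[i].pcnt_inter - next[i].pcnt_neutral) < 0.20 &&
         iiratio < 3.0) ||
        (boost_score - old_boost_score) < 3.0 ||
        next[i].intra_error < 200)
      break;
    old_boost_score = boost_score;
  }
  return boost_score;
}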
if ((this_frame->pcnt_second_ref < 0.10) && (next_frame->pcnt_second_ref < 0.10) && ((this_frame->pcnt_inter < 0.05) || - (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) && + (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < 0.35) && ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) && ((fabs(last_frame->coded_error - this_frame->coded_error) / - DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > - .40) || + DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > 0.40) || (fabs(last_frame->intra_error - this_frame->intra_error) / - DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > - .40) || + DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > 0.40) || ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5))))) { int i; @@ -1920,37 +1833,34 @@ static int test_candidate_kf(VP9_COMP *cpi, local_next_frame = *next_frame; - // Note the starting file position so we can reset to it + // Note the starting file position so we can reset to it. start_pos = cpi->twopass.stats_in; - // Examine how well the key frame predicts subsequent frames - for (i = 0; i < 16; i++) { + // Examine how well the key frame predicts subsequent frames. + for (i = 0; i < 16; ++i) { double next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)); if (next_iiratio > RMAX) next_iiratio = RMAX; - // Cumulative effect of decay in prediction quality + // Cumulative effect of decay in prediction quality. if (local_next_frame.pcnt_inter > 0.85) decay_accumulator *= local_next_frame.pcnt_inter; else decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0; - // decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter; - - // Keep a running total + // Keep a running total. boost_score += (decay_accumulator * next_iiratio); - // Test various breakout clauses + // Test various breakout clauses. if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) || (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) < 0.20) && (next_iiratio < 3.0)) || ((boost_score - old_boost_score) < 3.0) || - (local_next_frame.intra_error < 200) - ) { + (local_next_frame.intra_error < 200)) { break; } @@ -1990,8 +1900,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double kf_mod_err = 0.0; double kf_group_err = 0.0; - double kf_group_intra_err = 0.0; - double kf_group_coded_err = 0.0; double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; RATE_CONTROL *const rc = &cpi->rc; @@ -1999,23 +1907,23 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { vp9_zero(next_frame); - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); start_position = twopass->stats_in; cpi->common.frame_type = KEY_FRAME; - // is this a forced key frame by interval + // Is this a forced key frame by interval. rc->this_key_frame_forced = rc->next_key_frame_forced; - // Clear the alt ref active flag as this can never be active on a key frame + // Clear the alt ref active flag as this can never be active on a key frame. rc->source_alt_ref_active = 0; - // Kf is always a gf so clear frames till next gf counter + // KF is always a GF so clear frames till next gf counter. rc->frames_till_gf_update_due = 0; rc->frames_to_key = 1; - // Take a copy of the initial frame details + // Take a copy of the initial frame details. 
first_frame = *this_frame; twopass->kf_group_bits = 0; // Total bits available to kf group @@ -2023,86 +1931,75 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { kf_mod_err = calculate_modified_err(cpi, this_frame); - // find the next keyframe + // Find the next keyframe. i = 0; while (twopass->stats_in < twopass->stats_in_end) { - // Accumulate kf group error + // Accumulate kf group error. kf_group_err += calculate_modified_err(cpi, this_frame); - // These figures keep intra and coded error counts for all frames including - // key frames in the group. The effect of the key frame itself can be - // subtracted out using the first_frame data collected above. - kf_group_intra_err += this_frame->intra_error; - kf_group_coded_err += this_frame->coded_error; - - // load a the next frame's stats + // Load the next frame's stats. last_frame = *this_frame; input_stats(twopass, this_frame); // Provided that we are not at the end of the file... if (cpi->oxcf.auto_key && lookup_next_frame_stats(twopass, &next_frame) != EOF) { - // Normal scene cut check + // Check for a scene cut. if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) break; - - // How fast is prediction quality decaying + // How fast is the prediction quality decaying? loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame); // We want to know something about the recent past... rather than - // as used elsewhere where we are concened with decay in prediction + // as used elsewhere where we are concerned with decay in prediction // quality since the last GF or KF. recent_loop_decay[i % 8] = loop_decay_rate; decay_accumulator = 1.0; - for (j = 0; j < 8; j++) + for (j = 0; j < 8; ++j) decay_accumulator *= recent_loop_decay[j]; // Special check for transition or high motion followed by a - // to a static scene. + // static scene. if (detect_transition_to_still(cpi, i, cpi->key_frame_frequency - i, loop_decay_rate, decay_accumulator)) break; - // Step on to the next frame - rc->frames_to_key++; + // Step on to the next frame. + ++rc->frames_to_key; // If we don't have a real key frame within the next two - // forcekeyframeevery intervals then break out of the loop. + // key_frame_frequency intervals then break out of the loop. if (rc->frames_to_key >= 2 * (int)cpi->key_frame_frequency) break; } else { - rc->frames_to_key++; + ++rc->frames_to_key; } - i++; + ++i; } // If there is a max kf interval set by the user we must obey it. // We already breakout of the loop above at 2x max. - // This code centers the extra kf if the actual natural - // interval is between 1x and 2x + // This code centers the extra kf if the actual natural interval + // is between 1x and 2x. if (cpi->oxcf.auto_key && rc->frames_to_key > (int)cpi->key_frame_frequency) { FIRSTPASS_STATS tmp_frame; rc->frames_to_key /= 2; - // Copy first frame details + // Copy first frame details. tmp_frame = first_frame; - // Reset to the start of the group + // Reset to the start of the group. reset_fpf_position(twopass, start_position); kf_group_err = 0; - kf_group_intra_err = 0; - kf_group_coded_err = 0; - // Rescan to get the correct error data for the forced kf group - for (i = 0; i < rc->frames_to_key; i++) { - // Accumulate kf group errors + // Rescan to get the correct error data for the forced kf group. + for (i = 0; i < rc->frames_to_key; ++i) { + // Accumulate kf group errors. 
kf_group_err += calculate_modified_err(cpi, &tmp_frame); - kf_group_intra_err += tmp_frame.intra_error; - kf_group_coded_err += tmp_frame.coded_error; // Load the next frame's stats. input_stats(twopass, &tmp_frame); @@ -2114,28 +2011,22 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->next_key_frame_forced = 0; } - // Special case for the last key frame of the file + // Special case for the last key frame of the file. if (twopass->stats_in >= twopass->stats_in_end) { - // Accumulate kf group error + // Accumulate kf group error. kf_group_err += calculate_modified_err(cpi, this_frame); - - // These figures keep intra and coded error counts for all frames including - // key frames in the group. The effect of the key frame itself can be - // subtracted out using the first_frame data collected above. - kf_group_intra_err += this_frame->intra_error; - kf_group_coded_err += this_frame->coded_error; } // Calculate the number of bits that should be assigned to the kf group. if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) { - // Max for a single normal frame (not key frame) + // Maximum number of bits for a single normal frame (not key frame). int max_bits = frame_max_bits(cpi); - // Maximum bits for the kf group + // Maximum number of bits allocated to the key frame group. int64_t max_grp_bits; // Default allocation based on bits left and relative - // complexity of the section + // complexity of the section. twopass->kf_group_bits = (int64_t)(twopass->bits_left * (kf_group_err / twopass->modified_error_left)); @@ -2146,17 +2037,16 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } else { twopass->kf_group_bits = 0; } - // Reset the first pass file position + // Reset the first pass file position. reset_fpf_position(twopass, start_position); // Determine how big to make this keyframe based on how well the subsequent // frames use inter blocks. decay_accumulator = 1.0; boost_score = 0.0; - loop_decay_rate = 1.00; // Starting decay rate // Scan through the kf group collating various stats. - for (i = 0; i < rc->frames_to_key; i++) { + for (i = 0; i < rc->frames_to_key; ++i) { double r; if (EOF == input_stats(twopass, &next_frame)) @@ -2181,7 +2071,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (r > RMAX) r = RMAX; - // How fast is prediction quality decaying + // How fast is prediction quality decaying. if (!detect_flash(twopass, 0)) { loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame); decay_accumulator *= loop_decay_rate; @@ -2199,7 +2089,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { zero_stats(§ionstats); reset_fpf_position(twopass, start_position); - for (i = 0; i < rc->frames_to_key; i++) { + for (i = 0; i < rc->frames_to_key; ++i) { input_stats(twopass, &next_frame); accumulate_stats(§ionstats, &next_frame); } @@ -2210,10 +2100,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { DOUBLE_DIVIDE_CHECK(sectionstats.coded_error)); } - // Reset the first pass file position + // Reset the first pass file position. reset_fpf_position(twopass, start_position); - // Work out how many bits to allocate for the key frame itself + // Work out how many bits to allocate for the key frame itself. 
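The group-level allocation in the hunk above is a proportional split of the remaining clip budget by error share. A minimal standalone sketch with hypothetical numbers; the cap of max_bits times the group length is assumed from the max_grp_bits declaration, since the clamp itself falls outside the quoted lines:

    #include <stdio.h>

    int main(void) {
      const long long bits_left = 8000000;        /* hypothetical clip budget left */
      const double kf_group_err = 120.0;          /* modified error of this kf group */
      const double modified_error_left = 1000.0;  /* modified error left in the clip */
      const long long max_bits = 60000;           /* stand-in for frame_max_bits() */
      const int frames_to_key = 48;
      const long long max_grp_bits = max_bits * (long long)frames_to_key;
      long long kf_group_bits = 0;

      if (bits_left > 0 && modified_error_left > 0.0) {
        /* Proportional share of the remaining bits, by error share. */
        kf_group_bits =
            (long long)(bits_left * (kf_group_err / modified_error_left));

        /* Assumed cap: per-frame maximum times the group length. */
        if (kf_group_bits > max_grp_bits)
          kf_group_bits = max_grp_bits;
      }

      printf("kf_group_bits=%lld\n", kf_group_bits);
      return 0;
    }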
if (1) { int kf_boost = (int)boost_score; int allocation_chunks; @@ -2222,33 +2112,34 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (kf_boost < (rc->frames_to_key * 3)) kf_boost = (rc->frames_to_key * 3); - if (kf_boost < MIN_BOOST) - kf_boost = MIN_BOOST; + if (kf_boost < MIN_KF_BOOST) + kf_boost = MIN_KF_BOOST; // Make a note of baseline boost and the zero motion // accumulator value for use elsewhere. rc->kf_boost = kf_boost; twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); - // We do three calculations for kf size. - // The first is based on the error score for the whole kf group. - // The second (optionally) on the key frames own error if this is - // smaller than the average for the group. - // The final one insures that the frame receives at least the - // allocation it would have received based on its own error score vs - // the error score remaining - // Special case if the sequence appears almost totaly static - // In this case we want to spend almost all of the bits on the - // key frame. - // cpi->rc.frames_to_key-1 because key frame itself is taken - // care of by kf_boost. + // Key frame size depends on: + // (1) the error score for the whole key frame group, + // (2) the key frames' own error if this is smaller than the + // average for the group (optional), + // (3) insuring that the frame receives at least the allocation it would + // have received based on its own error score vs the error score + // remaining. + // Special case: + // If the sequence appears almost totally static we want to spend almost + // all of the bits on the key frame. + // + // We use (cpi->rc.frames_to_key - 1) below because the key frame itself is + // taken care of by kf_boost. if (zero_motion_accumulator >= 0.99) { allocation_chunks = ((rc->frames_to_key - 1) * 10) + kf_boost; } else { allocation_chunks = ((rc->frames_to_key - 1) * 100) + kf_boost; } - // Prevent overflow + // Prevent overflow. if (kf_boost > 1028) { int divisor = kf_boost >> 10; kf_boost /= divisor; @@ -2258,7 +2149,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->kf_group_bits = (twopass->kf_group_bits < 0) ? 0 : twopass->kf_group_bits; - // Calculate the number of bits to be spent on the key frame + // Calculate the number of bits to be spent on the key frame. twopass->kf_bits = (int)((double)kf_boost * ((double)twopass->kf_group_bits / allocation_chunks)); @@ -2277,9 +2168,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (twopass->kf_bits > alt_kf_bits) twopass->kf_bits = alt_kf_bits; } else { - // Else if it is much harder than other frames in the group make sure - // it at least receives an allocation in keeping with its relative - // error score + // Else if it is much harder than other frames in the group make sure + // it at least receives an allocation in keeping with its relative + // error score. alt_kf_bits = (int)((double)twopass->bits_left * (kf_mod_err / DOUBLE_DIVIDE_CHECK(twopass->modified_error_left))); @@ -2287,16 +2178,12 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->kf_bits = alt_kf_bits; } } - twopass->kf_group_bits -= twopass->kf_bits; - - // Peer frame bit target for this frame - rc->per_frame_bandwidth = twopass->kf_bits; - // Convert to a per second bitrate - cpi->target_bandwidth = (int)(twopass->kf_bits * cpi->output_framerate); + // Per frame bit target for this frame. 
+ vp9_rc_set_frame_target(cpi, twopass->kf_bits); } - // Note the total error score of the kf group minus the key frame itself + // Note the total error score of the kf group minus the key frame itself. twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err); // Adjust the count of total modified error left. @@ -2305,73 +2192,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->modified_error_left -= kf_group_err; } -void vp9_get_svc_params(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - if ((cm->current_video_frame == 0) || - (cm->frame_flags & FRAMEFLAGS_KEY) || - (cpi->oxcf.auto_key && (cpi->rc.frames_since_key % - cpi->key_frame_frequency == 0))) { - cm->frame_type = KEY_FRAME; - cpi->rc.source_alt_ref_active = 0; - } else { - cm->frame_type = INTER_FRAME; - } - cpi->rc.frames_till_gf_update_due = INT_MAX; - cpi->rc.baseline_gf_interval = INT_MAX; -} - -// Use this macro to turn on/off use of alt-refs in one-pass mode. -#define USE_ALTREF_FOR_ONE_PASS 1 - -void vp9_get_one_pass_params(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - if (!cpi->refresh_alt_ref_frame && - (cm->current_video_frame == 0 || - cm->frame_flags & FRAMEFLAGS_KEY || - cpi->rc.frames_to_key == 0 || - (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) { - cm->frame_type = KEY_FRAME; - cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 && - cpi->rc.frames_to_key == 0; - cpi->rc.frames_to_key = cpi->key_frame_frequency; - cpi->rc.kf_boost = KEY_FRAME_BOOST; - cpi->rc.source_alt_ref_active = 0; - } else { - cm->frame_type = INTER_FRAME; - } - if (cpi->rc.frames_till_gf_update_due == 0) { - cpi->rc.baseline_gf_interval = DEFAULT_GF_INTERVAL; - cpi->rc.frames_till_gf_update_due = cpi->rc.baseline_gf_interval; - // NOTE: frames_till_gf_update_due must be <= frames_to_key. - if (cpi->rc.frames_till_gf_update_due > cpi->rc.frames_to_key) - cpi->rc.frames_till_gf_update_due = cpi->rc.frames_to_key; - cpi->refresh_golden_frame = 1; - cpi->rc.source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; - cpi->rc.gfu_boost = 1000; - } -} - -void vp9_get_one_pass_cbr_params(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - if ((cm->current_video_frame == 0 || - cm->frame_flags & FRAMEFLAGS_KEY || - cpi->rc.frames_to_key == 0 || - (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) { - cm->frame_type = KEY_FRAME; - cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 && - cpi->rc.frames_to_key == 0; - cpi->rc.frames_to_key = cpi->key_frame_frequency; - cpi->rc.kf_boost = KEY_FRAME_BOOST; - cpi->rc.source_alt_ref_active = 0; - } else { - cm->frame_type = INTER_FRAME; - } - // Don't use gf_update by default in CBR mode. - cpi->rc.frames_till_gf_update_due = INT_MAX; - cpi->rc.baseline_gf_interval = INT_MAX; -} - -void vp9_get_first_pass_params(VP9_COMP *cpi) { +void vp9_rc_get_first_pass_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; if (!cpi->refresh_alt_ref_frame && (cm->current_video_frame == 0 || @@ -2380,11 +2201,11 @@ void vp9_get_first_pass_params(VP9_COMP *cpi) { } else { cm->frame_type = INTER_FRAME; } - // Do not use periodic key frames + // Do not use periodic key frames. 
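The key frame's own share of that budget, from the hunk above, is a weighted split: the key frame holds kf_boost "chunks" against 100 per remaining frame, or 10 per frame when the group is nearly static. A standalone sketch of the arithmetic, including the overflow guard, with hypothetical values:

    #include <stdio.h>

    int main(void) {
      const long long kf_group_bits = 960000;   /* hypothetical group budget */
      const int frames_to_key = 48;
      const double zero_motion_accumulator = 0.5;
      int kf_boost = 2000;                      /* hypothetical boost score */
      int allocation_chunks;
      int kf_bits;

      /* Near-static groups weight the other frames at 10 chunks each so the
       * key frame takes most of the budget; otherwise 100 chunks each. */
      if (zero_motion_accumulator >= 0.99)
        allocation_chunks = (frames_to_key - 1) * 10 + kf_boost;
      else
        allocation_chunks = (frames_to_key - 1) * 100 + kf_boost;

      /* Overflow guard from the patch: scale both terms down together. */
      if (kf_boost > 1028) {
        const int divisor = kf_boost >> 10;
        kf_boost /= divisor;
        allocation_chunks /= divisor;
      }

      kf_bits = (int)((double)kf_boost *
                      ((double)kf_group_bits / allocation_chunks));
      printf("allocation_chunks=%d kf_bits=%d\n", allocation_chunks, kf_bits);
      return 0;
    }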
cpi->rc.frames_to_key = INT_MAX; } -void vp9_get_second_pass_params(VP9_COMP *cpi) { +void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; struct twopass_rc *const twopass = &cpi->twopass; @@ -2395,37 +2216,30 @@ void vp9_get_second_pass_params(VP9_COMP *cpi) { double this_frame_intra_error; double this_frame_coded_error; + int target; if (!twopass->stats_in) return; if (cpi->refresh_alt_ref_frame) { cm->frame_type = INTER_FRAME; - rc->per_frame_bandwidth = twopass->gf_bits; + vp9_rc_set_frame_target(cpi, twopass->gf_bits); return; } vp9_clear_system_state(); if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { - rc->active_worst_quality = cpi->oxcf.cq_level; + twopass->active_worst_quality = cpi->oxcf.cq_level; } else if (cm->current_video_frame == 0) { // Special case code for first frame. const int section_target_bandwidth = (int)(twopass->bits_left / - frames_left); - const int tmp_q = estimate_max_q(cpi, &twopass->total_left_stats, - section_target_bandwidth); - - rc->active_worst_quality = tmp_q; + frames_left); + const int tmp_q = vp9_twopass_worst_quality(cpi, &twopass->total_left_stats, + section_target_bandwidth); + twopass->active_worst_quality = tmp_q; rc->ni_av_qi = tmp_q; rc->avg_q = vp9_convert_qindex_to_q(tmp_q); - - // Limit the maxq value returned subsequently. - // This increases the risk of overspend or underspend if the initial - // estimate for the clip is bad, but helps prevent excessive - // variation in Q, especially near the end of a clip - // where for example a small overspend may cause Q to crash - // adjust_maxq_qrange(cpi); } vp9_zero(this_frame); if (EOF == input_stats(twopass, &this_frame)) @@ -2434,19 +2248,19 @@ void vp9_get_second_pass_params(VP9_COMP *cpi) { this_frame_intra_error = this_frame.intra_error; this_frame_coded_error = this_frame.coded_error; - // keyframe and section processing ! + // Keyframe and section processing. if (rc->frames_to_key == 0 || (cm->frame_flags & FRAMEFLAGS_KEY)) { - // Define next KF group and assign bits to it + // Define next KF group and assign bits to it. this_frame_copy = this_frame; find_next_key_frame(cpi, &this_frame_copy); } else { cm->frame_type = INTER_FRAME; } - // Is this a GF / ARF (Note that a KF is always also a GF) + // Is this frame a GF / ARF? (Note: a key frame is always also a GF). if (rc->frames_till_gf_update_due == 0) { - // Define next gf group and assign bits to it + // Define next gf group and assign bits to it. this_frame_copy = this_frame; #if CONFIG_MULTIPLE_ARF @@ -2461,18 +2275,19 @@ void vp9_get_second_pass_params(VP9_COMP *cpi) { if (twopass->gf_zeromotion_pct > 995) { // As long as max_thresh for encode breakout is small enough, it is ok - // to enable it for no-show frame, i.e. set enable_encode_breakout to 2. + // to enable it for show frame, i.e. set allow_encode_breakout to + // ENCODE_BREAKOUT_LIMITED. if (!cm->show_frame) - cpi->enable_encode_breakout = 0; + cpi->allow_encode_breakout = ENCODE_BREAKOUT_DISABLED; else - cpi->enable_encode_breakout = 2; + cpi->allow_encode_breakout = ENCODE_BREAKOUT_LIMITED; } rc->frames_till_gf_update_due = rc->baseline_gf_interval; cpi->refresh_golden_frame = 1; } else { - // Otherwise this is an ordinary frame - // Assign bits from those allocated to the GF group + // Otherwise this is an ordinary frame. + // Assign bits from those allocated to the GF group. 
this_frame_copy = this_frame; assign_std_frame_bits(cpi, &this_frame_copy); } @@ -2488,13 +2303,13 @@ void vp9_get_second_pass_params(VP9_COMP *cpi) { } } - // Set nominal per second bandwidth for this frame - cpi->target_bandwidth = (int)(rc->per_frame_bandwidth * - cpi->output_framerate); - if (cpi->target_bandwidth < 0) - cpi->target_bandwidth = 0; + if (cpi->common.frame_type == KEY_FRAME) + target = vp9_rc_clamp_iframe_target_size(cpi, rc->this_frame_target); + else + target = vp9_rc_clamp_pframe_target_size(cpi, rc->this_frame_target); + vp9_rc_set_frame_target(cpi, target); - // Update the total stats remaining structure + // Update the total stats remaining structure. subtract_stats(&twopass->total_left_stats, &this_frame); } @@ -2503,5 +2318,18 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { cpi->twopass.bits_left -= cpi->rc.this_frame_target; #else cpi->twopass.bits_left -= 8 * bytes_used; + // Update bits left to the kf and gf groups to account for overshoot or + // undershoot on these frames. + if (cm->frame_type == KEY_FRAME) { + cpi->twopass.kf_group_bits += cpi->rc.this_frame_target - + cpi->rc.projected_frame_size; + + cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0); + } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) { + cpi->twopass.gf_group_bits += cpi->rc.this_frame_target - + cpi->rc.projected_frame_size; + + cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0); + } #endif } diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index ca5b10080..83e337b6d 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -10,25 +10,92 @@ #ifndef VP9_ENCODER_VP9_FIRSTPASS_H_ #define VP9_ENCODER_VP9_FIRSTPASS_H_ -#include "vp9/encoder/vp9_onyx_int.h" #ifdef __cplusplus extern "C" { #endif -void vp9_init_first_pass(VP9_COMP *cpi); -void vp9_first_pass(VP9_COMP *cpi); -void vp9_end_first_pass(VP9_COMP *cpi); +typedef struct { + double frame; + double intra_error; + double coded_error; + double sr_coded_error; + double ssim_weighted_pred_err; + double pcnt_inter; + double pcnt_motion; + double pcnt_second_ref; + double pcnt_neutral; + double MVr; + double mvr_abs; + double MVc; + double mvc_abs; + double MVrv; + double MVcv; + double mv_in_out_count; + double new_mv_count; + double duration; + double count; +} FIRSTPASS_STATS; -void vp9_init_second_pass(VP9_COMP *cpi); -void vp9_get_second_pass_params(VP9_COMP *cpi); -void vp9_end_second_pass(VP9_COMP *cpi); +struct twopass_rc { + unsigned int section_intra_rating; + unsigned int next_iiratio; + unsigned int this_iiratio; + FIRSTPASS_STATS total_stats; + FIRSTPASS_STATS this_frame_stats; + FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start; + FIRSTPASS_STATS total_left_stats; + int first_pass_done; + int64_t bits_left; + int64_t clip_bits_total; + double avg_iiratio; + double modified_error_min; + double modified_error_max; + double modified_error_total; + double modified_error_left; + double kf_intra_err_min; + double gf_intra_err_min; + int static_scene_max_gf_interval; + int kf_bits; + // Remaining error from uncoded frames in a gf group. 
Two pass use only + int64_t gf_group_error_left; -void vp9_get_first_pass_params(VP9_COMP *cpi); -void vp9_get_one_pass_params(VP9_COMP *cpi); -void vp9_get_one_pass_cbr_params(VP9_COMP *cpi); -void vp9_get_svc_params(VP9_COMP *cpi); + // Projected total bits available for a key frame group of frames + int64_t kf_group_bits; + // Error score of frames still to be coded in kf group + int64_t kf_group_error_left; + + // Projected Bits available for a group of frames including 1 GF or ARF + int64_t gf_group_bits; + // Bits for the golden frame or ARF - 2 pass only + int gf_bits; + int alt_extra_bits; + + int sr_update_lag; + + int kf_zeromotion_pct; + int gf_zeromotion_pct; + + int active_worst_quality; +}; + +struct VP9_COMP; + +void vp9_init_first_pass(struct VP9_COMP *cpi); +void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi); +void vp9_first_pass(struct VP9_COMP *cpi); +void vp9_end_first_pass(struct VP9_COMP *cpi); + +void vp9_init_second_pass(struct VP9_COMP *cpi); +void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi); +void vp9_end_second_pass(struct VP9_COMP *cpi); +int vp9_twopass_worst_quality(struct VP9_COMP *cpi, FIRSTPASS_STATS *fpstats, + int section_target_bandwitdh); + +// Post encode update of the rate control parameters for 2-pass +void vp9_twopass_postencode_update(struct VP9_COMP *cpi, + uint64_t bytes_used); #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c index e6e59c05a..4b642e2b6 100644 --- a/vp9/encoder/vp9_lookahead.c +++ b/vp9/encoder/vp9_lookahead.c @@ -11,9 +11,12 @@ #include <stdlib.h> #include "./vpx_config.h" + #include "vp9/common/vp9_common.h" + #include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_lookahead.h" +#include "vp9/encoder/vp9_onyx_int.h" struct lookahead_ctx { unsigned int max_sz; /* Absolute size of the queue */ diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index c50098678..44c1f9078 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -29,7 +29,6 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; - unsigned int best_err; const int tmp_col_min = x->mv_col_min; const int tmp_col_max = x->mv_col_max; @@ -48,27 +47,22 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, ref_full.row = ref_mv->row >> 3; /*cpi->sf.search_method == HEX*/ - best_err = vp9_hex_search(x, &ref_full, step_param, x->errorperbit, - 0, &v_fn_ptr, 0, ref_mv, dst_mv); + vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0, &v_fn_ptr, 0, + ref_mv, dst_mv); // Try sub-pixel MC // if (bestsme > error_thresh && bestsme < INT_MAX) { int distortion; unsigned int sse; - best_err = cpi->find_fractional_mv_step( - x, dst_mv, ref_mv, - cpi->common.allow_high_precision_mv, - x->errorperbit, &v_fn_ptr, - 0, cpi->sf.subpel_iters_per_step, NULL, NULL, - & distortion, &sse); + cpi->find_fractional_mv_step( + x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, + &v_fn_ptr, 0, cpi->sf.subpel_iters_per_step, NULL, NULL, &distortion, + &sse); } vp9_set_mbmode_and_mvs(xd, NEWMV, dst_mv); vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16); - best_err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride, - INT_MAX); /* restore UMV window */ x->mv_col_min = tmp_col_min; @@ -76,7 +70,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP 
*cpi, x->mv_row_min = tmp_row_min; x->mv_row_max = tmp_row_max; - return best_err; + return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, xd->plane[0].dst.stride, + INT_MAX); } static int do_16x16_motion_search(VP9_COMP *cpi, const int_mv *ref_mv, @@ -355,7 +351,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { // If any of the blocks in the sequence failed then the MB // goes in segment 0 - if (arf_not_zz[mi_row/2*cm->mb_cols + mi_col/2]) { + if (arf_not_zz[mi_row / 2 * cm->mb_cols + mi_col / 2]) { ncnt[0]++; cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0; } else { @@ -423,7 +419,7 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) { golden_ref, cpi->Source); } - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); separate_arf_mbs(cpi); } diff --git a/vp9/encoder/vp9_mbgraph.h b/vp9/encoder/vp9_mbgraph.h index 79dd2bc95..bc2a7048f 100644 --- a/vp9/encoder/vp9_mbgraph.h +++ b/vp9/encoder/vp9_mbgraph.h @@ -15,7 +15,23 @@ extern "C" { #endif -void vp9_update_mbgraph_stats(VP9_COMP *cpi); +typedef struct { + struct { + int err; + union { + int_mv mv; + MB_PREDICTION_MODE mode; + } m; + } ref[MAX_REF_FRAMES]; +} MBGRAPH_MB_STATS; + +typedef struct { + MBGRAPH_MB_STATS *mb_stats; +} MBGRAPH_FRAME_STATS; + +struct VP9_COMP; + +void vp9_update_mbgraph_stats(struct VP9_COMP *cpi); #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index ec9934a30..7d6fd3b99 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -349,6 +349,10 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x, tr = br; tc = bc; } + // These lines insure static analysis doesn't warn that + // tr and tc aren't used after the above point. + (void) tr; + (void) tc; bestmv->row = br; bestmv->col = bc; @@ -452,6 +456,11 @@ int vp9_find_best_sub_pixel_comp_tree(const MACROBLOCK *x, tr = br; tc = bc; } + // These lines insure static analysis doesn't warn that + // tr and tc aren't used after the above point. + (void) tr; + (void) tc; + bestmv->row = br; bestmv->col = bc; @@ -466,7 +475,6 @@ int vp9_find_best_sub_pixel_comp_tree(const MACROBLOCK *x, #undef PRE #undef DIST #undef CHECK_BETTER -#undef SP static INLINE int check_bounds(const MACROBLOCK *x, int row, int col, int range) { @@ -476,11 +484,9 @@ static INLINE int check_bounds(const MACROBLOCK *x, int row, int col, ((col + range) <= x->mv_col_max); } -static INLINE int check_point(const MACROBLOCK *x, const MV *mv) { - return (mv->col < x->mv_col_min) | - (mv->col > x->mv_col_max) | - (mv->row < x->mv_row_min) | - (mv->row > x->mv_row_max); +static INLINE int is_mv_in(const MACROBLOCK *x, const MV *mv) { + return (mv->col >= x->mv_col_min) && (mv->col <= x->mv_col_max) && + (mv->row >= x->mv_row_min) && (mv->row <= x->mv_row_max); } #define CHECK_BETTER \ @@ -496,11 +502,6 @@ static INLINE int check_point(const MACROBLOCK *x, const MV *mv) { }\ } -#define get_next_chkpts(list, i, n) \ - list[0] = ((i) == 0 ? (n) - 1 : (i) - 1); \ - list[1] = (i); \ - list[2] = ((i) == (n) - 1 ? 
0 : (i) + 1); - #define MAX_PATTERN_SCALES 11 #define MAX_PATTERN_CANDIDATES 8 // max number of canddiates per scale #define PATTERN_CANDIDATES_REF 3 // number of refinement candidates @@ -578,7 +579,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, for (i = 0; i < num_candidates[t]; i++) { this_mv.row = br + candidates[t][i].row; this_mv.col = bc + candidates[t][i].col; - if (check_point(x, &this_mv)) + if (!is_mv_in(x, &this_mv)) continue; this_offset = base_offset + (this_mv.row * in_what_stride) + this_mv.col; @@ -622,7 +623,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, for (i = 0; i < num_candidates[s]; i++) { this_mv.row = br + candidates[s][i].row; this_mv.col = bc + candidates[s][i].col; - if (check_point(x, &this_mv)) + if (!is_mv_in(x, &this_mv)) continue; this_offset = base_offset + (this_mv.row * in_what_stride) + this_mv.col; @@ -644,7 +645,10 @@ static int vp9_pattern_search(const MACROBLOCK *x, do { int next_chkpts_indices[PATTERN_CANDIDATES_REF]; best_site = -1; - get_next_chkpts(next_chkpts_indices, k, num_candidates[s]); + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1; + if (check_bounds(x, br, bc, 1 << s)) { for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { this_mv.row = br + candidates[s][next_chkpts_indices[i]].row; @@ -659,7 +663,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { this_mv.row = br + candidates[s][next_chkpts_indices[i]].row; this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col; - if (check_point(x, &this_mv)) + if (!is_mv_in(x, &this_mv)) continue; this_offset = base_offset + (this_mv.row * (in_what_stride)) + this_mv.col; @@ -698,7 +702,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, for (i = 0; i < 4; i++) { this_mv.row = br + neighbors[i].row; this_mv.col = bc + neighbors[i].col; - if (check_point(x, &this_mv)) + if (!is_mv_in(x, &this_mv)) continue; this_offset = base_offset + this_mv.row * in_what_stride + this_mv.col; @@ -851,12 +855,191 @@ int vp9_square_search(const MACROBLOCK *x, square_num_candidates, square_candidates); }; +// Number of candidates in first hex search +#define FIRST_HEX_CANDIDATES 6 +// Index of previous hex search's best match +#define PRE_BEST_CANDIDATE 6 +// Number of candidates in following hex search +#define NEXT_HEX_CANDIDATES 3 +// Number of candidates in refining search +#define REFINE_CANDIDATES 4 + +int vp9_fast_hex_search(const MACROBLOCK *x, + MV *ref_mv, + int search_param, + int sad_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, + const MV *center_mv, + MV *best_mv) { + const MACROBLOCKD* const xd = &x->e_mbd; + static const MV hex[FIRST_HEX_CANDIDATES] = { + { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} + }; + static const MV next_chkpts[PRE_BEST_CANDIDATE][NEXT_HEX_CANDIDATES] = { + {{ -2, 0}, { -1, -2}, {1, -2}}, + {{ -1, -2}, {1, -2}, {2, 0}}, + {{1, -2}, {2, 0}, {1, 2}}, + {{2, 0}, {1, 2}, { -1, 2}}, + {{1, 2}, { -1, 2}, { -2, 0}}, + {{ -1, 2}, { -2, 0}, { -1, -2}} + }; + static const MV neighbors[REFINE_CANDIDATES] = { + {0, -1}, { -1, 0}, {1, 0}, {0, 1} + }; + int i, j; + + const uint8_t *what = x->plane[0].src.buf; + const int what_stride = x->plane[0].src.stride; + const int in_what_stride = xd->plane[0].pre[0].stride; + int br, bc; + MV this_mv; + unsigned int bestsad = 0x7fffffff; + unsigned int thissad; + const uint8_t *base_offset; + const uint8_t *this_offset; + int k = -1; + 
int best_site = -1; + const int max_hex_search = 512; + const int max_dia_search = 32; + + const int *mvjsadcost = x->nmvjointsadcost; + int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; + + const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; + + // Adjust ref_mv to make sure it is within MV range + clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); + br = ref_mv->row; + bc = ref_mv->col; + + // Check the start point + base_offset = xd->plane[0].pre[0].buf; + this_offset = base_offset + (br * in_what_stride) + bc; + this_mv.row = br; + this_mv.col = bc; + bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, 0x7fffffff) + + mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost, + sad_per_bit); + + // Initial 6-point hex search + if (check_bounds(x, br, bc, 2)) { + for (i = 0; i < FIRST_HEX_CANDIDATES; i++) { + this_mv.row = br + hex[i].row; + this_mv.col = bc + hex[i].col; + this_offset = base_offset + (this_mv.row * in_what_stride) + this_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } else { + for (i = 0; i < FIRST_HEX_CANDIDATES; i++) { + this_mv.row = br + hex[i].row; + this_mv.col = bc + hex[i].col; + if (!is_mv_in(x, &this_mv)) + continue; + this_offset = base_offset + (this_mv.row * in_what_stride) + this_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } + + // Continue hex search if we find a better match in first round + if (best_site != -1) { + br += hex[best_site].row; + bc += hex[best_site].col; + k = best_site; + + // Allow search covering maximum MV range + for (j = 1; j < max_hex_search; j++) { + best_site = -1; + + if (check_bounds(x, br, bc, 2)) { + for (i = 0; i < 3; i++) { + this_mv.row = br + next_chkpts[k][i].row; + this_mv.col = bc + next_chkpts[k][i].col; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } else { + for (i = 0; i < 3; i++) { + this_mv.row = br + next_chkpts[k][i].row; + this_mv.col = bc + next_chkpts[k][i].col; + if (!is_mv_in(x, &this_mv)) + continue; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } + + if (best_site == -1) { + break; + } else { + br += next_chkpts[k][best_site].row; + bc += next_chkpts[k][best_site].col; + k += 5 + best_site; + if (k >= 12) k -= 12; + else if (k >= 6) k -= 6; + } + } + } + + // Check 4 1-away neighbors + for (j = 0; j < max_dia_search; j++) { + best_site = -1; + + if (check_bounds(x, br, bc, 1)) { + for (i = 0; i < REFINE_CANDIDATES; i++) { + this_mv.row = br + neighbors[i].row; + this_mv.col = bc + neighbors[i].col; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } else { + for (i = 0; i < REFINE_CANDIDATES; i++) { + this_mv.row = br + neighbors[i].row; + this_mv.col = bc + neighbors[i].col; + if (!is_mv_in(x, &this_mv)) + continue; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } + + if (best_site == -1) { + break; + } else { + br += neighbors[best_site].row; + bc += neighbors[best_site].col; + } + } + + 
best_mv->row = br; + best_mv->col = bc; + + return bestsad; +} + #undef CHECK_BETTER int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, - vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, - int *mvcost[2], const MV *center_mv) { + const vp9_variance_fn_ptr_t *fn_ptr, + int *mvjcost, int *mvcost[2], + const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; const uint8_t *what = x->plane[0].src.buf; const int what_stride = x->plane[0].src.stride; @@ -866,10 +1049,10 @@ int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, MV this_mv; - int bestsad = INT_MAX; + unsigned int bestsad = INT_MAX; int ref_row, ref_col; - int thissad; + unsigned int thissad; const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; const int *mvjsadcost = x->nmvjointsadcost; @@ -970,8 +1153,9 @@ int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int vp9_diamond_search_sad_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, - vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, - int *mvcost[2], const MV *center_mv) { + const vp9_variance_fn_ptr_t *fn_ptr, + int *mvjcost, int *mvcost[2], + const MV *center_mv) { int i, j, step; const MACROBLOCKD *const xd = &x->e_mbd; @@ -1104,7 +1288,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, int vp9_diamond_search_sadx4(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, - vp9_variance_fn_ptr_t *fn_ptr, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv) { int i, j, step; @@ -1283,148 +1467,122 @@ int vp9_diamond_search_sadx4(const MACROBLOCK *x, int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full, int step_param, - int sadpb, int further_steps, - int do_refine, vp9_variance_fn_ptr_t *fn_ptr, - const MV *ref_mv, int_mv *dst_mv) { - int_mv temp_mv; - int thissme, n, num00; - int bestsme = cpi->diamond_search_sad(x, mvp_full, &temp_mv.as_mv, - step_param, sadpb, &num00, + int sadpb, int further_steps, int do_refine, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv) { + MV temp_mv; + int thissme, n, num00 = 0; + int bestsme = cpi->diamond_search_sad(x, mvp_full, &temp_mv, + step_param, sadpb, &n, fn_ptr, x->nmvjointcost, x->mvcost, ref_mv); - dst_mv->as_int = temp_mv.as_int; - - n = num00; - num00 = 0; + *dst_mv = temp_mv; - /* If there won't be more n-step search, check to see if refining search is - * needed. */ + // If there won't be more n-step search, check to see if refining search is + // needed. if (n > further_steps) do_refine = 0; while (n < further_steps) { - n++; + ++n; if (num00) { num00--; } else { - thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv.as_mv, + thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv, step_param + n, sadpb, &num00, fn_ptr, x->nmvjointcost, x->mvcost, ref_mv); - /* check to see if refining search is needed. */ - if (num00 > (further_steps - n)) + // check to see if refining search is needed. 
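vp9_fast_hex_search above re-examines only the three hexagon points adjacent to the previous best direction, and the "k += 5 + best_site" update with the 12/6 wrap is modular bookkeeping for that direction index. A small standalone check that the update is equivalent to (k + best_site - 1) mod 6:

    #include <assert.h>
    #include <stdio.h>

    /* next_chkpts[k] holds hex points k-1, k, k+1 (mod 6), so after choosing
     * candidate best_site (0..2) the new direction is (k + best_site - 1) mod 6.
     * The patch computes this without a negative intermediate: add 5 + best_site
     * and wrap at 12 and 6. */
    static int next_direction(int k, int best_site) {
      k += 5 + best_site;
      if (k >= 12)
        k -= 12;
      else if (k >= 6)
        k -= 6;
      return k;
    }

    int main(void) {
      int k, s;
      for (k = 0; k < 6; ++k)
        for (s = 0; s < 3; ++s)
          assert(next_direction(k, s) == (k + s + 5) % 6);
      printf("direction update matches (k + best_site - 1) mod 6\n");
      return 0;
    }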
+ if (num00 > further_steps - n) do_refine = 0; if (thissme < bestsme) { bestsme = thissme; - dst_mv->as_int = temp_mv.as_int; + *dst_mv = temp_mv; } } } - /* final 1-away diamond refining search */ - if (do_refine == 1) { - int search_range = 8; - int_mv best_mv; - best_mv.as_int = dst_mv->as_int; - thissme = cpi->refining_search_sad(x, &best_mv.as_mv, sadpb, search_range, + // final 1-away diamond refining search + if (do_refine) { + const int search_range = 8; + MV best_mv = *dst_mv; + thissme = cpi->refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr, x->nmvjointcost, x->mvcost, ref_mv); - if (thissme < bestsme) { bestsme = thissme; - dst_mv->as_int = best_mv.as_int; + *dst_mv = best_mv; } } + return bestsme; } -int vp9_full_search_sad_c(const MACROBLOCK *x, MV *ref_mv, +int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, - vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, - int *mvcost[2], - const MV *center_mv, int n) { + const vp9_variance_fn_ptr_t *fn_ptr, + int *mvjcost, int *mvcost[2], + const MV *center_mv, MV *best_mv) { + int r, c; const MACROBLOCKD *const xd = &x->e_mbd; const uint8_t *const what = x->plane[0].src.buf; const int what_stride = x->plane[0].src.stride; const uint8_t *const in_what = xd->plane[0].pre[0].buf; const int in_what_stride = xd->plane[0].pre[0].stride; - MV *best_mv = &xd->mi_8x8[0]->bmi[n].as_mv[0].as_mv; - MV this_mv; - int bestsad = INT_MAX; - int r, c; - int thissad; - int ref_row = ref_mv->row; - int ref_col = ref_mv->col; - // Apply further limits to prevent us looking using vectors that stretch - // beyond the UMV border - const int row_min = MAX(ref_row - distance, x->mv_row_min); - const int row_max = MIN(ref_row + distance, x->mv_row_max); - const int col_min = MAX(ref_col - distance, x->mv_col_min); - const int col_max = MIN(ref_col + distance, x->mv_col_max); - const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; + const int row_min = MAX(ref_mv->row - distance, x->mv_row_min); + const int row_max = MIN(ref_mv->row + distance, x->mv_row_max); + const int col_min = MAX(ref_mv->col - distance, x->mv_col_min); + const int col_max = MIN(ref_mv->col + distance, x->mv_col_max); const int *mvjsadcost = x->nmvjointsadcost; int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - - // Work out the mid point for the search - const uint8_t *bestaddress = &in_what[ref_row * in_what_stride + ref_col]; - - best_mv->row = ref_row; - best_mv->col = ref_col; - - // Baseline value at the centre - bestsad = fn_ptr->sdf(what, what_stride, bestaddress, - in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - for (r = row_min; r < row_max; r++) { - const uint8_t *check_here = &in_what[r * in_what_stride + col_min]; - this_mv.row = r; - - for (c = col_min; c < col_max; c++) { - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, - bestsad); - - this_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->row = r; - best_mv->col = c; - bestaddress = check_here; + const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; + const uint8_t *best_address = &in_what[ref_mv->row * in_what_stride + + ref_mv->col]; + int best_sad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, + 0x7fffffff) + + mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit); + *best_mv = *ref_mv; + + for (r = row_min; r 
< row_max; ++r) { + for (c = col_min; c < col_max; ++c) { + const MV this_mv = {r, c}; + const uint8_t *check_here = &in_what[r * in_what_stride + c]; + const int sad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, + best_sad) + + mvsad_err_cost(&this_mv, &fcenter_mv, + mvjsadcost, mvsadcost, sad_per_bit); + + if (sad < best_sad) { + best_sad = sad; + *best_mv = this_mv; + best_address = check_here; } - - check_here++; } } - this_mv.row = best_mv->row * 8; - this_mv.col = best_mv->col * 8; - - if (bestsad < INT_MAX) - return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, - mvjcost, mvcost, x->errorperbit); - else + if (best_sad < INT_MAX) { + unsigned int unused; + const MV mv = {best_mv->row * 8, best_mv->col * 8}; + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &unused) + + mv_err_cost(&mv, center_mv, mvjcost, mvcost, x->errorperbit); + } else { return INT_MAX; + } } -int vp9_full_search_sadx3(const MACROBLOCK *x, MV *ref_mv, +int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, - vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, - int *mvcost[2], const MV *center_mv, int n) { + const vp9_variance_fn_ptr_t *fn_ptr, + int *mvjcost, int *mvcost[2], + const MV *center_mv, MV *best_mv) { const MACROBLOCKD *const xd = &x->e_mbd; const uint8_t *const what = x->plane[0].src.buf; const int what_stride = x->plane[0].src.stride; const uint8_t *const in_what = xd->plane[0].pre[0].buf; const int in_what_stride = xd->plane[0].pre[0].stride; - MV *best_mv = &xd->mi_8x8[0]->bmi[n].as_mv[0].as_mv; MV this_mv; unsigned int bestsad = INT_MAX; int r, c; @@ -1520,17 +1678,16 @@ int vp9_full_search_sadx3(const MACROBLOCK *x, MV *ref_mv, return INT_MAX; } -int vp9_full_search_sadx8(const MACROBLOCK *x, MV *ref_mv, +int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, - vp9_variance_fn_ptr_t *fn_ptr, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], - const MV *center_mv, int n) { + const MV *center_mv, MV *best_mv) { const MACROBLOCKD *const xd = &x->e_mbd; const uint8_t *const what = x->plane[0].src.buf; const int what_stride = x->plane[0].src.stride; const uint8_t *const in_what = xd->plane[0].pre[0].buf; const int in_what_stride = xd->plane[0].pre[0].stride; - MV *best_mv = &xd->mi_8x8[0]->bmi[n].as_mv[0].as_mv; MV this_mv; unsigned int bestsad = INT_MAX; int r, c; @@ -1656,7 +1813,8 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, MV *ref_mv, int vp9_refining_search_sad_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, - int search_range, vp9_variance_fn_ptr_t *fn_ptr, + int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; @@ -1669,11 +1827,7 @@ int vp9_refining_search_sad_c(const MACROBLOCK *x, const uint8_t *const in_what = xd->plane[0].pre[0].buf; const uint8_t *best_address = &in_what[ref_mv->row * in_what_stride + ref_mv->col]; - unsigned int thissad; - const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; - MV this_mv; - const int *mvjsadcost = x->nmvjointsadcost; int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; @@ -1685,18 +1839,13 @@ int vp9_refining_search_sad_c(const MACROBLOCK *x, int best_site = -1; for (j = 0; j < 4; j++) { - this_mv.row = ref_mv->row + neighbors[j].row; - this_mv.col = ref_mv->col + neighbors[j].col; - - if ((this_mv.col > x->mv_col_min) && - 
(this_mv.col < x->mv_col_max) && - (this_mv.row > x->mv_row_min) && - (this_mv.row < x->mv_row_max)) { + const MV this_mv = {ref_mv->row + neighbors[j].row, + ref_mv->col + neighbors[j].col}; + if (is_mv_in(x, &this_mv)) { const uint8_t *check_here = &in_what[this_mv.row * in_what_stride + this_mv.col]; - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, - bestsad); - + unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here, + in_what_stride, bestsad); if (thissad < bestsad) { thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); @@ -1718,20 +1867,21 @@ int vp9_refining_search_sad_c(const MACROBLOCK *x, } } - this_mv.row = ref_mv->row * 8; - this_mv.col = ref_mv->col * 8; - - if (bestsad < INT_MAX) + if (bestsad < INT_MAX) { + unsigned int unused; + const MV mv = {ref_mv->row * 8, ref_mv->col * 8}; return fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); - else + &unused) + + mv_err_cost(&mv, center_mv, mvjcost, mvcost, x->errorperbit); + } else { return INT_MAX; + } } int vp9_refining_search_sadx4(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, - int search_range, vp9_variance_fn_ptr_t *fn_ptr, + int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; @@ -1844,8 +1994,10 @@ int vp9_refining_search_sadx4(const MACROBLOCK *x, // mode. int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, - int search_range, vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], const MV *center_mv, + int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, + int *mvjcost, int *mvcost[2], + const MV *center_mv, const uint8_t *second_pred, int w, int h) { const MACROBLOCKD *const xd = &x->e_mbd; const MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0}, @@ -1878,10 +2030,7 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, this_mv.row = ref_mv->row + neighbors[j].row; this_mv.col = ref_mv->col + neighbors[j].col; - if ((this_mv.col > x->mv_col_min) && - (this_mv.col < x->mv_col_max) && - (this_mv.row > x->mv_row_min) && - (this_mv.row < x->mv_row_max)) { + if (is_mv_in(x, &this_mv)) { const uint8_t *check_here = &in_what[this_mv.row * in_what_stride + this_mv.col]; diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 28b46b503..586a74c9c 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -45,8 +45,8 @@ int vp9_init_search_range(struct VP9_COMP *cpi, int size); int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full, int step_param, int sadpb, int further_steps, int do_refine, - vp9_variance_fn_ptr_t *fn_ptr, - const MV *ref_mv, int_mv *dst_mv); + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv); int vp9_hex_search(const MACROBLOCK *x, MV *ref_mv, @@ -75,6 +75,14 @@ int vp9_square_search(const MACROBLOCK *x, int use_mvcost, const MV *center_mv, MV *best_mv); +int vp9_fast_hex_search(const MACROBLOCK *x, + MV *ref_mv, + int search_param, + int sad_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, + const MV *center_mv, + MV *best_mv); typedef int (fractional_mv_step_fp) ( const MACROBLOCK *x, @@ -107,15 +115,16 @@ typedef int (fractional_mv_step_comp_fp) ( extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_tree; typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x, - MV *ref_mv, int sad_per_bit, - int 
distance, vp9_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, int sad_per_bit, + int distance, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], - const MV *center_mv, int n); + const MV *center_mv, MV *best_mv); typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x, MV *ref_mv, int sad_per_bit, int distance, - vp9_variance_fn_ptr_t *fn_ptr, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv); @@ -123,13 +132,14 @@ typedef int (*vp9_diamond_search_fn_t)(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, - vp9_variance_fn_ptr_t *fn_ptr, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv); int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, - int search_range, vp9_variance_fn_ptr_t *fn_ptr, + int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv, const uint8_t *second_pred, int w, int h); diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index a9b0718c8..95ebb0c6d 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -14,6 +14,8 @@ #include "./vpx_config.h" #include "./vpx_scale_rtcd.h" +#include "vpx/internal/vpx_psnr.h" +#include "vpx_ports/vpx_timer.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_filter.h" @@ -30,7 +32,6 @@ #include "vp9/encoder/vp9_mbgraph.h" #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/encoder/vp9_picklpf.h" -#include "vp9/encoder/vp9_psnr.h" #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_segmentation.h" @@ -38,8 +39,6 @@ #include "vp9/encoder/vp9_vaq.h" #include "vp9/encoder/vp9_resize.h" -#include "vpx_ports/vpx_timer.h" - void vp9_entropy_mode_init(); void vp9_coef_tree_initialize(); @@ -93,19 +92,10 @@ FILE *kf_list; FILE *keyfile; #endif -#ifdef SPEEDSTATS -unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0}; -#endif - -#if defined(SECTIONBITS_OUTPUT) -extern unsigned __int64 Sectionbits[500]; -#endif - -extern void vp9_init_quantizer(VP9_COMP *cpi); +void vp9_init_quantizer(VP9_COMP *cpi); static const double in_frame_q_adj_ratio[MAX_SEGMENTS] = - {1.0, 1.5, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + {1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { switch (mode) { @@ -163,20 +153,22 @@ void vp9_initialize_enc() { } static void dealloc_compressor_data(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + // Delete sementation map vpx_free(cpi->segmentation_map); - cpi->segmentation_map = 0; - vpx_free(cpi->common.last_frame_seg_map); - cpi->common.last_frame_seg_map = 0; + cpi->segmentation_map = NULL; + vpx_free(cm->last_frame_seg_map); + cm->last_frame_seg_map = NULL; vpx_free(cpi->coding_context.last_frame_seg_map_copy); - cpi->coding_context.last_frame_seg_map_copy = 0; + cpi->coding_context.last_frame_seg_map_copy = NULL; vpx_free(cpi->complexity_map); cpi->complexity_map = 0; vpx_free(cpi->active_map); cpi->active_map = 0; - vp9_free_frame_buffers(&cpi->common); + vp9_free_frame_buffers(cm); vp9_free_frame_buffer(&cpi->last_frame_uf); vp9_free_frame_buffer(&cpi->scaled_source); @@ -203,19 +195,20 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { // to a target value // target q value int vp9_compute_qdelta(const VP9_COMP *cpi, double qstart, double qtarget) { + const RATE_CONTROL *const rc = &cpi->rc; + int start_index = rc->worst_quality; 
+ int target_index = rc->worst_quality; int i; - int start_index = cpi->rc.worst_quality; - int target_index = cpi->rc.worst_quality; // Convert the average q value to an index. - for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) { + for (i = rc->best_quality; i < rc->worst_quality; ++i) { start_index = i; if (vp9_convert_qindex_to_q(i) >= qstart) break; } // Convert the q target to an index - for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) { + for (i = rc->best_quality; i < rc->worst_quality; ++i) { target_index = i; if (vp9_convert_qindex_to_q(i) >= qtarget) break; @@ -227,28 +220,23 @@ int vp9_compute_qdelta(const VP9_COMP *cpi, double qstart, double qtarget) { // Computes a q delta (in "q index" terms) to get from a starting q value // to a value that should equate to thegiven rate ratio. -int vp9_compute_qdelta_by_rate(VP9_COMP *cpi, - double base_q_index, double rate_target_ratio) { +static int compute_qdelta_by_rate(VP9_COMP *cpi, int base_q_index, + double rate_target_ratio) { int i; - int base_bits_per_mb; - int target_bits_per_mb; int target_index = cpi->rc.worst_quality; - // Make SURE use of floating point in this function is safe. - vp9_clear_system_state(); - // Look up the current projected bits per block for the base index - base_bits_per_mb = vp9_rc_bits_per_mb(cpi->common.frame_type, - base_q_index, 1.0); + const int base_bits_per_mb = vp9_rc_bits_per_mb(cpi->common.frame_type, + base_q_index, 1.0); // Find the target bits per mb based on the base value and given ratio. - target_bits_per_mb = rate_target_ratio * base_bits_per_mb; + const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb); // Convert the q target to an index - for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) { + for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; ++i) { target_index = i; - if (vp9_rc_bits_per_mb(cpi->common.frame_type, - i, 1.0) <= target_bits_per_mb ) + if (vp9_rc_bits_per_mb(cpi->common.frame_type, i, 1.0) <= + target_bits_per_mb ) break; } @@ -258,11 +246,8 @@ int vp9_compute_qdelta_by_rate(VP9_COMP *cpi, // This function sets up a set of segments with delta Q values around // the baseline frame quantizer. static void setup_in_frame_q_adj(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - struct segmentation *seg = &cm->seg; - // double q_ratio; - int segment; - int qindex_delta; + VP9_COMMON *const cm = &cpi->common; + struct segmentation *const seg = &cm->seg; // Make SURE use of floating point in this function is safe. 
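compute_qdelta_by_rate() in the hunk above converts a rate ratio into a q-index delta by scanning the index range until the projected bits per MB falls to the target. The sketch below uses a crude exponential stand-in for vp9_rc_bits_per_mb(), so only the shape of the search matches the real code:

    #include <math.h>
    #include <stdio.h>

    /* Crude stand-in for vp9_rc_bits_per_mb(): projected bits/MB falling off
     * with q index. Only the monotonic shape matters to the search. */
    static int bits_per_mb(int qindex) {
      return (int)(4000.0 * exp(-qindex / 40.0));
    }

    /* Same shape as compute_qdelta_by_rate(): first index whose projected rate
     * is at or below rate_target_ratio times the rate at base_q_index. */
    static int qdelta_by_rate(int base_q_index, double rate_target_ratio,
                              int best_quality, int worst_quality) {
      const int target = (int)(rate_target_ratio * bits_per_mb(base_q_index));
      int i, target_index = worst_quality;

      for (i = best_quality; i < worst_quality; ++i) {
        target_index = i;
        if (bits_per_mb(i) <= target)
          break;
      }
      return target_index - base_q_index;
    }

    int main(void) {
      /* Asking for twice the rate moves to a lower q index (negative delta);
       * half the rate moves to a higher one (positive delta). */
      printf("delta for 2.0x: %d\n", qdelta_by_rate(100, 2.0, 0, 255));
      printf("delta for 0.5x: %d\n", qdelta_by_rate(100, 0.5, 0, 255));
      return 0;
    }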
vp9_clear_system_state(); @@ -270,13 +255,14 @@ static void setup_in_frame_q_adj(VP9_COMP *cpi) { if (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + int segment; + // Clear down the segment map vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); // Clear down the complexity map used for rd vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols); - // Enable segmentation vp9_enable_segmentation((VP9_PTR)cpi); vp9_clearall_segfeatures(seg); @@ -287,9 +273,8 @@ static void setup_in_frame_q_adj(VP9_COMP *cpi) { vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q); // Use some of the segments for in frame Q adjustment - for (segment = 1; segment < 3; segment++) { - qindex_delta = - vp9_compute_qdelta_by_rate(cpi, cm->base_qindex, + for (segment = 1; segment < 2; segment++) { + const int qindex_delta = compute_qdelta_by_rate(cpi, cm->base_qindex, in_frame_q_adj_ratio[segment]); vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q); vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta); @@ -297,8 +282,8 @@ static void setup_in_frame_q_adj(VP9_COMP *cpi) { } } static void configure_static_seg_features(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - struct segmentation *seg = &cm->seg; + VP9_COMMON *const cm = &cpi->common; + struct segmentation *const seg = &cm->seg; int high_q = (int)(cpi->rc.avg_q > 48.0); int qi_delta; @@ -442,13 +427,13 @@ static void print_seg_map(VP9_COMP *cpi) { static void update_reference_segmentation_map(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; + MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible; + uint8_t *cache_ptr = cm->last_frame_seg_map; int row, col; - MODE_INFO **mi_8x8, **mi_8x8_ptr = cm->mi_grid_visible; - uint8_t *cache_ptr = cm->last_frame_seg_map, *cache; for (row = 0; row < cm->mi_rows; row++) { - mi_8x8 = mi_8x8_ptr; - cache = cache_ptr; + MODE_INFO **mi_8x8 = mi_8x8_ptr; + uint8_t *cache = cache_ptr; for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++) cache[0] = mi_8x8[0]->mbmi.segment_id; mi_8x8_ptr += cm->mode_info_stride; @@ -581,7 +566,7 @@ static void set_good_speed_feature(VP9_COMMON *cm, int speed) { int i; sf->adaptive_rd_thresh = 1; - sf->recode_loop = (speed < 1); + sf->recode_loop = ((speed < 1) ? 
ALLOW_RECODE : ALLOW_RECODE_KFMAXBW); if (speed == 1) { sf->use_square_partition_only = !frame_is_intra_only(cm); sf->less_rectangular_check = 1; @@ -599,7 +584,7 @@ static void set_good_speed_feature(VP9_COMMON *cm, sf->adaptive_pred_interp_filter = 1; sf->auto_mv_step_size = 1; sf->adaptive_rd_thresh = 2; - sf->recode_loop = 2; + sf->recode_loop = ALLOW_RECODE_KFARFGF; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; @@ -635,7 +620,7 @@ static void set_good_speed_feature(VP9_COMMON *cm, sf->last_partitioning_redo_frequency = 3; sf->adaptive_rd_thresh = 2; - sf->recode_loop = 2; + sf->recode_loop = ALLOW_RECODE_KFARFGF; sf->use_lp32x32fdct = 1; sf->mode_skip_start = 11; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; @@ -663,6 +648,7 @@ static void set_good_speed_feature(VP9_COMMON *cm, sf->reference_masking = 1; sf->auto_mv_step_size = 1; + sf->disable_split_var_thresh = 32; sf->disable_filter_search_var_thresh = 100; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; @@ -698,6 +684,7 @@ static void set_good_speed_feature(VP9_COMMON *cm, sf->reference_masking = 1; sf->auto_mv_step_size = 1; + sf->disable_split_var_thresh = 64; sf->disable_filter_search_var_thresh = 200; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; @@ -715,9 +702,9 @@ static void set_good_speed_feature(VP9_COMMON *cm, sf->adaptive_rd_thresh = 4; sf->mode_skip_start = 6; } - if (speed == 5) { + if (speed >= 5) { sf->comp_inter_joint_search_thresh = BLOCK_SIZES; - sf->use_one_partition_size_always = 1; + sf->partition_search_type = FIXED_PARTITION; sf->always_this_block_size = BLOCK_16X16; sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL; @@ -752,7 +739,9 @@ static void set_rt_speed_feature(VP9_COMMON *cm, int speed) { sf->static_segmentation = 0; sf->adaptive_rd_thresh = 1; - sf->recode_loop = (speed < 1); + sf->recode_loop = ((speed < 1) ? 
ALLOW_RECODE : ALLOW_RECODE_KFMAXBW); + sf->encode_breakout_thresh = 1; + if (speed == 1) { sf->use_square_partition_only = !frame_is_intra_only(cm); sf->less_rectangular_check = 1; @@ -770,10 +759,11 @@ static void set_rt_speed_feature(VP9_COMMON *cm, sf->adaptive_pred_interp_filter = 1; sf->auto_mv_step_size = 1; sf->adaptive_rd_thresh = 2; - sf->recode_loop = 2; + sf->recode_loop = ALLOW_RECODE_KFARFGF; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + sf->encode_breakout_thresh = 8; } if (speed >= 2) { sf->use_square_partition_only = !frame_is_intra_only(cm); @@ -806,13 +796,14 @@ static void set_rt_speed_feature(VP9_COMMON *cm, sf->last_partitioning_redo_frequency = 3; sf->adaptive_rd_thresh = 2; - sf->recode_loop = 2; + sf->recode_loop = ALLOW_RECODE_KFARFGF; sf->use_lp32x32fdct = 1; sf->mode_skip_start = 11; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + sf->encode_breakout_thresh = 200; } if (speed >= 3) { sf->use_square_partition_only = 1; @@ -835,25 +826,37 @@ static void set_rt_speed_feature(VP9_COMMON *cm, sf->use_fast_coef_updates = 2; sf->adaptive_rd_thresh = 4; sf->mode_skip_start = 6; + sf->encode_breakout_thresh = 400; } if (speed >= 4) { sf->optimize_coefficients = 0; + sf->disable_split_mask = DISABLE_ALL_SPLIT; + sf->use_fast_lpf_pick = 2; + sf->encode_breakout_thresh = 700; } if (speed >= 5) { int i; - sf->disable_split_mask = DISABLE_ALL_SPLIT; + sf->adaptive_rd_thresh = 5; sf->auto_min_max_partition_size = frame_is_intra_only(cm) ? RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX; + sf->adjust_partitioning_from_last_frame = + cm->last_frame_type == KEY_FRAME || (0 == + (cm->current_video_frame + 1) % sf->last_partitioning_redo_frequency); sf->subpel_force_stop = 1; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_DC_H_V; sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY; } - sf->use_fast_lpf_pick = 2; - sf->RD = 0; + sf->frame_parameter_update = 0; + sf->encode_breakout_thresh = 1000; + sf->search_method = FAST_HEX; } if (speed >= 6) { - sf->super_fast_rtc = 1; + sf->partition_search_type = VAR_BASED_FIXED_PARTITION; + } + if (speed >= 7) { + sf->partition_search_type = VAR_BASED_FIXED_PARTITION; + sf->use_nonrd_pick_mode = 1; } } @@ -867,13 +870,15 @@ void vp9_set_speed_features(VP9_COMP *cpi) { if (speed < 0) speed = -speed; +#if CONFIG_INTERNAL_STATS for (i = 0; i < MAX_MODES; ++i) cpi->mode_chosen_counts[i] = 0; +#endif // best quality defaults - sf->RD = 1; + sf->frame_parameter_update = 1; sf->search_method = NSTEP; - sf->recode_loop = 1; + sf->recode_loop = ALLOW_RECODE; sf->subpel_search_method = SUBPEL_TREE; sf->subpel_iters_per_step = 2; sf->subpel_force_stop = 0; @@ -889,7 +894,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->adaptive_motion_search = 0; sf->adaptive_pred_interp_filter = 0; sf->reference_masking = 0; - sf->use_one_partition_size_always = 0; + sf->partition_search_type = SEARCH_PARTITION; sf->less_rectangular_check = 0; sf->use_square_partition_only = 0; sf->auto_min_max_partition_size = NOT_IN_USE; @@ -910,9 +915,9 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_uv_intra_rd_estimate = 0; sf->use_fast_lpf_pick = 0; sf->use_fast_coef_updates = 0; - sf->using_small_partition_info = 0; sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set - 
sf->super_fast_rtc = 0; + sf->use_nonrd_pick_mode = 0; + sf->encode_breakout_thresh = 0; switch (cpi->oxcf.mode) { case MODE_BESTQUALITY: @@ -941,7 +946,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { // No recode for 1 pass. if (cpi->pass == 0) { - sf->recode_loop = 0; + sf->recode_loop = DISALLOW_RECODE; sf->optimize_coefficients = 0; } @@ -957,25 +962,29 @@ void vp9_set_speed_features(VP9_COMP *cpi) { cpi->mb.optimize = cpi->sf.optimize_coefficients == 1 && cpi->pass != 1; -#ifdef SPEEDSTATS - frames_at_speed[cpi->speed]++; -#endif + if (cpi->encode_breakout && cpi->oxcf.mode == MODE_REALTIME && + sf->encode_breakout_thresh > cpi->encode_breakout) + cpi->encode_breakout = sf->encode_breakout_thresh; + + if (sf->disable_split_mask == DISABLE_ALL_SPLIT) + sf->adaptive_pred_interp_filter = 0; } static void alloc_raw_frame_buffers(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; + const VP9_CONFIG *oxcf = &cpi->oxcf; - cpi->lookahead = vp9_lookahead_init(cpi->oxcf.width, cpi->oxcf.height, + cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height, cm->subsampling_x, cm->subsampling_y, - cpi->oxcf.lag_in_frames); + oxcf->lag_in_frames); if (!cpi->lookahead) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate lag buffers"); if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer, - cpi->oxcf.width, cpi->oxcf.height, + oxcf->width, oxcf->height, cm->subsampling_x, cm->subsampling_y, - VP9_ENC_BORDER_IN_PIXELS)) + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate altref buffer"); } @@ -1043,14 +1052,14 @@ static void update_frame_size(VP9_COMP *cpi) { if (vp9_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9_ENC_BORDER_IN_PIXELS)) + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to reallocate last frame buffer"); if (vp9_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9_ENC_BORDER_IN_PIXELS)) + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to reallocate scaled source buffer"); @@ -1100,15 +1109,13 @@ int vp9_reverse_trans(int x) { void vp9_new_framerate(VP9_COMP *cpi, double framerate) { VP9_COMMON *const cm = &cpi->common; - int64_t vbr_max_bits; + int vbr_max_bits; if (framerate < 0.1) framerate = 30; cpi->oxcf.framerate = framerate; cpi->output_framerate = cpi->oxcf.framerate; - cpi->rc.per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth - / cpi->output_framerate); cpi->rc.av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate); cpi->rc.min_frame_bandwidth = (int)(cpi->rc.av_per_frame_bandwidth * @@ -1126,10 +1133,10 @@ void vp9_new_framerate(VP9_COMP *cpi, double framerate) { // be acheived because of a user specificed max q (e.g. when the user // specifies lossless encode. 
// - vbr_max_bits = ((int64_t)cpi->rc.av_per_frame_bandwidth * - (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100; + vbr_max_bits = (int)(((int64_t)cpi->rc.av_per_frame_bandwidth * + cpi->oxcf.two_pass_vbrmax_section) / 100); cpi->rc.max_frame_bandwidth = - MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits); + MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits); // Set Maximum gf/arf interval cpi->rc.max_gf_interval = 16; @@ -1150,7 +1157,7 @@ void vp9_new_framerate(VP9_COMP *cpi, double framerate) { cpi->rc.max_gf_interval = cpi->twopass.static_scene_max_gf_interval; } -static int64_t rescale(int val, int64_t num, int denom) { +static int64_t rescale(int64_t val, int64_t num, int denom) { int64_t llnum = num; int64_t llden = denom; int64_t llval = val; @@ -1158,6 +1165,124 @@ static int64_t rescale(int val, int64_t num, int denom) { return (llval * llnum / llden); } +// Initialize layer context data from init_config(). +static void init_layer_context(VP9_COMP *const cpi) { + const VP9_CONFIG *const oxcf = &cpi->oxcf; + int temporal_layer = 0; + cpi->svc.spatial_layer_id = 0; + cpi->svc.temporal_layer_id = 0; + for (temporal_layer = 0; temporal_layer < cpi->svc.number_temporal_layers; + ++temporal_layer) { + LAYER_CONTEXT *const lc = &cpi->svc.layer_context[temporal_layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->avg_frame_qindex[INTER_FRAME] = q_trans[oxcf->worst_allowed_q]; + lrc->last_q[INTER_FRAME] = q_trans[oxcf->worst_allowed_q]; + lrc->ni_av_qi = q_trans[oxcf->worst_allowed_q]; + lrc->total_actual_bits = 0; + lrc->total_target_vs_actual = 0; + lrc->ni_tot_qi = 0; + lrc->tot_q = 0.0; + lrc->avg_q = 0.0; + lrc->ni_frames = 0; + lrc->decimation_count = 0; + lrc->decimation_factor = 0; + lrc->rate_correction_factor = 1.0; + lrc->key_frame_rate_correction_factor = 1.0; + lc->target_bandwidth = oxcf->ts_target_bitrate[temporal_layer] * + 1000; + lrc->buffer_level = rescale((int)(oxcf->starting_buffer_level), + lc->target_bandwidth, 1000); + lrc->bits_off_target = lrc->buffer_level; + } +} + +// Update the layer context from a change_config() call. +static void update_layer_context_change_config(VP9_COMP *const cpi, + const int target_bandwidth) { + const VP9_CONFIG *const oxcf = &cpi->oxcf; + const RATE_CONTROL *const rc = &cpi->rc; + int temporal_layer = 0; + float bitrate_alloc = 1.0; + for (temporal_layer = 0; temporal_layer < cpi->svc.number_temporal_layers; + ++temporal_layer) { + LAYER_CONTEXT *const lc = &cpi->svc.layer_context[temporal_layer]; + RATE_CONTROL *const lrc = &lc->rc; + lc->target_bandwidth = oxcf->ts_target_bitrate[temporal_layer] * 1000; + bitrate_alloc = (float)lc->target_bandwidth / (float)target_bandwidth; + // Update buffer-related quantities. + lc->starting_buffer_level = + (int64_t)(oxcf->starting_buffer_level * bitrate_alloc); + lc->optimal_buffer_level = + (int64_t)(oxcf->optimal_buffer_level * bitrate_alloc); + lc->maximum_buffer_size = + (int64_t)(oxcf->maximum_buffer_size * bitrate_alloc); + lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size); + lrc->buffer_level = MIN(lrc->buffer_level, lc->maximum_buffer_size); + // Update framerate-related quantities. + lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[temporal_layer]; + lrc->av_per_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->max_frame_bandwidth = rc->max_frame_bandwidth; + // Update qp-related quantities. 
+ lrc->worst_quality = rc->worst_quality; + lrc->best_quality = rc->best_quality; + } +} + +// Prior to encoding the frame, update framerate-related quantities +// for the current layer. +static void update_layer_framerate(VP9_COMP *const cpi) { + int temporal_layer = cpi->svc.temporal_layer_id; + const VP9_CONFIG *const oxcf = &cpi->oxcf; + LAYER_CONTEXT *const lc = &cpi->svc.layer_context[temporal_layer]; + RATE_CONTROL *const lrc = &lc->rc; + lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[temporal_layer]; + lrc->av_per_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; + // Update the average layer frame size (non-cumulative per-frame-bw). + if (temporal_layer == 0) { + lc->avg_frame_size = lrc->av_per_frame_bandwidth; + } else { + double prev_layer_framerate = oxcf->framerate / + oxcf->ts_rate_decimator[temporal_layer - 1]; + int prev_layer_target_bandwidth = + oxcf->ts_target_bitrate[temporal_layer - 1] * 1000; + lc->avg_frame_size = + (int)((lc->target_bandwidth - prev_layer_target_bandwidth) / + (lc->framerate - prev_layer_framerate)); + } +} + +// Prior to encoding the frame, set the layer context, for the current layer +// to be encoded, to the cpi struct. +static void restore_layer_context(VP9_COMP *const cpi) { + int temporal_layer = cpi->svc.temporal_layer_id; + LAYER_CONTEXT *lc = &cpi->svc.layer_context[temporal_layer]; + int frame_since_key = cpi->rc.frames_since_key; + int frame_to_key = cpi->rc.frames_to_key; + cpi->rc = lc->rc; + cpi->oxcf.target_bandwidth = lc->target_bandwidth; + cpi->oxcf.starting_buffer_level = lc->starting_buffer_level; + cpi->oxcf.optimal_buffer_level = lc->optimal_buffer_level; + cpi->oxcf.maximum_buffer_size = lc->maximum_buffer_size; + cpi->output_framerate = lc->framerate; + // Reset the frames_since_key and frames_to_key counters to their values + // before the layer restore. Keep these defined for the stream (not layer). + cpi->rc.frames_since_key = frame_since_key; + cpi->rc.frames_to_key = frame_to_key; +} + +// Save the layer context after encoding the frame. +static void save_layer_context(VP9_COMP *const cpi) { + int temporal_layer = cpi->svc.temporal_layer_id; + LAYER_CONTEXT *lc = &cpi->svc.layer_context[temporal_layer]; + lc->rc = cpi->rc; + lc->target_bandwidth = (int)cpi->oxcf.target_bandwidth; + lc->starting_buffer_level = cpi->oxcf.starting_buffer_level; + lc->optimal_buffer_level = cpi->oxcf.optimal_buffer_level; + lc->maximum_buffer_size = cpi->oxcf.maximum_buffer_size; + lc->framerate = cpi->output_framerate; +} + static void set_tile_limits(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; @@ -1184,12 +1309,20 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cm->subsampling_y = 0; vp9_alloc_compressor_data(cpi); + // Spatial scalability. + cpi->svc.number_spatial_layers = oxcf->ss_number_layers; + // Temporal scalability. + cpi->svc.number_temporal_layers = oxcf->ts_number_layers; + + if (cpi->svc.number_temporal_layers > 1 && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + init_layer_context(cpi); + } + // change includes all joint functionality vp9_change_config(ptr, oxcf); // Initialize active best and worst q and average q values. 
- cpi->rc.active_worst_quality = cpi->oxcf.worst_allowed_q; - if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { cpi->rc.avg_frame_qindex[0] = cpi->oxcf.worst_allowed_q; cpi->rc.avg_frame_qindex[1] = cpi->oxcf.worst_allowed_q; @@ -1224,9 +1357,6 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->gld_fb_idx = 1; cpi->alt_fb_idx = 2; - cpi->current_layer = 0; - cpi->use_svc = 0; - set_tile_limits(cpi); cpi->fixed_divide[0] = 0; @@ -1234,7 +1364,6 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->fixed_divide[i] = 0x80000 / i; } - void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { VP9_COMP *cpi = (VP9_COMP *)(ptr); VP9_COMMON *const cm = &cpi->common; @@ -1248,6 +1377,9 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->oxcf = *oxcf; + if (cpi->oxcf.cpu_used == -6) + cpi->oxcf.play_alternate = 0; + switch (cpi->oxcf.mode) { // Real time and one pass deprecated in test code base case MODE_GOODQUALITY: @@ -1298,6 +1430,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { for (i = 0; i < MAX_SEGMENTS; i++) cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; } + cpi->encode_breakout = cpi->oxcf.encode_breakout; // local file playback mode == really big buffer if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) { @@ -1326,10 +1459,10 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->oxcf.target_bandwidth, 1000); // Under a configuration change, where maximum_buffer_size may change, // keep buffer level clipped to the maximum allowed buffer size. - if (cpi->rc.bits_off_target > cpi->oxcf.maximum_buffer_size) { - cpi->rc.bits_off_target = cpi->oxcf.maximum_buffer_size; - cpi->rc.buffer_level = cpi->rc.bits_off_target; - } + cpi->rc.bits_off_target = MIN(cpi->rc.bits_off_target, + cpi->oxcf.maximum_buffer_size); + cpi->rc.buffer_level = MIN(cpi->rc.buffer_level, + cpi->oxcf.maximum_buffer_size); // Set up frame rate and related parameters rate control values. vp9_new_framerate(cpi, cpi->oxcf.framerate); @@ -1339,16 +1472,11 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->rc.best_quality = cpi->oxcf.best_allowed_q; // active values should only be modified if out of new range - cpi->rc.active_worst_quality = clamp(cpi->rc.active_worst_quality, - cpi->rc.best_quality, - cpi->rc.worst_quality); cpi->cq_target_quality = cpi->oxcf.cq_level; cm->interp_filter = DEFAULT_INTERP_FILTER; - cpi->target_bandwidth = cpi->oxcf.target_bandwidth; - cm->display_width = cpi->oxcf.width; cm->display_height = cpi->oxcf.height; @@ -1359,24 +1487,24 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { if (cpi->initial_width) { // Increasing the size of the frame beyond the first seen frame, or some - // otherwise signalled maximum size, is not supported. + // otherwise signaled maximum size, is not supported. // TODO(jkoleszar): exit gracefully. 
assert(cm->width <= cpi->initial_width); assert(cm->height <= cpi->initial_height); } update_frame_size(cpi); - cpi->speed = cpi->oxcf.cpu_used; + if (cpi->svc.number_temporal_layers > 1 && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + update_layer_context_change_config(cpi, (int)cpi->oxcf.target_bandwidth); + } + + cpi->speed = abs(cpi->oxcf.cpu_used); - if (cpi->oxcf.lag_in_frames == 0) { - // force to allowlag to 0 if lag_in_frames is 0; - cpi->oxcf.allow_lag = 0; - } else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS) { - // Limit on lag buffers as these are not currently dynamically allocated + // Limit on lag buffers as these are not currently dynamically allocated. + if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS) cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS; - } - // YX Temp #if CONFIG_MULTIPLE_ARF vp9_zero(cpi->alt_ref_source); #else @@ -1441,6 +1569,7 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, int num_pix = num_4x4_blk << 4; int i, k; ctx->num_4x4_blk = num_4x4_blk; + CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, vpx_calloc(num_4x4_blk, sizeof(uint8_t))); for (i = 0; i < MAX_MB_PLANE; ++i) { @@ -1484,7 +1613,6 @@ static void init_pick_mode_context(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; - for (i = 0; i < BLOCK_SIZES; ++i) { const int num_4x4_w = num_4x4_blocks_wide_lookup[i]; const int num_4x4_h = num_4x4_blocks_high_lookup[i]; @@ -1589,6 +1717,8 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { vp9_create_common(cm); + cpi->use_svc = 0; + init_config((VP9_PTR)cpi, oxcf); init_pick_mode_context(cpi); @@ -1604,9 +1734,6 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->alt_is_last = 0; cpi->gold_is_alt = 0; - // Spatial scalability - cpi->number_spatial_layers = oxcf->ss_number_layers; - // Create the encoder segmentation map and set all entries to 0 CHECK_MEM_ERROR(cm, cpi->segmentation_map, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); @@ -1632,11 +1759,6 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { sizeof(*cpi->mbgraph_stats[i].mb_stats), 1)); } -#ifdef ENTROPY_STATS - if (cpi->pass != 1) - init_context_counters(); -#endif - /*Initialize the feed-forward activity masking.*/ cpi->activity_avg = 90 << 12; cpi->key_frame_frequency = cpi->oxcf.key_freq; @@ -1741,7 +1863,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->output_pkt_list = oxcf->output_pkt_list; - cpi->enable_encode_breakout = 1; + cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; if (cpi->pass == 1) { vp9_init_first_pass(cpi); @@ -1908,10 +2030,12 @@ void vp9_remove_compressor(VP9_PTR *ptr) { / time_encoded; if (cpi->b_calculate_psnr) { - const double total_psnr = vp9_mse2psnr(cpi->total_samples, 255.0, - cpi->total_sq_error); - const double totalp_psnr = vp9_mse2psnr(cpi->totalp_samples, 255.0, - cpi->totalp_sq_error); + const double total_psnr = + vpx_sse_to_psnr((double)cpi->total_samples, 255.0, + (double)cpi->total_sq_error); + const double totalp_psnr = + vpx_sse_to_psnr((double)cpi->totalp_samples, 255.0, + (double)cpi->totalp_sq_error); const double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0); const double totalp_ssim = 100 * pow(cpi->summedp_quality / @@ -1967,21 +2091,6 @@ void vp9_remove_compressor(VP9_PTR *ptr) { } #endif -#if defined(SECTIONBITS_OUTPUT) - - if (0) { - int i; - FILE *f = fopen("tokenbits.stt", "a"); - - for (i = 0; i < 28; i++) - fprintf(f, "%8d", (int)(Sectionbits[i] / 256)); - - fprintf(f, "\n"); - fclose(f); - } - -#endif - #if 0 { 
printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); @@ -2102,12 +2211,12 @@ static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, const int w = widths[i]; const int h = heights[i]; const uint32_t samples = w * h; - const double sse = calc_plane_error(a_planes[i], a_strides[i], - b_planes[i], b_strides[i], - w, h); + const uint64_t sse = calc_plane_error(a_planes[i], a_strides[i], + b_planes[i], b_strides[i], + w, h); psnr->sse[1 + i] = sse; psnr->samples[1 + i] = samples; - psnr->psnr[1 + i] = vp9_mse2psnr(samples, 255.0, sse); + psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, 255.0, (double)sse); total_sse += sse; total_samples += samples; @@ -2115,7 +2224,8 @@ static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, psnr->sse[0] = total_sse; psnr->samples[0] = total_samples; - psnr->psnr[0] = vp9_mse2psnr(total_samples, 255.0, total_sse); + psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, 255.0, + (double)total_sse); } static void generate_psnr_packet(VP9_COMP *cpi) { @@ -2448,34 +2558,33 @@ static double compute_edge_pixel_proportion(YV12_BUFFER_CONFIG *frame) { // Function to test for conditions that indicate we should loop // back and recode a frame. -static int recode_loop_test(VP9_COMP *cpi, +static int recode_loop_test(const VP9_COMP *cpi, int high_limit, int low_limit, int q, int maxq, int minq) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; int force_recode = 0; - VP9_COMMON *cm = &cpi->common; // Special case trap if maximum allowed frame size exceeded. - if (cpi->rc.projected_frame_size > cpi->rc.max_frame_bandwidth) { + if (rc->projected_frame_size > rc->max_frame_bandwidth) { force_recode = 1; // Is frame recode allowed. // Yes if either recode mode 1 is selected or mode 2 is selected // and the frame is a key frame, golden frame or alt_ref_frame - } else if ((cpi->sf.recode_loop == 1) || - ((cpi->sf.recode_loop == 2) && - ((cm->frame_type == KEY_FRAME) || - cpi->refresh_golden_frame || - cpi->refresh_alt_ref_frame))) { + } else if ((cpi->sf.recode_loop == ALLOW_RECODE) || + ((cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF) && + (cm->frame_type == KEY_FRAME || + cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { // General over and under shoot tests - if (((cpi->rc.projected_frame_size > high_limit) && (q < maxq)) || - ((cpi->rc.projected_frame_size < low_limit) && (q > minq))) { + if ((rc->projected_frame_size > high_limit && q < maxq) || + (rc->projected_frame_size < low_limit && q > minq)) { force_recode = 1; } else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { // Deal with frame undershoot and whether or not we are // below the automatically set cq level. if (q > cpi->cq_target_quality && - cpi->rc.projected_frame_size < - ((cpi->rc.this_frame_target * 7) >> 3)) { + rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) { force_recode = 1; } } @@ -2583,7 +2692,7 @@ static void scale_references(VP9_COMP *cpi) { vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9_ENC_BORDER_IN_PIXELS); + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; } else { @@ -2629,14 +2738,14 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { FILE *const f = fopen("tmp.stt", cm->current_video_frame ? 
"a" : "w"); int recon_err; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); recon_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm)); if (cpi->twopass.total_left_stats.coded_error != 0.0) fprintf(f, "%10u %10d %10d %10d %10d %10d " "%10"PRId64" %10"PRId64" %10d " - "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" + "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" "%6d %6d %5d %5d %5d " "%10"PRId64" %10.3lf" "%10lf %8u %10d %10d %10d\n", @@ -2649,7 +2758,7 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { cpi->rc.total_actual_bits, cm->base_qindex, vp9_convert_qindex_to_q(cm->base_qindex), (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0, - vp9_convert_qindex_to_q(cpi->rc.active_worst_quality), cpi->rc.avg_q, + cpi->rc.avg_q, vp9_convert_qindex_to_q(cpi->rc.ni_av_qi), vp9_convert_qindex_to_q(cpi->cq_target_quality), cpi->refresh_last_frame, cpi->refresh_golden_frame, @@ -2673,8 +2782,6 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { for (i = 0; i < MAX_MODES; ++i) fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]); - for (i = 0; i < MAX_REFS; ++i) - fprintf(fmodes, "%5d ", cpi->sub8x8_mode_chosen_counts[i]); fprintf(fmodes, "\n"); @@ -2683,25 +2790,68 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { } #endif +static void encode_without_recode_loop(VP9_COMP *cpi, + size_t *size, + uint8_t *dest, + int q) { + VP9_COMMON *const cm = &cpi->common; + vp9_clear_system_state(); + vp9_set_quantizer(cpi, q); + + // Set up entropy context depending on frame type. The decoder mandates + // the use of the default context, index 0, for keyframes and inter + // frames where the error_resilient_mode or intra_only flag is set. For + // other inter-frames the encoder currently uses only two contexts; + // context 1 for ALTREF frames and context 0 for the others. + if (cm->frame_type == KEY_FRAME) { + vp9_setup_key_frame(cpi); + } else { + if (!cm->intra_only && !cm->error_resilient_mode && !cpi->use_svc) { + cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame; + } + vp9_setup_inter_frame(cpi); + } + // Variance adaptive and in frame q adjustment experiments are mutually + // exclusive. + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + vp9_vaq_frame_setup(cpi); + } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + setup_in_frame_q_adj(cpi); + } + // transform / motion compensation build reconstruction frame + vp9_encode_frame(cpi); + + // Update the skip mb flag probabilities based on the distribution + // seen in the last encoder iteration. + // update_base_skip_probs(cpi); + vp9_clear_system_state(); +} + static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest, - int *q, + int q, int bottom_index, - int top_index, - int frame_over_shoot_limit, - int frame_under_shoot_limit) { + int top_index) { VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; int loop_count = 0; int loop = 0; int overshoot_seen = 0; int undershoot_seen = 0; int q_low = bottom_index, q_high = top_index; + int frame_over_shoot_limit; + int frame_under_shoot_limit; + + // Decide frame size bounds + vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, + &frame_under_shoot_limit, + &frame_over_shoot_limit); do { - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); - vp9_set_quantizer(cpi, *q); + vp9_set_quantizer(cpi, q); if (loop_count == 0) { // Set up entropy context depending on frame type. 
The decoder mandates @@ -2712,7 +2862,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, if (cm->frame_type == KEY_FRAME) { vp9_setup_key_frame(cpi); } else { - if (!cm->intra_only && !cm->error_resilient_mode) { + if (!cm->intra_only && !cm->error_resilient_mode && !cpi->use_svc) { cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame; } vp9_setup_inter_frame(cpi); @@ -2728,25 +2878,24 @@ static void encode_with_recode_loop(VP9_COMP *cpi, } // transform / motion compensation build reconstruction frame - vp9_encode_frame(cpi); // Update the skip mb flag probabilities based on the distribution // seen in the last encoder iteration. // update_base_skip_probs(cpi); - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); // Dummy pack of the bitstream using up to date stats to get an // accurate estimate of output frame size to determine if we need // to recode. - if (cpi->sf.recode_loop != 0) { + if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) { vp9_save_coding_context(cpi); cpi->dummy_packing = 1; - if (!cpi->sf.super_fast_rtc) + if (!cpi->sf.use_nonrd_pick_mode) vp9_pack_bitstream(cpi, dest, size); - cpi->rc.projected_frame_size = (*size) << 3; + rc->projected_frame_size = (int)(*size) << 3; vp9_restore_coding_context(cpi); if (frame_over_shoot_limit == 0) @@ -2757,9 +2906,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, loop = 0; } else { if ((cm->frame_type == KEY_FRAME) && - cpi->rc.this_key_frame_forced && - (cpi->rc.projected_frame_size < cpi->rc.max_frame_bandwidth)) { - int last_q = *q; + rc->this_key_frame_forced && + (rc->projected_frame_size < rc->max_frame_bandwidth)) { + int last_q = q; int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm)); int high_err_target = cpi->ambient_err; @@ -2771,65 +2920,65 @@ static void encode_with_recode_loop(VP9_COMP *cpi, // The key frame is not good enough or we can afford // to make it better without undue risk of popping. if ((kf_err > high_err_target && - cpi->rc.projected_frame_size <= frame_over_shoot_limit) || + rc->projected_frame_size <= frame_over_shoot_limit) || (kf_err > low_err_target && - cpi->rc.projected_frame_size <= frame_under_shoot_limit)) { + rc->projected_frame_size <= frame_under_shoot_limit)) { // Lower q_high - q_high = *q > q_low ? *q - 1 : q_low; + q_high = q > q_low ? q - 1 : q_low; // Adjust Q - *q = ((*q) * high_err_target) / kf_err; - *q = MIN((*q), (q_high + q_low) >> 1); + q = (q * high_err_target) / kf_err; + q = MIN(q, (q_high + q_low) >> 1); } else if (kf_err < low_err_target && - cpi->rc.projected_frame_size >= frame_under_shoot_limit) { + rc->projected_frame_size >= frame_under_shoot_limit) { // The key frame is much better than the previous frame // Raise q_low - q_low = *q < q_high ? *q + 1 : q_high; + q_low = q < q_high ? q + 1 : q_high; // Adjust Q - *q = ((*q) * low_err_target) / kf_err; - *q = MIN((*q), (q_high + q_low + 1) >> 1); + q = (q * low_err_target) / kf_err; + q = MIN(q, (q_high + q_low + 1) >> 1); } // Clamp Q to upper and lower limits: - *q = clamp(*q, q_low, q_high); + q = clamp(q, q_low, q_high); - loop = *q != last_q; + loop = q != last_q; } else if (recode_loop_test( cpi, frame_over_shoot_limit, frame_under_shoot_limit, - *q, MAX(q_high, top_index), bottom_index)) { + q, MAX(q_high, top_index), bottom_index)) { // Is the projected frame size out of range and are we allowed // to attempt to recode. - int last_q = *q; + int last_q = q; int retries = 0; // Frame size out of permitted range: // Update correction factor & compute new Q to try... 
// Frame is too large - if (cpi->rc.projected_frame_size > cpi->rc.this_frame_target) { + if (rc->projected_frame_size > rc->this_frame_target) { // Special case if the projected size is > the max allowed. - if (cpi->rc.projected_frame_size >= cpi->rc.max_frame_bandwidth) - q_high = cpi->rc.worst_quality; + if (rc->projected_frame_size >= rc->max_frame_bandwidth) + q_high = rc->worst_quality; // Raise Qlow as to at least the current value - q_low = *q < q_high ? *q + 1 : q_high; + q_low = q < q_high ? q + 1 : q_high; if (undershoot_seen || loop_count > 1) { // Update rate_correction_factor unless vp9_rc_update_rate_correction_factors(cpi, 1); - *q = (q_high + q_low + 1) / 2; + q = (q_high + q_low + 1) / 2; } else { // Update rate_correction_factor unless vp9_rc_update_rate_correction_factors(cpi, 0); - *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target, + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, MAX(q_high, top_index)); - while (*q < q_low && retries < 10) { + while (q < q_low && retries < 10) { vp9_rc_update_rate_correction_factors(cpi, 0); - *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target, + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, MAX(q_high, top_index)); retries++; } @@ -2838,27 +2987,27 @@ static void encode_with_recode_loop(VP9_COMP *cpi, overshoot_seen = 1; } else { // Frame is too small - q_high = *q > q_low ? *q - 1 : q_low; + q_high = q > q_low ? q - 1 : q_low; if (overshoot_seen || loop_count > 1) { vp9_rc_update_rate_correction_factors(cpi, 1); - *q = (q_high + q_low) / 2; + q = (q_high + q_low) / 2; } else { vp9_rc_update_rate_correction_factors(cpi, 0); - *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target, + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, top_index); // Special case reset for qlow for constrained quality. // This should only trigger where there is very substantial // undershoot on a frame and the auto cq level is above // the user passsed in value. if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && - *q < q_low) { - q_low = *q; + q < q_low) { + q_low = q; } - while (*q > q_high && retries < 10) { + while (q > q_high && retries < 10) { vp9_rc_update_rate_correction_factors(cpi, 0); - *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target, + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, top_index); retries++; } @@ -2868,17 +3017,17 @@ static void encode_with_recode_loop(VP9_COMP *cpi, } // Clamp Q to upper and lower limits: - *q = clamp(*q, q_low, q_high); + q = clamp(q, q_low, q_high); - loop = *q != last_q; + loop = q != last_q; } else { loop = 0; } } // Special case for overlay frame. 
- if (cpi->rc.is_src_frame_alt_ref && - (cpi->rc.projected_frame_size < cpi->rc.max_frame_bandwidth)) + if (rc->is_src_frame_alt_ref && + rc->projected_frame_size < rc->max_frame_bandwidth) loop = 0; if (loop) { @@ -2912,6 +3061,9 @@ static void get_ref_frame_flags(VP9_COMP *cpi) { if (cpi->gold_is_last) cpi->ref_frame_flags &= ~VP9_GOLD_FLAG; + if (cpi->rc.frames_till_gf_update_due == INT_MAX) + cpi->ref_frame_flags &= ~VP9_GOLD_FLAG; + if (cpi->alt_is_last) cpi->ref_frame_flags &= ~VP9_ALT_FLAG; @@ -2943,20 +3095,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, VP9_COMMON *const cm = &cpi->common; TX_SIZE t; int q; - int frame_over_shoot_limit; - int frame_under_shoot_limit; int top_index; int bottom_index; - SPEED_FEATURES *const sf = &cpi->sf; - unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height); + const SPEED_FEATURES *const sf = &cpi->sf; + const unsigned int max_mv_def = MIN(cm->width, cm->height); struct segmentation *const seg = &cm->seg; set_ext_overrides(cpi); /* Scale the source buffer, if required. */ - if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width || - cm->mi_rows * 8 != cpi->un_scaled_source->y_height) { + if (cm->mi_cols * MI_SIZE != cpi->un_scaled_source->y_width || + cm->mi_rows * MI_SIZE != cpi->un_scaled_source->y_height) { scale_and_extend_frame_nonnormative(cpi->un_scaled_source, &cpi->scaled_source); cpi->Source = &cpi->scaled_source; @@ -2965,12 +3115,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } scale_references(cpi); - // Clear down mmx registers to allow floating point in what follows. vp9_clear_system_state(); - // Clear zbin over-quant value and mode boost values. - cpi->zbin_mode_boost = 0; - // Enable or disable mode based tweaking of the zbin. // For 2 pass only used where GF/ARF prediction quality // is above a threshold. @@ -2978,7 +3124,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->zbin_mode_boost_enabled = 0; // Current default encoder behavior for the altref sign bias. - cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = cpi->rc.source_alt_ref_active; + cm->ref_frame_sign_bias[ALTREF_FRAME] = cpi->rc.source_alt_ref_active; // Set default state for segment based loop filter update flags. cm->lf.mode_ref_delta_update = 0; @@ -2987,7 +3133,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def); // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate. if (sf->auto_mv_step_size) { - if (frame_is_intra_only(&cpi->common)) { + if (frame_is_intra_only(cm)) { // Initialize max_mv_magnitude for use in the first INTER frame // after a key/intra-only frame. cpi->max_mv_magnitude = max_mv_def; @@ -2996,8 +3142,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Allow mv_steps to correspond to twice the max mv magnitude found // in the previous frame, capped by the default max_mv_magnitude based // on resolution. - cpi->mv_step_param = vp9_init_search_range( - cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude)); + cpi->mv_step_param = vp9_init_search_range(cpi, MIN(max_mv_def, 2 * + cpi->max_mv_magnitude)); cpi->max_mv_magnitude = 0; } } @@ -3020,7 +3166,11 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0); cm->frame_parallel_decoding_mode = (cpi->oxcf.frame_parallel_decoding_mode != 0); + + // By default, encoder assumes decoder can use prev_mi. 
+ cm->coding_use_prev_mi = 1; if (cm->error_resilient_mode) { + cm->coding_use_prev_mi = 0; cm->frame_parallel_decoding_mode = 1; cm->reset_frame_context = 0; cm->refresh_frame_context = 0; @@ -3034,21 +3184,17 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // static regions if indicated. // Only allowed in second pass of two pass (as requires lagged coding) // and if the relevant speed feature flag is set. - if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) { + if (cpi->pass == 2 && cpi->sf.static_segmentation) configure_static_seg_features(cpi); - } // For 1 pass CBR, check if we are dropping this frame. // Never drop on key frame. if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && cm->frame_type != KEY_FRAME) { - if (vp9_drop_frame(cpi)) { - // Update buffer level with zero size, update frame counters, and return. - vp9_update_buffer_level(cpi, 0); - cm->last_frame_type = cm->frame_type; + if (vp9_rc_drop_frame(cpi)) { vp9_rc_postencode_update_drop_frame(cpi); - cm->current_video_frame++; + ++cm->current_video_frame; return; } } @@ -3086,44 +3232,20 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_write_yuv_frame(cpi->Source); #endif - // Decide how big to make the frame. - vp9_rc_pick_frame_size_target(cpi); - - // Decide frame size bounds - vp9_rc_compute_frame_size_bounds(cpi, cpi->rc.this_frame_target, - &frame_under_shoot_limit, - &frame_over_shoot_limit); - // Decide q and q bounds. - q = vp9_rc_pick_q_and_adjust_q_bounds(cpi, - &bottom_index, - &top_index); - - // JBB : This is realtime mode. In real time mode the first frame - // should be larger. Q of 0 is disabled because we force tx size to be - // 16x16... - if (cpi->sf.super_fast_rtc) { - if (cpi->common.current_video_frame == 0) - q /= 3; - - if (q == 0) - q++; - } + q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index); if (!frame_is_intra_only(cm)) { cm->interp_filter = DEFAULT_INTERP_FILTER; /* TODO: Decide this more intelligently */ - set_high_precision_mv(cpi, (q < HIGH_PRECISION_MV_QTHRESH)); + set_high_precision_mv(cpi, q < HIGH_PRECISION_MV_QTHRESH); } - encode_with_recode_loop(cpi, - size, - dest, - &q, - bottom_index, - top_index, - frame_over_shoot_limit, - frame_under_shoot_limit); + if (cpi->sf.recode_loop == DISALLOW_RECODE) { + encode_without_recode_loop(cpi, size, dest, q); + } else { + encode_with_recode_loop(cpi, size, dest, q, bottom_index, top_index); + } // Special case code to reduce pulsing when key frames are forced at a // fixed interval. 
Note the reconstruction error if it is the frame before @@ -3170,41 +3292,30 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, update_reference_frames(cpi); for (t = TX_4X4; t <= TX_32X32; t++) - full_to_model_counts(cpi->common.counts.coef[t], - cpi->coef_counts[t]); - if (!cpi->common.error_resilient_mode && - !cpi->common.frame_parallel_decoding_mode) { - vp9_adapt_coef_probs(&cpi->common); - } - - if (!frame_is_intra_only(&cpi->common)) { - if (!cpi->common.error_resilient_mode && - !cpi->common.frame_parallel_decoding_mode) { - vp9_adapt_mode_probs(&cpi->common); - vp9_adapt_mv_probs(&cpi->common, cpi->common.allow_high_precision_mv); - } - } + full_to_model_counts(cm->counts.coef[t], cpi->coef_counts[t]); -#ifdef ENTROPY_STATS - vp9_update_mode_context_stats(cpi); -#endif + if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) + vp9_adapt_coef_probs(cm); - /* Move storing frame_type out of the above loop since it is also - * needed in motion search besides loopfilter */ - cm->last_frame_type = cm->frame_type; + if (!frame_is_intra_only(cm)) { + if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { + vp9_adapt_mode_probs(cm); + vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); + } + } #if 0 output_frame_level_debug_stats(cpi); #endif if (cpi->refresh_golden_frame == 1) - cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN; + cm->frame_flags |= FRAMEFLAGS_GOLDEN; else - cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_GOLDEN; + cm->frame_flags &= ~FRAMEFLAGS_GOLDEN; if (cpi->refresh_alt_ref_frame == 1) - cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF; + cm->frame_flags |= FRAMEFLAGS_ALTREF; else - cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF; + cm->frame_flags &= ~FRAMEFLAGS_ALTREF; get_ref_frame_flags(cpi); @@ -3253,6 +3364,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // reset to normal state now that we are done. 
if (!cm->show_existing_frame) cm->last_show_frame = cm->show_frame; + if (cm->show_frame) { // current mip will be the prev_mip for the next frame MODE_INFO *temp = cm->prev_mip; @@ -3273,6 +3385,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // update not a real frame ++cm->current_video_frame; } + // restore prev_mi cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1; @@ -3280,16 +3393,16 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { - vp9_get_svc_params(cpi); + vp9_rc_get_svc_params(cpi); encode_frame_to_data_rate(cpi, size, dest, frame_flags); } static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { - vp9_get_one_pass_cbr_params(cpi); + vp9_rc_get_one_pass_cbr_params(cpi); } else { - vp9_get_one_pass_params(cpi); + vp9_rc_get_one_pass_vbr_params(cpi); } encode_frame_to_data_rate(cpi, size, dest, frame_flags); } @@ -3300,16 +3413,16 @@ static void Pass1Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, (void) dest; (void) frame_flags; - vp9_get_first_pass_params(cpi); + vp9_rc_get_first_pass_params(cpi); vp9_set_quantizer(cpi, find_fp_qindex()); vp9_first_pass(cpi); } static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { - cpi->enable_encode_breakout = 1; + cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; - vp9_get_second_pass_params(cpi); + vp9_rc_get_second_pass_params(cpi); encode_frame_to_data_rate(cpi, size, dest, frame_flags); vp9_twopass_postencode_update(cpi, *size); @@ -3318,6 +3431,7 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, static void check_initial_width(VP9_COMP *cpi, int subsampling_x, int subsampling_y) { VP9_COMMON *const cm = &cpi->common; + if (!cpi->initial_width) { cm->subsampling_x = subsampling_x; cm->subsampling_y = subsampling_y; @@ -3331,12 +3445,12 @@ static void check_initial_width(VP9_COMP *cpi, int subsampling_x, int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) { - VP9_COMP *cpi = (VP9_COMP *) ptr; - VP9_COMMON *cm = &cpi->common; - struct vpx_usec_timer timer; - int res = 0; - const int subsampling_x = sd->uv_width < sd->y_width; - const int subsampling_y = sd->uv_height < sd->y_height; + VP9_COMP *cpi = (VP9_COMP *)ptr; + VP9_COMMON *cm = &cpi->common; + struct vpx_usec_timer timer; + int res = 0; + const int subsampling_x = sd->uv_width < sd->y_width; + const int subsampling_y = sd->uv_height < sd->y_height; check_initial_width(cpi, subsampling_x, subsampling_y); vpx_usec_timer_start(&timer); @@ -3558,11 +3672,17 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, adjust_frame_rate(cpi); } + if (cpi->svc.number_temporal_layers > 1 && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + update_layer_framerate(cpi); + restore_layer_context(cpi); + } + // start with a 0 size frame *size = 0; // Clear down mmx registers - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); /* find a free buffer for the new frame, releasing the reference previously * held. 
@@ -3587,7 +3707,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9_ENC_BORDER_IN_PIXELS); + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]; @@ -3607,8 +3727,9 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, xd->interp_kernel = vp9_get_interp_kernel( DEFAULT_INTERP_FILTER == SWITCHABLE ? EIGHTTAP : DEFAULT_INTERP_FILTER); - if (cpi->oxcf.aq_mode == VARIANCE_AQ) + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { vp9_vaq_init(); + } if (cpi->use_svc) { SvcEncode(cpi, size, dest, frame_flags); @@ -3633,6 +3754,12 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, cpi->droppable = !frame_is_reference(cpi); } + // Save layer specific state. + if (cpi->svc.number_temporal_layers > 1 && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + save_layer_context(cpi); + } + vpx_usec_timer_mark(&cmptimer); cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); @@ -3642,7 +3769,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, #if CONFIG_INTERNAL_STATS if (cpi->pass != 1) { - cpi->bytes += *size; + cpi->bytes += (int)(*size); if (cm->show_frame) { cpi->count++; @@ -3717,22 +3844,23 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags) { - VP9_COMP *cpi = (VP9_COMP *) comp; + VP9_COMP *cpi = (VP9_COMP *)comp; + VP9_COMMON *cm = &cpi->common; - if (!cpi->common.show_frame) { + if (!cm->show_frame) { return -1; } else { int ret; #if CONFIG_VP9_POSTPROC - ret = vp9_post_proc_frame(&cpi->common, dest, flags); + ret = vp9_post_proc_frame(cm, dest, flags); #else - if (cpi->common.frame_to_show) { - *dest = *cpi->common.frame_to_show; - dest->y_width = cpi->common.width; - dest->y_height = cpi->common.height; - dest->uv_width = cpi->common.width >> cpi->common.subsampling_x; - dest->uv_height = cpi->common.height >> cpi->common.subsampling_y; + if (cm->frame_to_show) { + *dest = *cm->frame_to_show; + dest->y_width = cm->width; + dest->y_height = cm->height; + dest->uv_width = cm->width >> cm->subsampling_x; + dest->uv_height = cm->height >> cm->subsampling_y; ret = 0; } else { ret = -1; @@ -3846,11 +3974,11 @@ int vp9_set_size_literal(VP9_PTR comp, unsigned int width, cm->width = width; if (cm->width * 5 < cpi->initial_width) { cm->width = cpi->initial_width / 5 + 1; - printf("Warning: Desired width too small, changed to %d \n", cm->width); + printf("Warning: Desired width too small, changed to %d\n", cm->width); } if (cm->width > cpi->initial_width) { cm->width = cpi->initial_width; - printf("Warning: Desired width too large, changed to %d \n", cm->width); + printf("Warning: Desired width too large, changed to %d\n", cm->width); } } @@ -3858,11 +3986,11 @@ int vp9_set_size_literal(VP9_PTR comp, unsigned int width, cm->height = height; if (cm->height * 5 < cpi->initial_height) { cm->height = cpi->initial_height / 5 + 1; - printf("Warning: Desired height too small, changed to %d \n", cm->height); + printf("Warning: Desired height too small, changed to %d\n", cm->height); } if (cm->height > cpi->initial_height) { cm->height = cpi->initial_height; - printf("Warning: Desired height too large, changed to %d \n", cm->height); + printf("Warning: Desired height too large, 
changed to %d\n", cm->height); } } diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index d928312b6..fd2356591 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -23,7 +23,9 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_lookahead.h" +#include "vp9/encoder/vp9_mbgraph.h" #include "vp9/encoder/vp9_mcomp.h" #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_ratectrl.h" @@ -35,17 +37,17 @@ extern "C" { #endif -#define DISABLE_RC_LONG_TERM_MEM 0 // #define MODE_TEST_HIT_STATS -// #define SPEEDSTATS 1 #if CONFIG_MULTIPLE_ARF // Set MIN_GF_INTERVAL to 1 for the full decomposition. #define MIN_GF_INTERVAL 2 #else #define MIN_GF_INTERVAL 4 #endif -#define DEFAULT_GF_INTERVAL 7 +#define DEFAULT_GF_INTERVAL 10 +#define DEFAULT_KF_BOOST 2000 +#define DEFAULT_GF_BOOST 2000 #define KEY_FRAME_CONTEXT 5 @@ -78,42 +80,6 @@ typedef struct { FRAME_CONTEXT fc; } CODING_CONTEXT; -typedef struct { - double frame; - double intra_error; - double coded_error; - double sr_coded_error; - double ssim_weighted_pred_err; - double pcnt_inter; - double pcnt_motion; - double pcnt_second_ref; - double pcnt_neutral; - double MVr; - double mvr_abs; - double MVc; - double mvc_abs; - double MVrv; - double MVcv; - double mv_in_out_count; - double new_mv_count; - double duration; - double count; -} FIRSTPASS_STATS; - -typedef struct { - struct { - int err; - union { - int_mv mv; - MB_PREDICTION_MODE mode; - } m; - } ref[MAX_REF_FRAMES]; -} MBGRAPH_MB_STATS; - -typedef struct { - MBGRAPH_MB_STATS *mb_stats; -} MBGRAPH_FRAME_STATS; - // This enumerator type needs to be kept aligned with the mode order in // const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code. typedef enum { @@ -170,7 +136,8 @@ typedef enum { NSTEP = 1, HEX = 2, BIGDIA = 3, - SQUARE = 4 + SQUARE = 4, + FAST_HEX = 5 } SEARCH_METHODS; typedef enum { @@ -231,18 +198,50 @@ typedef enum { LAST_FRAME_PARTITION_ALL = 2 } LAST_FRAME_PARTITION_METHOD; +typedef enum { + // No recode. + DISALLOW_RECODE = 0, + // Allow recode for KF and exceeding maximum frame bandwidth. + ALLOW_RECODE_KFMAXBW = 1, + // Allow recode only for KF/ARF/GF frames. + ALLOW_RECODE_KFARFGF = 2, + // Allow recode for all frames based on bitrate constraints. + ALLOW_RECODE = 3, +} RECODE_LOOP_TYPE; + +typedef enum { + // encode_breakout is disabled. + ENCODE_BREAKOUT_DISABLED = 0, + // encode_breakout is enabled. + ENCODE_BREAKOUT_ENABLED = 1, + // encode_breakout is enabled with small max_thresh limit. + ENCODE_BREAKOUT_LIMITED = 2 +} ENCODE_BREAKOUT_TYPE; + +typedef enum { + // Search partitions using RD/NONRD criterion + SEARCH_PARTITION = 0, + + // Always use a fixed size partition + FIXED_PARTITION = 1, + + // Use a fixed size partition in every 64X64 SB, where the size is + // determined based on source variance + VAR_BASED_FIXED_PARTITION = 2, + + // Use an arbitrary partitioning scheme based on source variance within + // a 64X64 SB + VAR_BASED_PARTITION +} PARTITION_SEARCH_TYPE; + typedef struct { - // This flag refers to whether or not to perform rd optimization. - int RD; + // Frame level coding parameter update + int frame_parameter_update; // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). 
SEARCH_METHODS search_method; - // Recode_loop can be: - // 0 means we only encode a frame once - // 1 means we can re-encode based on bitrate constraints on any frame - // 2 means we can only recode gold, alt, and key frames. - int recode_loop; + RECODE_LOOP_TYPE recode_loop; // Subpel_search_method can only be subpel_tree which does a subpixel // logarithmic search that keeps stepping at 1/2 pixel units until @@ -321,16 +320,6 @@ typedef struct { // TODO(JBB): remove this as its no longer used. - // If set partition size will always be always_this_block_size. - int use_one_partition_size_always; - - // Skip rectangular partition test when partition type none gives better - // rd than partition type split. - int less_rectangular_check; - - // Disable testing non square partitions. (eg 16x32) - int use_square_partition_only; - // After looking at the first set of modes (set by index here), skip // checking modes for reference frames that don't match the reference frame // of the best so far. @@ -339,9 +328,18 @@ typedef struct { // TODO(JBB): Remove this. int reference_masking; - // Used in conjunction with use_one_partition_size_always. + PARTITION_SEARCH_TYPE partition_search_type; + + // Used if partition_search_type = FIXED_SIZE_PARTITION BLOCK_SIZE always_this_block_size; + // Skip rectangular partition test when partition type none gives better + // rd than partition type split. + int less_rectangular_check; + + // Disable testing non square partitions. (eg 16x32) + int use_square_partition_only; + // Sets min and max partition sizes for this 64x64 region based on the // same 64x64 in last encoded frame, and the left and above neighbor. AUTO_MIN_MAX_MODE auto_min_max_partition_size; @@ -364,11 +362,6 @@ typedef struct { // inter modes or to enable it always. int disable_split_mask; - // TODO(jbb): Remove this and everything that uses it. It's only valid if - // we were doing small to large partition checks. We currently do the - // reverse. - int using_small_partition_info; - // TODO(jingning): combine the related motion search speed features // This allows us to use motion search at other sizes as a starting // point for this motion search and limits the search range around it. @@ -417,10 +410,24 @@ typedef struct { // by only looking at counts from 1/2 the bands. int use_fast_coef_updates; // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced - // This flag control the use of the new super fast rtc mode - int super_fast_rtc; + // This flag controls the use of non-RD mode decision. + int use_nonrd_pick_mode; + + // This variable sets the encode_breakout threshold. Currently, it is only + // enabled in real time mode. 
+ int encode_breakout_thresh; } SPEED_FEATURES; +typedef struct { + RATE_CONTROL rc; + int target_bandwidth; + int64_t starting_buffer_level; + int64_t optimal_buffer_level; + int64_t maximum_buffer_size; + double framerate; + int avg_frame_size; +} LAYER_CONTEXT; + typedef struct VP9_COMP { DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]); @@ -454,7 +461,7 @@ typedef struct VP9_COMP { YV12_BUFFER_CONFIG *un_scaled_source; YV12_BUFFER_CONFIG scaled_source; - unsigned int key_frame_frequency; + int key_frame_frequency; int gold_is_last; // gold same as last frame ( short circuit gold searches) int alt_is_last; // Alt same as last ( short circuit altref search) @@ -465,9 +472,6 @@ typedef struct VP9_COMP { int gld_fb_idx; int alt_fb_idx; - int current_layer; - int use_svc; - #if CONFIG_MULTIPLE_ARF int alt_ref_fb_idx[REF_FRAMES - 3]; #endif @@ -498,12 +502,6 @@ typedef struct VP9_COMP { // Ambient reconstruction err target for force key frames int ambient_err; - unsigned int mode_chosen_counts[MAX_MODES]; - unsigned int sub8x8_mode_chosen_counts[MAX_REFS]; - int64_t mode_skip_mask; - int ref_frame_mask; - int set_ref_frame_mask; - int rd_threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES]; int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; int rd_thresh_sub8x8[MAX_SEGMENTS][BLOCK_SIZES][MAX_REFS]; @@ -543,7 +541,6 @@ typedef struct VP9_COMP { vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES]; vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES]; - int64_t target_bandwidth; struct vpx_codec_pkt_list *output_pkt_list; MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS]; @@ -567,6 +564,13 @@ typedef struct VP9_COMP { unsigned int max_mv_magnitude; int mv_step_param; + // Default value is 1. From first pass stats, encode_breakout may be disabled. + ENCODE_BREAKOUT_TYPE allow_encode_breakout; + + // Get threshold from external input. In real time mode, it can be + // overwritten according to encoding speed. + int encode_breakout; + unsigned char *segmentation_map; // segment threashold for encode breakout @@ -588,52 +592,15 @@ typedef struct VP9_COMP { uint64_t time_pick_lpf; uint64_t time_encode_sb_row; - struct twopass_rc { - unsigned int section_intra_rating; - unsigned int next_iiratio; - unsigned int this_iiratio; - FIRSTPASS_STATS total_stats; - FIRSTPASS_STATS this_frame_stats; - FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start; - FIRSTPASS_STATS total_left_stats; - int first_pass_done; - int64_t bits_left; - int64_t clip_bits_total; - double avg_iiratio; - double modified_error_min; - double modified_error_max; - double modified_error_total; - double modified_error_left; - double kf_intra_err_min; - double gf_intra_err_min; - int static_scene_max_gf_interval; - int kf_bits; - // Remaining error from uncoded frames in a gf group. 
Two pass use only - int64_t gf_group_error_left; - - // Projected total bits available for a key frame group of frames - int64_t kf_group_bits; - - // Error score of frames still to be coded in kf group - int64_t kf_group_error_left; - - // Projected Bits available for a group of frames including 1 GF or ARF - int64_t gf_group_bits; - // Bits for the golden frame or ARF - 2 pass only - int gf_bits; - int alt_extra_bits; - - int sr_update_lag; - - int kf_zeromotion_pct; - int gf_zeromotion_pct; - } twopass; + struct twopass_rc twopass; YV12_BUFFER_CONFIG alt_ref_buffer; YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS]; int fixed_divide[512]; #if CONFIG_INTERNAL_STATS + unsigned int mode_chosen_counts[MAX_MODES]; + int count; double total_y; double total_u; @@ -684,9 +651,17 @@ typedef struct VP9_COMP { int initial_width; int initial_height; - int number_spatial_layers; - int enable_encode_breakout; // Default value is 1. From first pass stats, - // encode_breakout may be disabled. + int use_svc; + + struct svc { + int spatial_layer_id; + int temporal_layer_id; + int number_spatial_layers; + int number_temporal_layers; + // Layer context used for rate control in CBR mode, only defined for + // temporal layers for now. + LAYER_CONTEXT layer_context[VPX_TS_MAX_LAYERS]; + } svc; #if CONFIG_MULTIPLE_ARF // ARF tracking variables. @@ -741,8 +716,6 @@ void vp9_encode_frame(VP9_COMP *cpi); void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size); -void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x); - void vp9_set_speed_features(VP9_COMP *cpi); int vp9_calc_ss_err(const YV12_BUFFER_CONFIG *source, diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 1aaa4162b..87f20fa1c 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -98,10 +98,14 @@ static int full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x, mvp_full.col >>= 3; mvp_full.row >>= 3; - bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, - sadpb, further_steps, 1, - &cpi->fn_ptr[bsize], - &ref_mv.as_mv, tmp_mv); + if (cpi->sf.search_method == FAST_HEX) { + vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, &cpi->fn_ptr[bsize], + 1, &ref_mv.as_mv, &tmp_mv->as_mv); + } else { + vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, + 1, &cpi->fn_ptr[bsize], &ref_mv.as_mv, + &tmp_mv->as_mv); + } x->mv_col_min = tmp_col_min; x->mv_col_max = tmp_col_max; @@ -130,9 +134,50 @@ static int full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x, // calculate the bit cost on motion vector *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + return bestsme; +} +static void sub_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *tmp_mv) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; + struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; + int ref = mbmi->ref_frame[0]; + int_mv ref_mv = mbmi->ref_mvs[ref][0]; + int dis; - return bestsme; + const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, + ref); + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. 
+ for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[i] = xd->plane[i].pre[0]; + + setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); + } + + tmp_mv->as_mv.col >>= 3; + tmp_mv->as_mv.row >>= 3; + + cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv, + cpi->common.allow_high_precision_mv, + x->errorperbit, + &cpi->fn_ptr[bsize], + cpi->sf.subpel_force_stop, + cpi->sf.subpel_iters_per_step, + x->nmvjointcost, x->mvcost, + &dis, &x->pred_sse[ref]); + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[0] = backup_yv12[i]; + } } // TODO(jingning) placeholder for inter-frame non-RD mode decision. @@ -145,16 +190,21 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); - MB_PREDICTION_MODE this_mode; - MV_REFERENCE_FRAME ref_frame; + MB_PREDICTION_MODE this_mode, best_mode = ZEROMV; + MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; int64_t best_rd = INT64_MAX; int64_t this_rd; - int64_t cost[4]= { 0, 100, 150, 205 }; + static const int cost[4]= { 0, 50, 75, 100 }; + + const int64_t inter_mode_thresh = 300; + const int64_t intra_mode_cost = 50; x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; @@ -164,12 +214,17 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // initialize mode decisions *returnrate = INT_MAX; + *returndistortion = INT64_MAX; vpx_memset(mbmi, 0, sizeof(MB_MODE_INFO)); mbmi->sb_type = bsize; mbmi->ref_frame[0] = NONE; mbmi->ref_frame[1] = NONE; mbmi->tx_size = MIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + mbmi->interp_filter = cpi->common.interp_filter == SWITCHABLE ? 
+ EIGHTTAP : cpi->common.interp_filter; + mbmi->skip = 0; + mbmi->segment_id = 0; for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; @@ -194,12 +249,14 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, clamp_mv2(&frame_mv[NEARESTMV][ref_frame].as_mv, xd); clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd); + mbmi->ref_frame[0] = ref_frame; + for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { - int rate = cost[this_mode - NEARESTMV]; + int rate = cost[INTER_OFFSET(this_mode)]; int64_t dist; if (this_mode == NEWMV) { - if (this_rd < 300) + if (this_rd < 500) continue; x->mode_sad[ref_frame][INTER_OFFSET(NEWMV)] = @@ -208,34 +265,55 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (frame_mv[NEWMV][ref_frame].as_int == INVALID_MV) continue; + + sub_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame]); } - dist = x->mode_sad[ref_frame][INTER_OFFSET(this_mode)]; + mbmi->mode = this_mode; + mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + + dist = cpi->fn_ptr[bsize].sdf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, INT_MAX); this_rd = rate + dist; if (this_rd < best_rd) { best_rd = this_rd; - mbmi->mode = this_mode; - mbmi->ref_frame[0] = ref_frame; - mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int; - xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int; - mbmi->interp_filter = cpi->common.interp_filter == SWITCHABLE ? - EIGHTTAP : cpi->common.interp_filter; - - mbmi->ref_frame[1] = INTRA_FRAME; - mbmi->tx_size = max_txsize_lookup[bsize]; - mbmi->uv_mode = this_mode; - mbmi->skip_coeff = 0; - mbmi->sb_type = bsize; - mbmi->segment_id = 0; + best_mode = this_mode; + best_ref_frame = ref_frame; } } } - // TODO(jingning) sub-pixel motion search, if NEWMV is chosen + mbmi->mode = best_mode; + mbmi->ref_frame[0] = best_ref_frame; + mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int; + xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int; - // TODO(jingning) intra prediction search, if the best SAD is above a certain + // Perform intra prediction search, if the best SAD is above a certain // threshold. + if (best_rd > inter_mode_thresh) { + for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) { + vp9_predict_intra_block(xd, 0, b_width_log2(bsize), + mbmi->tx_size, this_mode, + &p->src.buf[0], p->src.stride, + &pd->dst.buf[0], pd->dst.stride, 0, 0, 0); + + this_rd = cpi->fn_ptr[bsize].sdf(p->src.buf, + p->src.stride, + pd->dst.buf, + pd->dst.stride, INT_MAX); + + if (this_rd + intra_mode_cost < best_rd) { + best_rd = this_rd; + mbmi->mode = this_mode; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->uv_mode = this_mode; + mbmi->mv[0].as_int = INVALID_MV; + } + } + } return INT64_MAX; } diff --git a/vp9/encoder/vp9_psnr.c b/vp9/encoder/vp9_psnr.c deleted file mode 100644 index 58294e15a..000000000 --- a/vp9/encoder/vp9_psnr.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <math.h> - -#include "vpx_scale/yv12config.h" - -#define MAX_PSNR 100 - -double vp9_mse2psnr(double samples, double peak, double mse) { - double psnr; - - if (mse > 0.0) - psnr = 10.0 * log10(peak * peak * samples / mse); - else - psnr = MAX_PSNR; // Limit to prevent / 0 - - if (psnr > MAX_PSNR) - psnr = MAX_PSNR; - - return psnr; -} diff --git a/vp9/encoder/vp9_psnr.h b/vp9/encoder/vp9_psnr.h deleted file mode 100644 index ffe00ed2c..000000000 --- a/vp9/encoder/vp9_psnr.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_ENCODER_VP9_PSNR_H_ -#define VP9_ENCODER_VP9_PSNR_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -double vp9_mse2psnr(double samples, double peak, double mse); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP9_ENCODER_VP9_PSNR_H_ diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index a2eea1cd7..372c36221 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -26,7 +26,7 @@ void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - int i, non_zero_count = count, eob = -1; + int i, non_zero_count = (int)count, eob = -1; const int zbins[2] = { zbin_ptr[0] + zbin_oq_value, zbin_ptr[1] + zbin_oq_value }; const int nzbins[2] = { zbins[0] * -1, @@ -37,7 +37,7 @@ void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count, if (!skip_block) { // Pre-scan pass - for (i = count - 1; i >= 0; i--) { + for (i = (int)count - 1; i >= 0; i--) { const int rc = scan[i]; const int coeff = coeff_ptr[rc]; @@ -79,55 +79,47 @@ void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - int i, rc, eob; - int zbins[2], nzbins[2]; - int x, y, z, sz; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1), + ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) }; + const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; + int idx = 0; int idx_arr[1024]; + int i, eob = -1; - vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); - vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); - - eob = -1; - - // Base ZBIN - zbins[0] = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1); - zbins[1] = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1); - nzbins[0] = zbins[0] * -1; - nzbins[1] = zbins[1] * -1; + vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); if (!skip_block) { // Pre-scan pass for (i = 0; i < n_coeffs; i++) { - rc = scan[i]; - z = coeff_ptr[rc]; + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; // If the coefficient is out of the base ZBIN range, keep it for // quantization. - if (z >= zbins[rc != 0] || z <= nzbins[rc != 0]) + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; } // Quantization pass: only process the coefficients selected in // pre-scan pass. Note: idx can be zero. 
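For reference, a minimal standalone sketch of the per-coefficient arithmetic in the rewritten vp9_quantize_b_32x32_c() loop that follows: take the absolute value, add half of the rounding term, apply the two-stage multiply/shift, restore the sign, and use half the dequant step. The quant/round/shift/dequant values below are made-up placeholders, not entries from the real quantizer tables.

#include <stdio.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

int main(void) {
  /* One hypothetical transform coefficient and placeholder quantizer
     parameters (not taken from the libvpx tables). */
  const int coeff = -1234;
  const int quant = 20000, quant_shift = 16384, round = 96, dequant = 52;

  const int coeff_sign = coeff >> 31;                 /* 0 or -1 */
  int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  /* |coeff| */
  int tmp, qcoeff, dqcoeff;

  /* 32x32 blocks use half the usual rounding term; the real loop also
     clamps abs_coeff to the int16_t range before quantizing. */
  abs_coeff += ROUND_POWER_OF_TWO(round, 1);
  tmp = ((((abs_coeff * quant) >> 16) + abs_coeff) * quant_shift) >> 15;

  qcoeff = (tmp ^ coeff_sign) - coeff_sign;           /* restore the sign */
  dqcoeff = qcoeff * dequant / 2;                     /* 32x32 uses dequant / 2 */

  printf("qcoeff=%d dqcoeff=%d\n", qcoeff, dqcoeff);
  return 0;
}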
for (i = 0; i < idx; i++) { - rc = scan[idx_arr[i]]; - - z = coeff_ptr[rc]; - sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; // x = abs(z) - - x += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - x = clamp(x, INT16_MIN, INT16_MAX); - y = ((((x * quant_ptr[rc != 0]) >> 16) + x) * - quant_shift_ptr[rc != 0]) >> 15; // quantize (x) - - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / 2; // dequantized value - - if (y) - eob = idx_arr[i]; // last nonzero coeffs + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * + quant_shift_ptr[rc != 0]) >> 15; + + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + + if (tmp) + eob = idx_arr[i]; } } *eob_ptr = eob + 1; @@ -136,8 +128,8 @@ void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, const int16_t *scan, const int16_t *iscan) { MACROBLOCKD *const xd = &x->e_mbd; - struct macroblock_plane* p = &x->plane[plane]; - struct macroblockd_plane* pd = &xd->plane[plane]; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; vp9_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block, @@ -223,38 +215,30 @@ void vp9_init_quantizer(VP9_COMP *cpi) { } void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { - int i; - VP9_COMMON *const cm = &cpi->common; + const VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - int zbin_extra; - int segment_id = xd->mi_8x8[0]->mbmi.segment_id; - const int qindex = vp9_get_qindex(&cpi->common.seg, segment_id, - cpi->common.base_qindex); - - int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q); + const int segment_id = xd->mi_8x8[0]->mbmi.segment_id; + const int qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); + const int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q); + const int zbin = cpi->zbin_mode_boost + x->act_zbin_adj; + int i; // Y - zbin_extra = (cpi->common.y_dequant[qindex][1] * - (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; - x->plane[0].quant = cpi->y_quant[qindex]; x->plane[0].quant_shift = cpi->y_quant_shift[qindex]; x->plane[0].zbin = cpi->y_zbin[qindex]; x->plane[0].round = cpi->y_round[qindex]; - x->plane[0].zbin_extra = (int16_t)zbin_extra; - x->e_mbd.plane[0].dequant = cpi->common.y_dequant[qindex]; + x->plane[0].zbin_extra = (int16_t)((cm->y_dequant[qindex][1] * zbin) >> 7); + xd->plane[0].dequant = cm->y_dequant[qindex]; // UV - zbin_extra = (cpi->common.uv_dequant[qindex][1] * - (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; - for (i = 1; i < 3; i++) { x->plane[i].quant = cpi->uv_quant[qindex]; x->plane[i].quant_shift = cpi->uv_quant_shift[qindex]; x->plane[i].zbin = cpi->uv_zbin[qindex]; x->plane[i].round = cpi->uv_round[qindex]; - x->plane[i].zbin_extra = (int16_t)zbin_extra; - x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex]; + x->plane[i].zbin_extra = (int16_t)((cm->uv_dequant[qindex][1] * zbin) >> 7); + xd->plane[i].dequant = cm->uv_dequant[qindex]; } #if CONFIG_ALPHA @@ -263,18 +247,14 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { 
x->plane[3].zbin = cpi->a_zbin[qindex]; x->plane[3].round = cpi->a_round[qindex]; x->plane[3].zbin_extra = (int16_t)zbin_extra; - x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex]; + xd->plane[3].dequant = cm->a_dequant[qindex]; #endif - x->skip_block = vp9_segfeature_active(&cpi->common.seg, segment_id, - SEG_LVL_SKIP); - - /* save this macroblock QIndex for vp9_update_zbin_extra() */ + x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); x->q_index = qindex; - /* R/D setup */ - cpi->mb.errorperbit = rdmult >> 6; - cpi->mb.errorperbit += (cpi->mb.errorperbit == 0); + x->errorperbit = rdmult >> 6; + x->errorperbit += (x->errorperbit == 0); vp9_initialize_me_consts(cpi, x->q_index); } diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 701557238..89aa82140 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -8,23 +8,24 @@ * be found in the AUTHORS file in the root of the source tree. */ - -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <limits.h> #include <assert.h> +#include <limits.h> #include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_common.h" -#include "vp9/encoder/vp9_ratectrl.h" #include "vp9/common/vp9_entropymode.h" -#include "vpx_mem/vpx_mem.h" -#include "vp9/common/vp9_systemdependent.h" -#include "vp9/encoder/vp9_encodemv.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_systemdependent.h" + +#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_ratectrl.h" #define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1 @@ -209,51 +210,60 @@ static int estimate_bits_at_q(int frame_kind, int q, int mbs, : (bpm * mbs) >> BPER_MB_NORMBITS; } - -static void calc_iframe_target_size(VP9_COMP *cpi) { - const VP9_CONFIG *oxcf = &cpi->oxcf; - RATE_CONTROL *const rc = &cpi->rc; - int target; - - vp9_clear_system_state(); // __asm emms; - - // For 1-pass. - if (cpi->pass == 0 && oxcf->end_usage == USAGE_STREAM_FROM_SERVER) { - if (cpi->common.current_video_frame == 0) { - target = oxcf->starting_buffer_level / 2; - } else { - // TODO(marpan): Add in adjustment based on Q. - // If this keyframe was forced, use a more recent Q estimate. - // int Q = (cpi->common.frame_flags & FRAMEFLAGS_KEY) ? - // cpi->rc.avg_frame_qindex : cpi->rc.ni_av_qi; - int initial_boost = 32; - // Boost depends somewhat on frame rate. - int kf_boost = MAX(initial_boost, (int)(2 * cpi->output_framerate - 16)); - // Adjustment up based on q: need to fix. - // kf_boost = kf_boost * kfboost_qadjust(Q) / 100; - // Frame separation adjustment (down). - if (rc->frames_since_key < cpi->output_framerate / 2) { - kf_boost = (int)(kf_boost * rc->frames_since_key / - (cpi->output_framerate / 2)); - } - kf_boost = (kf_boost < 16) ? 16 : kf_boost; - target = ((16 + kf_boost) * rc->per_frame_bandwidth) >> 4; - } - rc->active_worst_quality = rc->worst_quality; - } else { - target = rc->per_frame_bandwidth; +int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { + const RATE_CONTROL *rc = &cpi->rc; + const int min_frame_target = MAX(rc->min_frame_bandwidth, + rc->av_per_frame_bandwidth >> 5); + if (target < min_frame_target) + target = min_frame_target; + if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { + // If there is an active ARF at this location use the minimum + // bits on this frame even if it is a constructed arf. 
+ // The active maximum quantizer insures that an appropriate + // number of bits will be spent if needed for constructed ARFs. + target = min_frame_target; } + // Clip the frame target to the maximum allowed value. + if (target > rc->max_frame_bandwidth) + target = rc->max_frame_bandwidth; + return target; +} +int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) { + const RATE_CONTROL *rc = &cpi->rc; + const VP9_CONFIG *oxcf = &cpi->oxcf; if (oxcf->rc_max_intra_bitrate_pct) { - const int max_rate = rc->per_frame_bandwidth * + const int max_rate = rc->av_per_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100; target = MIN(target, max_rate); } - rc->this_frame_target = target; + if (target > rc->max_frame_bandwidth) + target = rc->max_frame_bandwidth; + return target; +} + + +// Update the buffer level for higher layers, given the encoded current layer. +static void update_layer_buffer_level(VP9_COMP *const cpi, + int encoded_frame_size) { + int temporal_layer = 0; + int current_temporal_layer = cpi->svc.temporal_layer_id; + for (temporal_layer = current_temporal_layer + 1; + temporal_layer < cpi->svc.number_temporal_layers; ++temporal_layer) { + LAYER_CONTEXT *lc = &cpi->svc.layer_context[temporal_layer]; + RATE_CONTROL *lrc = &lc->rc; + int bits_off_for_this_layer = (int)(lc->target_bandwidth / lc->framerate - + encoded_frame_size); + lrc->bits_off_target += bits_off_for_this_layer; + + // Clip buffer level to maximum buffer size for the layer. + lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size); + lrc->buffer_level = lrc->bits_off_target; + } } // Update the buffer level: leaky bucket model. -void vp9_update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { +static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { const VP9_COMMON *const cm = &cpi->common; const VP9_CONFIG *oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; @@ -266,14 +276,18 @@ void vp9_update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { } // Clip the buffer level to the maximum specified buffer size. - rc->buffer_level = MIN(rc->bits_off_target, oxcf->maximum_buffer_size); + rc->bits_off_target = MIN(rc->bits_off_target, oxcf->maximum_buffer_size); + rc->buffer_level = rc->bits_off_target; + + if (cpi->use_svc && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + update_layer_buffer_level(cpi, encoded_frame_size); + } } -int vp9_drop_frame(VP9_COMP *cpi) { +int vp9_rc_drop_frame(VP9_COMP *cpi) { const VP9_CONFIG *oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; - if (!oxcf->drop_frames_water_mark) { return 0; } else { @@ -284,7 +298,7 @@ int vp9_drop_frame(VP9_COMP *cpi) { // If buffer is below drop_mark, for now just drop every other frame // (starting with the next frame) until it increases back over drop_mark. int drop_mark = (int)(oxcf->drop_frames_water_mark * - oxcf->optimal_buffer_level / 100); + oxcf->optimal_buffer_level / 100); if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) { --rc->decimation_factor; @@ -308,127 +322,12 @@ int vp9_drop_frame(VP9_COMP *cpi) { } } -// Adjust active_worst_quality level based on buffer level. -static int adjust_active_worst_quality_from_buffer_level(const VP9_CONFIG *oxcf, - const RATE_CONTROL *rc) { - // Adjust active_worst_quality: If buffer is above the optimal/target level, - // bring active_worst_quality down depending on fullness over buffer. 
- // If buffer is below the optimal level, let the active_worst_quality go from - // ambient Q (at buffer = optimal level) to worst_quality level - // (at buffer = critical level). - - int active_worst_quality = rc->active_worst_quality; - // Maximum limit for down adjustment, ~20%. - int max_adjustment_down = active_worst_quality / 5; - // Buffer level below which we push active_worst to worst_quality. - int critical_level = oxcf->optimal_buffer_level >> 2; - int adjustment = 0; - int buff_lvl_step = 0; - if (rc->buffer_level > oxcf->optimal_buffer_level) { - // Adjust down. - if (max_adjustment_down) { - buff_lvl_step = (int)((oxcf->maximum_buffer_size - - oxcf->optimal_buffer_level) / max_adjustment_down); - if (buff_lvl_step) - adjustment = (int)((rc->buffer_level - oxcf->optimal_buffer_level) / - buff_lvl_step); - active_worst_quality -= adjustment; - } - } else if (rc->buffer_level > critical_level) { - // Adjust up from ambient Q. - if (critical_level) { - buff_lvl_step = (oxcf->optimal_buffer_level - critical_level); - if (buff_lvl_step) { - adjustment = (rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) * - (oxcf->optimal_buffer_level - rc->buffer_level) / - buff_lvl_step; - } - active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment; - } - } else { - // Set to worst_quality if buffer is below critical level. - active_worst_quality = rc->worst_quality; - } - return active_worst_quality; -} - -// Adjust target frame size with respect to the buffering constraints: -static int target_size_from_buffer_level(const VP9_CONFIG *oxcf, - const RATE_CONTROL *rc) { - int target = rc->this_frame_target; - const int64_t diff = oxcf->optimal_buffer_level - rc->buffer_level; - const int one_pct_bits = 1 + oxcf->optimal_buffer_level / 100; - - if (diff > 0) { - // Lower the target bandwidth for this frame. - const int pct_low = MIN(diff / one_pct_bits, oxcf->under_shoot_pct); - target -= (target * pct_low) / 200; - } else if (diff < 0) { - // Increase the target bandwidth for this frame. - const int pct_high = MIN(-diff / one_pct_bits, oxcf->over_shoot_pct); - target += (target * pct_high) / 200; - } - - return target; -} - -static void calc_pframe_target_size(VP9_COMP *const cpi) { - RATE_CONTROL *const rc = &cpi->rc; - const VP9_CONFIG *const oxcf = &cpi->oxcf; - int min_frame_target; - rc->this_frame_target = rc->per_frame_bandwidth; - - if (cpi->pass == 0 && oxcf->end_usage == USAGE_STREAM_FROM_SERVER) { - // Need to decide how low min_frame_target should be for 1-pass CBR. - // For now, use: cpi->rc.av_per_frame_bandwidth / 16: - min_frame_target = MAX(rc->av_per_frame_bandwidth >> 4, - FRAME_OVERHEAD_BITS); - rc->this_frame_target = target_size_from_buffer_level(oxcf, rc); - // Adjust qp-max based on buffer level. - rc->active_worst_quality = - adjust_active_worst_quality_from_buffer_level(oxcf, rc); - - if (rc->this_frame_target < min_frame_target) - rc->this_frame_target = min_frame_target; - return; - } - - // Check that the total sum of adjustments is not above the maximum allowed. - // That is, having allowed for the KF and GF penalties, we have not pushed - // the current inter-frame target too low. If the adjustment we apply here is - // not capable of recovering all the extra bits we have spent in the KF or GF, - // then the remainder will have to be recovered over a longer time span via - // other buffer / rate control mechanisms. 
- min_frame_target = MAX(rc->min_frame_bandwidth, - rc->av_per_frame_bandwidth >> 5); - - if (rc->this_frame_target < min_frame_target) - rc->this_frame_target = min_frame_target; - - // Adjust target frame size for Golden Frames: - if (cpi->refresh_golden_frame) { - // If we are using alternate ref instead of gf then do not apply the boost - // It will instead be applied to the altref update - // Jims modified boost - if (!rc->source_alt_ref_active) { - // The spend on the GF is defined in the two pass code - // for two pass encodes - rc->this_frame_target = rc->per_frame_bandwidth; - } else { - // If there is an active ARF at this location use the minimum - // bits on this frame even if it is a constructed arf. - // The active maximum quantizer insures that an appropriate - // number of bits will be spent if needed for constructed ARFs. - rc->this_frame_target = 0; - } - } -} - static double get_rate_correction_factor(const VP9_COMP *cpi) { if (cpi->common.frame_type == KEY_FRAME) { return cpi->rc.key_frame_rate_correction_factor; } else { - if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) + if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && + !(cpi->use_svc && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) return cpi->rc.gf_rate_correction_factor; else return cpi->rc.rate_correction_factor; @@ -439,7 +338,8 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) { if (cpi->common.frame_type == KEY_FRAME) { cpi->rc.key_frame_rate_correction_factor = factor; } else { - if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) + if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && + !(cpi->use_svc && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) cpi->rc.gf_rate_correction_factor = factor; else cpi->rc.rate_correction_factor = factor; @@ -455,7 +355,7 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { int projected_size_based_on_q = 0; // Clear down mmx registers to allow floating point in what follows - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); // Work out how big we would have expected the frame to be at this Q given // the current correction factor. @@ -463,7 +363,6 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { projected_size_based_on_q = estimate_bits_at_q(cpi->common.frame_type, q, cpi->common.MBs, rate_correction_factor); - // Work out a size correction factor. if (projected_size_based_on_q > 0) correction_factor = (100 * cpi->rc.projected_frame_size) / @@ -562,13 +461,206 @@ static int get_active_quality(int q, int gfu_boost, int low, int high, } } -int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, - int *bottom_index, int *top_index) { +static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) { + int active_worst_quality; + if (cpi->common.frame_type == KEY_FRAME) { + if (cpi->common.current_video_frame == 0) { + active_worst_quality = cpi->rc.worst_quality; + } else { + // Choose active worst quality twice as large as the last q. + active_worst_quality = cpi->rc.last_q[KEY_FRAME] * 2; + } + } else if (!cpi->rc.is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + if (cpi->common.current_video_frame == 1) { + active_worst_quality = cpi->rc.last_q[KEY_FRAME] * 5 / 4; + } else { + // Choose active worst quality twice as large as the last q. 
+ active_worst_quality = cpi->rc.last_q[INTER_FRAME]; + } + } else { + if (cpi->common.current_video_frame == 1) { + active_worst_quality = cpi->rc.last_q[KEY_FRAME] * 2; + } else { + // Choose active worst quality twice as large as the last q. + active_worst_quality = cpi->rc.last_q[INTER_FRAME] * 2; + } + } + if (active_worst_quality > cpi->rc.worst_quality) + active_worst_quality = cpi->rc.worst_quality; + return active_worst_quality; +} + +// Adjust active_worst_quality level based on buffer level. +static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { + // Adjust active_worst_quality: If buffer is above the optimal/target level, + // bring active_worst_quality down depending on fullness of buffer. + // If buffer is below the optimal level, let the active_worst_quality go from + // ambient Q (at buffer = optimal level) to worst_quality level + // (at buffer = critical level). + const VP9_CONFIG *oxcf = &cpi->oxcf; + const RATE_CONTROL *rc = &cpi->rc; + // Buffer level below which we push active_worst to worst_quality. + int64_t critical_level = oxcf->optimal_buffer_level >> 2; + int64_t buff_lvl_step = 0; + int adjustment = 0; + int active_worst_quality; + if (cpi->common.frame_type == KEY_FRAME) + return rc->worst_quality; + if (cpi->common.current_video_frame > 1) + active_worst_quality = MIN(rc->worst_quality, + rc->avg_frame_qindex[INTER_FRAME] * 5 / 4); + else + active_worst_quality = MIN(rc->worst_quality, + rc->avg_frame_qindex[KEY_FRAME] * 3 / 2); + if (rc->buffer_level > oxcf->optimal_buffer_level) { + // Adjust down. + // Maximum limit for down adjustment, ~30%. + int max_adjustment_down = active_worst_quality / 3; + if (max_adjustment_down) { + buff_lvl_step = ((oxcf->maximum_buffer_size - + oxcf->optimal_buffer_level) / max_adjustment_down); + if (buff_lvl_step) + adjustment = (int)((rc->buffer_level - oxcf->optimal_buffer_level) / + buff_lvl_step); + active_worst_quality -= adjustment; + } + } else if (rc->buffer_level > critical_level) { + // Adjust up from ambient Q. + if (critical_level) { + buff_lvl_step = (oxcf->optimal_buffer_level - critical_level); + if (buff_lvl_step) { + adjustment = + (int)((rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) * + (oxcf->optimal_buffer_level - rc->buffer_level) / + buff_lvl_step); + } + active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment; + } + } else { + // Set to worst_quality if buffer is below critical level. + active_worst_quality = rc->worst_quality; + } + return active_worst_quality; +} + +static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, + int *bottom_index, + int *top_index) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + int active_best_quality; + int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi); + int q; + + if (frame_is_intra_only(cm)) { + active_best_quality = rc->best_quality; + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping.
+ if (rc->this_key_frame_forced) { + int qindex = rc->last_boosted_qindex; + double last_boosted_q = vp9_convert_qindex_to_q(qindex); + int delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q, + (last_boosted_q * 0.75)); + active_best_quality = MAX(qindex + delta_qindex, rc->best_quality); + } else if (cm->current_video_frame > 0) { + // not first frame of one pass and kf_boost is set + double q_adj_factor = 1.0; + double q_val; + + active_best_quality = get_active_quality(rc->avg_frame_qindex[KEY_FRAME], + rc->kf_boost, + kf_low, kf_high, + kf_low_motion_minq, + kf_high_motion_minq); + + // Allow somewhat lower kf minq with small image formats. + if ((cm->width * cm->height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = vp9_convert_qindex_to_q(active_best_quality); + active_best_quality += vp9_compute_qdelta(cpi, q_val, q_val * + q_adj_factor); + } + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + if (rc->frames_since_key > 1 && + rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { + q = rc->avg_frame_qindex[INTER_FRAME]; + } else { + q = active_worst_quality; + } + active_best_quality = get_active_quality( + q, rc->gfu_boost, gf_low, gf_high, + gf_low_motion_minq, gf_high_motion_minq); + } else { + // Use the lower of active_worst_quality and recent/average Q. + if (cm->current_video_frame > 1) { + if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) + active_best_quality = inter_minq[rc->avg_frame_qindex[INTER_FRAME]]; + else + active_best_quality = inter_minq[active_worst_quality]; + } else { + if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality) + active_best_quality = inter_minq[rc->avg_frame_qindex[KEY_FRAME]]; + else + active_best_quality = inter_minq[active_worst_quality]; + } + } + + // Clip the active best and worst quality values to limits + active_best_quality = clamp(active_best_quality, + rc->best_quality, rc->worst_quality); + active_worst_quality = clamp(active_worst_quality, + active_best_quality, rc->worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + +#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY + // Limit Q range for the adaptive loop. 
+ if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced) { + if (!(cm->current_video_frame == 0)) + *top_index = (active_worst_quality + active_best_quality * 3) / 4; + } +#endif + // Special case code to try and match quality with forced key frames + if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) { + q = rc->last_boosted_qindex; + } else { + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + active_best_quality, active_worst_quality); + if (q > *top_index) { + // Special case when we are targeting the max allowed rate + if (cpi->rc.this_frame_target >= cpi->rc.max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + } + assert(*top_index <= rc->worst_quality && + *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, + int *bottom_index, + int *top_index) { const VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const VP9_CONFIG *const oxcf = &cpi->oxcf; int active_best_quality; - int active_worst_quality = rc->active_worst_quality; + int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi); int q; if (frame_is_intra_only(cm)) { @@ -583,13 +675,12 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, int delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q, (last_boosted_q * 0.75)); active_best_quality = MAX(qindex + delta_qindex, rc->best_quality); - } else if (!(cpi->pass == 0 && cm->current_video_frame == 0)) { + } else if (cm->current_video_frame > 0) { // not first frame of one pass and kf_boost is set double q_adj_factor = 1.0; double q_val; - // Baseline value derived from cpi->active_worst_quality and kf boost - active_best_quality = get_active_quality(active_worst_quality, + active_best_quality = get_active_quality(rc->avg_frame_qindex[KEY_FRAME], rc->kf_boost, kf_low, kf_high, kf_low_motion_minq, @@ -600,9 +691,6 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, q_adj_factor -= 0.25; } - // Make a further adjustment based on the kf zero motion measure. - q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct); - // Convert the adjustment factor to a qindex delta // on active_best_quality. q_val = vp9_convert_qindex_to_q(active_best_quality); @@ -618,7 +706,6 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, #endif } else if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { - // Use the lower of active_worst_quality and recent // average Q as basis for GF/ARF best Q limit unless last frame was // a key frame. @@ -626,7 +713,7 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { q = rc->avg_frame_qindex[INTER_FRAME]; } else { - q = active_worst_quality; + q = rc->avg_frame_qindex[KEY_FRAME]; } // For constrained quality dont allow Q less than the cq level if (oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) { @@ -669,14 +756,11 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) { active_best_quality = cpi->cq_target_quality; } else { - if (cpi->pass == 0 && - rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) - // 1-pass: for now, use the average Q for the active_best, if its lower - // than active_worst. + // Use the lower of active_worst_quality and recent/average Q. 
+ if (cm->current_video_frame > 1) active_best_quality = inter_minq[rc->avg_frame_qindex[INTER_FRAME]]; else - active_best_quality = inter_minq[active_worst_quality]; - + active_best_quality = inter_minq[rc->avg_frame_qindex[KEY_FRAME]]; // For the constrained quality mode we don't want // q to fall below the cq level. if ((oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) && @@ -693,17 +777,192 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, } // Clip the active best and worst quality values to limits - if (active_worst_quality > rc->worst_quality) - active_worst_quality = rc->worst_quality; + active_best_quality = clamp(active_best_quality, + rc->best_quality, rc->worst_quality); + active_worst_quality = clamp(active_worst_quality, + active_best_quality, rc->worst_quality); - if (active_best_quality < rc->best_quality) - active_best_quality = rc->best_quality; + *top_index = active_worst_quality; + *bottom_index = active_best_quality; - if (active_best_quality > rc->worst_quality) - active_best_quality = rc->worst_quality; +#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY + // Limit Q range for the adaptive loop. + if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced) { + if (!(cm->current_video_frame == 0)) + *top_index = (active_worst_quality + active_best_quality * 3) / 4; + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + *top_index = (active_worst_quality + active_best_quality) / 2; + } +#endif + if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) { + q = active_best_quality; + // Special case code to try and match quality with forced key frames + } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) { + q = rc->last_boosted_qindex; + } else { + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + active_best_quality, active_worst_quality); + if (q > *top_index) { + // Special case when we are targeting the max allowed rate + if (cpi->rc.this_frame_target >= cpi->rc.max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + } +#if CONFIG_MULTIPLE_ARF + // Force the quantizer determined by the coding order pattern. + if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) && + cpi->oxcf.end_usage != USAGE_CONSTANT_QUALITY) { + double new_q; + double current_q = vp9_convert_qindex_to_q(active_worst_quality); + int level = cpi->this_frame_weight; + assert(level >= 0); + new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level))); + q = active_worst_quality + + vp9_compute_qdelta(cpi, current_q, new_q); + + *bottom_index = q; + *top_index = q; + printf("frame:%d q:%d\n", cm->current_video_frame, q); + } +#endif + assert(*top_index <= rc->worst_quality && + *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, + int *bottom_index, + int *top_index) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const VP9_CONFIG *const oxcf = &cpi->oxcf; + int active_best_quality; + int active_worst_quality = cpi->twopass.active_worst_quality; + int q; + + if (frame_is_intra_only(cm)) { +#if !CONFIG_MULTIPLE_ARF + // Handle the special case for key frames forced when we have75 reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping. 
+ if (rc->this_key_frame_forced) { + int qindex = rc->last_boosted_qindex; + double last_boosted_q = vp9_convert_qindex_to_q(qindex); + int delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q, + (last_boosted_q * 0.75)); + active_best_quality = MAX(qindex + delta_qindex, rc->best_quality); + } else { + // Not forced keyframe. + double q_adj_factor = 1.0; + double q_val; + // Baseline value derived from cpi->active_worst_quality and kf boost. + active_best_quality = get_active_quality(active_worst_quality, + rc->kf_boost, + kf_low, kf_high, + kf_low_motion_minq, + kf_high_motion_minq); + + // Allow somewhat lower kf minq with small image formats. + if ((cm->width * cm->height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Make a further adjustment based on the kf zero motion measure. + q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct); + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = vp9_convert_qindex_to_q(active_best_quality); + active_best_quality += vp9_compute_qdelta(cpi, q_val, q_val * + q_adj_factor); + } +#else + double current_q; + // Force the KF quantizer to be 30% of the active_worst_quality. + current_q = vp9_convert_qindex_to_q(active_worst_quality); + active_best_quality = active_worst_quality + + vp9_compute_qdelta(cpi, current_q, current_q * 0.3); +#endif + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + if (rc->frames_since_key > 1 && + rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { + q = rc->avg_frame_qindex[INTER_FRAME]; + } else { + q = active_worst_quality; + } + // For constrained quality dont allow Q less than the cq level + if (oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) { + if (q < cpi->cq_target_quality) + q = cpi->cq_target_quality; + if (rc->frames_since_key > 1) { + active_best_quality = get_active_quality(q, rc->gfu_boost, + gf_low, gf_high, + afq_low_motion_minq, + afq_high_motion_minq); + } else { + active_best_quality = get_active_quality(q, rc->gfu_boost, + gf_low, gf_high, + gf_low_motion_minq, + gf_high_motion_minq); + } + // Constrained quality use slightly lower active best. + active_best_quality = active_best_quality * 15 / 16; - if (active_worst_quality < active_best_quality) - active_worst_quality = active_best_quality; + } else if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) { + if (!cpi->refresh_alt_ref_frame) { + active_best_quality = cpi->cq_target_quality; + } else { + if (rc->frames_since_key > 1) { + active_best_quality = get_active_quality( + q, rc->gfu_boost, gf_low, gf_high, + afq_low_motion_minq, afq_high_motion_minq); + } else { + active_best_quality = get_active_quality( + q, rc->gfu_boost, gf_low, gf_high, + gf_low_motion_minq, gf_high_motion_minq); + } + } + } else { + active_best_quality = get_active_quality( + q, rc->gfu_boost, gf_low, gf_high, + gf_low_motion_minq, gf_high_motion_minq); + } + } else { + if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) { + active_best_quality = cpi->cq_target_quality; + } else { + active_best_quality = inter_minq[active_worst_quality]; + + // For the constrained quality mode we don't want + // q to fall below the cq level. 
+ if ((oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) && + (active_best_quality < cpi->cq_target_quality)) { + // If we are strongly undershooting the target rate in the last + // frames then use the user passed in cq value not the auto + // cq value. + if (rc->rolling_actual_bits < rc->min_frame_bandwidth) + active_best_quality = oxcf->cq_level; + else + active_best_quality = cpi->cq_target_quality; + } + } + } + + // Clip the active best and worst quality values to limits. + active_best_quality = clamp(active_best_quality, + rc->best_quality, rc->worst_quality); + active_worst_quality = clamp(active_worst_quality, + active_best_quality, rc->worst_quality); *top_index = active_worst_quality; *bottom_index = active_best_quality; @@ -711,8 +970,7 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, #if LIMIT_QRANGE_FOR_ALTREF_AND_KEY // Limit Q range for the adaptive loop. if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced) { - if (!(cpi->pass == 0 && cm->current_video_frame == 0)) - *top_index = (active_worst_quality + active_best_quality * 3) / 4; + *top_index = (active_worst_quality + active_best_quality * 3) / 4; } else if (!rc->is_src_frame_alt_ref && (oxcf->end_usage != USAGE_STREAM_FROM_SERVER) && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { @@ -722,14 +980,14 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) { q = active_best_quality; - // Special case code to try and match quality with forced key frames + // Special case code to try and match quality with forced key frames. } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) { q = rc->last_boosted_qindex; } else { q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, active_worst_quality); if (q > *top_index) { - // Special case when we are targeting the max allowed rate + // Special case when we are targeting the max allowed rate. if (cpi->rc.this_frame_target >= cpi->rc.max_frame_bandwidth) *top_index = q; else @@ -761,6 +1019,35 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, return q; } +int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, + int *bottom_index, + int *top_index) { + int q; + if (cpi->pass == 0) { + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) + q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index); + else + q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index); + } else { + q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index); + } + + // JBB : This is realtime mode. In real time mode the first frame + // should be larger. Q of 0 is disabled because we force tx size to be + // 16x16... + if (cpi->sf.use_nonrd_pick_mode) { + if (cpi->common.current_video_frame == 0) + q /= 3; + if (q == 0) + q++; + if (q < *bottom_index) + *bottom_index = q; + else if (q > *top_index) + *top_index = q; + } + return q; +} + void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int this_frame_target, int *frame_under_shoot_limit, @@ -804,24 +1091,14 @@ void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, } } -// return of 0 means drop frame -int vp9_rc_pick_frame_size_target(VP9_COMP *cpi) { +void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) { const VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; - if (cm->frame_type == KEY_FRAME) - calc_iframe_target_size(cpi); - else - calc_pframe_target_size(cpi); - - // Clip the frame target to the maximum allowed value. 
- if (rc->this_frame_target > rc->max_frame_bandwidth) - rc->this_frame_target = rc->max_frame_bandwidth; - + rc->this_frame_target = target; // Target rate per SB64 (including partial SB64s. rc->sb64_target_rate = ((int64_t)rc->this_frame_target * 64 * 64) / (cm->width * cm->height); - return 1; } static void update_alt_ref_frame_stats(VP9_COMP *cpi) { @@ -865,11 +1142,14 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + + cm->last_frame_type = cm->frame_type; // Update rate control heuristics - rc->projected_frame_size = (bytes_used << 3); + rc->projected_frame_size = (int)(bytes_used << 3); // Post encode loop adjustment of Q prediction. - vp9_rc_update_rate_correction_factors(cpi, (cpi->sf.recode_loop || + vp9_rc_update_rate_correction_factors( + cpi, (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF || cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0); // Keep a record of last Q and ambient average Q. @@ -878,7 +1158,8 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO( 3 * rc->avg_frame_qindex[KEY_FRAME] + cm->base_qindex, 2); } else if (!rc->is_src_frame_alt_ref && - (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) && + !(cpi->use_svc && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) { rc->last_q[2] = cm->base_qindex; rc->avg_frame_qindex[2] = ROUND_POWER_OF_TWO( 3 * rc->avg_frame_qindex[2] + cm->base_qindex, 2); @@ -907,7 +1188,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->last_boosted_qindex = cm->base_qindex; } - vp9_update_buffer_level(cpi, rc->projected_frame_size); + update_buffer_level(cpi, rc->projected_frame_size); // Rolling monitors of whether we are over or underspending used to help // regulate min and Max Q in two pass. @@ -929,22 +1210,6 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->total_target_vs_actual += (rc->this_frame_target - rc->projected_frame_size); -#ifndef DISABLE_RC_LONG_TERM_MEM - // Update bits left to the kf and gf groups to account for overshoot or - // undershoot on these frames - if (cm->frame_type == KEY_FRAME) { - cpi->twopass.kf_group_bits += cpi->rc.this_frame_target - - cpi->rc.projected_frame_size; - - cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0); - } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) { - cpi->twopass.gf_group_bits += cpi->rc.this_frame_target - - cpi->rc.projected_frame_size; - - cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0); - } -#endif - if (cpi->oxcf.play_alternate && cpi->refresh_alt_ref_frame && (cm->frame_type != KEY_FRAME)) // Update the alternate reference frame stats as appropriate. @@ -962,6 +1227,172 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { + // Update buffer level with zero size, update frame counters, and return. + update_buffer_level(cpi, 0); + cpi->common.last_frame_type = cpi->common.frame_type; cpi->rc.frames_since_key++; cpi->rc.frames_to_key--; } + +static int test_for_kf_one_pass(VP9_COMP *cpi) { + // Placeholder function for auto key frame + return 0; +} +// Use this macro to turn on/off use of alt-refs in one-pass mode. 
+#define USE_ALTREF_FOR_ONE_PASS 1 + +static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { + static const int af_ratio = 10; + const RATE_CONTROL *rc = &cpi->rc; + int target; +#if USE_ALTREF_FOR_ONE_PASS + target = (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) ? + (rc->av_per_frame_bandwidth * cpi->rc.baseline_gf_interval * af_ratio) / + (cpi->rc.baseline_gf_interval + af_ratio - 1) : + (rc->av_per_frame_bandwidth * cpi->rc.baseline_gf_interval) / + (cpi->rc.baseline_gf_interval + af_ratio - 1); +#else + target = rc->av_per_frame_bandwidth; +#endif + return vp9_rc_clamp_pframe_target_size(cpi, target); +} + +static int calc_iframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { + static const int kf_ratio = 25; + const RATE_CONTROL *rc = &cpi->rc; + int target = rc->av_per_frame_bandwidth * kf_ratio; + return vp9_rc_clamp_iframe_target_size(cpi, target); +} + +void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int target; + if (!cpi->refresh_alt_ref_frame && + (cm->current_video_frame == 0 || + cm->frame_flags & FRAMEFLAGS_KEY || + rc->frames_to_key == 0 || + (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) { + cm->frame_type = KEY_FRAME; + rc->this_key_frame_forced = cm->current_video_frame != 0 && + rc->frames_to_key == 0; + rc->frames_to_key = cpi->key_frame_frequency; + rc->kf_boost = DEFAULT_KF_BOOST; + rc->source_alt_ref_active = 0; + } else { + cm->frame_type = INTER_FRAME; + } + if (rc->frames_till_gf_update_due == 0) { + rc->baseline_gf_interval = DEFAULT_GF_INTERVAL; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + // NOTE: frames_till_gf_update_due must be <= frames_to_key. + if (rc->frames_till_gf_update_due > rc->frames_to_key) + rc->frames_till_gf_update_due = rc->frames_to_key; + cpi->refresh_golden_frame = 1; + rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; + rc->gfu_boost = DEFAULT_GF_BOOST; + } + if (cm->frame_type == KEY_FRAME) + target = calc_iframe_target_size_one_pass_vbr(cpi); + else + target = calc_pframe_target_size_one_pass_vbr(cpi); + vp9_rc_set_frame_target(cpi, target); +} + +static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { + const VP9_CONFIG *oxcf = &cpi->oxcf; + const RATE_CONTROL *rc = &cpi->rc; + const int64_t diff = oxcf->optimal_buffer_level - rc->buffer_level; + const int64_t one_pct_bits = 1 + oxcf->optimal_buffer_level / 100; + int min_frame_target = MAX(rc->av_per_frame_bandwidth >> 4, + FRAME_OVERHEAD_BITS); + int target = rc->av_per_frame_bandwidth; + if (cpi->svc.number_temporal_layers > 1 && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + // Note that for layers, av_per_frame_bandwidth is the cumulative + // per-frame-bandwidth. For the target size of this frame, use the + // layer average frame size (i.e., non-cumulative per-frame-bw). + int current_temporal_layer = cpi->svc.temporal_layer_id; + const LAYER_CONTEXT *lc = &cpi->svc.layer_context[current_temporal_layer]; + target = lc->avg_frame_size; + min_frame_target = MAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS); + } + if (diff > 0) { + // Lower the target bandwidth for this frame. + const int pct_low = (int)MIN(diff / one_pct_bits, oxcf->under_shoot_pct); + target -= (target * pct_low) / 200; + } else if (diff < 0) { + // Increase the target bandwidth for this frame. 
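A rough self-contained illustration of the buffer-level adjustment performed in calc_pframe_target_size_one_pass_cbr() at this point, with made-up numbers: the target shrinks by half a percent for each percent the buffer sits below its optimal level (capped by under_shoot_pct), and grows symmetrically when the buffer is above it (capped by over_shoot_pct).

#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Sketch of the CBR per-frame target adjustment: each percent of buffer
   deviation from the optimal level moves the target by 0.5%, bounded by
   the configured under/over shoot percentages. */
static int adjust_target_sketch(int target, int64_t buffer_level,
                                int64_t optimal_level,
                                int under_shoot_pct, int over_shoot_pct) {
  const int64_t diff = optimal_level - buffer_level;
  const int64_t one_pct_bits = 1 + optimal_level / 100;
  if (diff > 0) {
    const int pct_low = (int)MIN(diff / one_pct_bits, under_shoot_pct);
    target -= (target * pct_low) / 200;   /* buffer low: spend fewer bits */
  } else if (diff < 0) {
    const int pct_high = (int)MIN(-diff / one_pct_bits, over_shoot_pct);
    target += (target * pct_high) / 200;  /* buffer high: spend more bits */
  }
  return target;
}

int main(void) {
  /* Example: 20000 bits/frame, buffer 25% below a 600000-bit optimal level. */
  printf("%d\n", adjust_target_sketch(20000, 450000, 600000, 50, 50));
  return 0;
}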
+ const int pct_high = (int)MIN(-diff / one_pct_bits, oxcf->over_shoot_pct); + target += (target * pct_high) / 200; + } + return MAX(min_frame_target, target); +} + +static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { + const RATE_CONTROL *rc = &cpi->rc; + int target; + + if (cpi->common.current_video_frame == 0) { + target = ((cpi->oxcf.starting_buffer_level / 2) > INT_MAX) + ? INT_MAX : (int)(cpi->oxcf.starting_buffer_level / 2); + } else { + const int initial_boost = 32; + int kf_boost = MAX(initial_boost, (int)(2 * cpi->output_framerate - 16)); + if (rc->frames_since_key < cpi->output_framerate / 2) { + kf_boost = (int)(kf_boost * rc->frames_since_key / + (cpi->output_framerate / 2)); + } + target = ((16 + kf_boost) * rc->av_per_frame_bandwidth) >> 4; + } + return vp9_rc_clamp_iframe_target_size(cpi, target); +} + +void vp9_rc_get_svc_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + int target = cpi->rc.av_per_frame_bandwidth; + if ((cm->current_video_frame == 0) || + (cm->frame_flags & FRAMEFLAGS_KEY) || + (cpi->oxcf.auto_key && (cpi->rc.frames_since_key % + cpi->key_frame_frequency == 0))) { + cm->frame_type = KEY_FRAME; + cpi->rc.source_alt_ref_active = 0; + if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + target = calc_iframe_target_size_one_pass_cbr(cpi); + } + } else { + cm->frame_type = INTER_FRAME; + if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + target = calc_pframe_target_size_one_pass_cbr(cpi); + } + } + vp9_rc_set_frame_target(cpi, target); + cpi->rc.frames_till_gf_update_due = INT_MAX; + cpi->rc.baseline_gf_interval = INT_MAX; +} + +void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int target; + if ((cm->current_video_frame == 0 || + cm->frame_flags & FRAMEFLAGS_KEY || + rc->frames_to_key == 0 || + (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) { + cm->frame_type = KEY_FRAME; + rc->this_key_frame_forced = cm->current_video_frame != 0 && + rc->frames_to_key == 0; + rc->frames_to_key = cpi->key_frame_frequency; + rc->kf_boost = DEFAULT_KF_BOOST; + rc->source_alt_ref_active = 0; + target = calc_iframe_target_size_one_pass_cbr(cpi); + } else { + cm->frame_type = INTER_FRAME; + target = calc_pframe_target_size_one_pass_cbr(cpi); + } + vp9_rc_set_frame_target(cpi, target); + // Don't use gf_update by default in CBR mode. 
+ rc->frames_till_gf_update_due = INT_MAX; + rc->baseline_gf_interval = INT_MAX; +} diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index eba4b7a92..5dbc7d138 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -34,19 +34,18 @@ typedef struct { double key_frame_rate_correction_factor; double gf_rate_correction_factor; - unsigned int frames_since_golden; - unsigned int frames_till_gf_update_due; // Count down till next GF - unsigned int max_gf_interval; - unsigned int baseline_gf_interval; - unsigned int frames_to_key; - unsigned int frames_since_key; - unsigned int this_key_frame_forced; - unsigned int next_key_frame_forced; - unsigned int source_alt_ref_pending; - unsigned int source_alt_ref_active; - unsigned int is_src_frame_alt_ref; - - int per_frame_bandwidth; // Current section per frame bandwidth target + int frames_since_golden; + int frames_till_gf_update_due; + int max_gf_interval; + int baseline_gf_interval; + int frames_to_key; + int frames_since_key; + int this_key_frame_forced; + int next_key_frame_forced; + int source_alt_ref_pending; + int source_alt_ref_active; + int is_src_frame_alt_ref; + int av_per_frame_bandwidth; // Average frame size target for clip int min_frame_bandwidth; // Minimum allocation used for any frame int max_frame_bandwidth; // Maximum burst rate allowed for a frame. @@ -58,8 +57,8 @@ typedef struct { double tot_q; double avg_q; - int buffer_level; - int bits_off_target; + int64_t buffer_level; + int64_t bits_off_target; int decimation_factor; int decimation_count; @@ -74,7 +73,6 @@ typedef struct { int total_target_vs_actual; // debug stats int worst_quality; - int active_worst_quality; int best_quality; // int active_best_quality; } RATE_CONTROL; @@ -89,50 +87,79 @@ void vp9_setup_inter_frame(struct VP9_COMP *cpi); double vp9_convert_qindex_to_q(int qindex); -// Updates rate correction factors -void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi, int damp_var); - // initialize luts for minq void vp9_rc_init_minq_luts(void); -// return of 0 means drop frame -// Changes only rc.this_frame_target and rc.sb64_rate_target -int vp9_rc_pick_frame_size_target(struct VP9_COMP *cpi); +// Generally at the high level, the following flow is expected +// to be enforced for rate control: +// First call per frame, one of: +// vp9_rc_get_one_pass_vbr_params() +// vp9_rc_get_one_pass_cbr_params() +// vp9_rc_get_svc_params() +// vp9_rc_get_first_pass_params() +// vp9_rc_get_second_pass_params() +// depending on the usage to set the rate control encode parameters desired. +// +// Then, call encode_frame_to_data_rate() to perform the +// actual encode. This function will in turn call encode_frame() +// one or more times, followed by one of: +// vp9_rc_postencode_update() +// vp9_rc_postencode_update_drop_frame() +// +// The majority of rate control parameters are only expected +// to be set in the vp9_rc_get_..._params() functions and +// updated during the vp9_rc_postencode_update...() functions. +// The only exceptions are vp9_rc_drop_frame() and +// vp9_rc_update_rate_correction_factors() functions. + +// Functions to set parameters for encoding before the actual +// encode_frame_to_data_rate() function. 
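To make the call order described in the comment above concrete, a hypothetical per-frame driver for the 1-pass CBR case might look roughly like the sketch below. encode_one_frame() is a made-up stand-in for the encoder's internal encode_frame_to_data_rate() path, and everything else is simplified.

#include "vpx/vpx_integer.h"
#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/encoder/vp9_ratectrl.h"

/* Stand-in for the internal encode path; returns the compressed frame size
   in bytes. Not a real libvpx function. */
extern uint64_t encode_one_frame(VP9_COMP *cpi);

static void rc_frame_loop_sketch(VP9_COMP *cpi, int num_frames) {
  int i;
  for (i = 0; i < num_frames; ++i) {
    /* 1. Choose frame type and set the rate-control target for this frame. */
    vp9_rc_get_one_pass_cbr_params(cpi);

    /* 2. In CBR mode the frame may be dropped to protect the buffer. */
    if (vp9_rc_drop_frame(cpi)) {
      vp9_rc_postencode_update_drop_frame(cpi);
      continue;
    }

    /* 3. Encode, then feed the actual size back into rate control. */
    vp9_rc_postencode_update(cpi, encode_one_frame(cpi));
  }
}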
+void vp9_rc_get_one_pass_vbr_params(struct VP9_COMP *cpi); +void vp9_rc_get_one_pass_cbr_params(struct VP9_COMP *cpi); +void vp9_rc_get_svc_params(struct VP9_COMP *cpi); + +// Post encode update of the rate control parameters based +// on bytes used +void vp9_rc_postencode_update(struct VP9_COMP *cpi, + uint64_t bytes_used); +// Post encode update of the rate control parameters for dropped frames +void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi); + +// Updates rate correction factors +// Changes only the rate correction factors in the rate control structure. +void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi, int damp_var); + +// Decide if we should drop this frame: For 1-pass CBR. +// Changes only the decimation count in the rate control structure +int vp9_rc_drop_frame(struct VP9_COMP *cpi); +// Computes frame size bounds. void vp9_rc_compute_frame_size_bounds(const struct VP9_COMP *cpi, int this_frame_target, int *frame_under_shoot_limit, int *frame_over_shoot_limit); // Picks q and q bounds given the target for bits -int vp9_rc_pick_q_and_adjust_q_bounds(const struct VP9_COMP *cpi, - int *bottom_index, - int *top_index); +int vp9_rc_pick_q_and_bounds(const struct VP9_COMP *cpi, + int *bottom_index, + int *top_index); // Estimates q to achieve a target bits per frame int vp9_rc_regulate_q(const struct VP9_COMP *cpi, int target_bits_per_frame, int active_best_quality, int active_worst_quality); -// Post encode update of the rate control parameters based -// on bytes used -void vp9_rc_postencode_update(struct VP9_COMP *cpi, - uint64_t bytes_used); -// for dropped frames -void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi); - -// estimates bits per mb for a given qindex and correction factor +// Estimates bits per mb for a given qindex and correction factor. int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, double correction_factor); -// Post encode update of the rate control parameters for 2-pass -void vp9_twopass_postencode_update(struct VP9_COMP *cpi, - uint64_t bytes_used); - -// Decide if we should drop this frame: For 1-pass CBR. -int vp9_drop_frame(struct VP9_COMP *cpi); - -// Update the buffer level. -void vp9_update_buffer_level(struct VP9_COMP *cpi, int encoded_frame_size); +// Clamping utilities for bitrate targets for iframes and pframes. +int vp9_rc_clamp_iframe_target_size(const struct VP9_COMP *const cpi, + int target); +int vp9_rc_clamp_pframe_target_size(const struct VP9_COMP *const cpi, + int target); +// Utility to set frame_target into the RATE_CONTROL structure +// This function is called only from the vp9_rc_get_..._params() functions. +void vp9_rc_set_frame_target(struct VP9_COMP *cpi, int target); #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index e5230feb4..f7577e174 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -68,7 +68,7 @@ struct rdcost_block_args { int64_t this_rd; int64_t best_rd; int skip; - const int16_t *scan, *nb; + const scan_order *so; }; const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { @@ -274,7 +274,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { MACROBLOCK *x = &cpi->mb; int qindex, i; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); // Further tests required to see if optimum is different // for key frames, golden frames and arf frames. 
@@ -285,7 +285,8 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { cpi->RDDIV = RDDIV_BITS; // in bits (to multiply D by 128) cpi->RDMULT = vp9_compute_rd_mult(cpi, qindex); - x->errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO + (x->errorperbit == 0); + x->errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO; + x->errorperbit += (x->errorperbit == 0); vp9_set_speed_features(cpi); @@ -294,21 +295,22 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { set_block_thresholds(cpi); - fill_token_costs(x->token_costs, cm->fc.coef_probs); + if (!cpi->sf.use_nonrd_pick_mode) { + fill_token_costs(x->token_costs, cm->fc.coef_probs); - if (!cpi->sf.super_fast_rtc) { for (i = 0; i < PARTITION_CONTEXTS; i++) vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i), vp9_partition_tree); + } + if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1) { fill_mode_costs(cpi); if (!frame_is_intra_only(cm)) { vp9_build_nmv_cost_table(x->nmvjointcost, cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, - &cm->fc.nmvc, - cm->allow_high_precision_mv, 1, 1); + &cm->fc.nmvc, cm->allow_high_precision_mv); for (i = 0; i < INTER_MODE_CONTEXTS; ++i) vp9_cost_tokens((int *)x->inter_mode_cost[i], @@ -414,9 +416,10 @@ static void model_rd_from_var_lapndz(unsigned int var, unsigned int n, *dist = 0; } else { int d_q10, r_q10; - uint64_t xsq_q10_64 = + const uint64_t xsq_q10_64 = ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var; - int xsq_q10 = xsq_q10_64 > MAX_XSQ_Q10 ? MAX_XSQ_Q10 : xsq_q10_64; + const int xsq_q10 = xsq_q10_64 > MAX_XSQ_Q10 ? + MAX_XSQ_Q10 : (int)xsq_q10_64; model_rd_norm(xsq_q10, &r_q10, &d_q10); *rate = (n * r_q10 + 2) >> 2; *dist = (var * (int64_t)d_q10 + 512) >> 10; @@ -429,7 +432,9 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. - int i, rate_sum = 0, dist_sum = 0; + int i; + int64_t rate_sum = 0; + int64_t dist_sum = 0; int ref = xd->mi_8x8[0]->mbmi.ref_frame[0]; unsigned int sse; @@ -443,20 +448,33 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, if (i == 0) x->pred_sse[ref] = sse; - if (cpi->sf.super_fast_rtc) { - dist_sum += (int)sse; + + // Fast approximate the modelling function. 
+ if (cpi->speed > 4) { + int64_t rate; + int64_t dist; + int64_t square_error = sse; + int quantizer = (pd->dequant[1] >> 3); + + if (quantizer < 120) + rate = (square_error * (280 - quantizer)) >> 8; + else + rate = 0; + dist = (square_error * quantizer) >> 8; + rate_sum += rate; + dist_sum += dist; } else { int rate; int64_t dist; model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs], pd->dequant[1] >> 3, &rate, &dist); rate_sum += rate; - dist_sum += (int)dist; + dist_sum += dist; } } - *out_rate_sum = rate_sum; - *out_dist_sum = (int64_t)dist_sum << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum << 4; } static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize, @@ -546,18 +564,16 @@ static INLINE int cost_coeffs(MACROBLOCK *x, const PLANE_TYPE type = pd->plane_type; const int16_t *band_count = &band_counts[tx_size][1]; const int eob = p->eobs[block]; - const int16_t *const qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block); - const int ref = mbmi->ref_frame[0] != INTRA_FRAME; + const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = - x->token_costs[tx_size][type][ref]; - const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L; + x->token_costs[tx_size][type][is_inter_block(mbmi)]; uint8_t *p_tok = x->token_cache; - int pt = combine_entropy_contexts(above_ec, left_ec); + int pt = combine_entropy_contexts(*A, *L); int c, cost; // Check for consistency of tx_size with mode info assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size - : get_uv_tx_size(mbmi) == tx_size); + : get_uv_tx_size(mbmi) == tx_size); if (eob == 0) { // single eob token @@ -567,7 +583,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x, int band_left = *band_count++; // dc token - int v = qcoeff_ptr[0]; + int v = qcoeff[0]; int prev_t = vp9_dct_value_tokens_ptr[v].token; cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v]; p_tok[0] = vp9_pt_energy_class[prev_t]; @@ -578,7 +594,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x, const int rc = scan[c]; int t; - v = qcoeff_ptr[rc]; + v = qcoeff[rc]; t = vp9_dct_value_tokens_ptr[v].token; pt = get_coef_context(nb, p_tok, c); cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v]; @@ -634,7 +650,7 @@ static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize, args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx, args->t_left + y_idx, tx_size, - args->scan, args->nb); + args->so->scan, args->so->neighbors); } static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, @@ -643,17 +659,15 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - struct encode_b_args encode_args = {x, NULL, &mbmi->skip_coeff}; - int64_t rd1, rd2, rd; if (args->skip) return; - if (!is_inter_block(&xd->mi_8x8[0]->mbmi)) - vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &encode_args); + if (!is_inter_block(mbmi)) + vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip); else - vp9_xform_quant(plane, block, plane_bsize, tx_size, &encode_args); + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); dist_block(plane, block, tx_size, args); rate_block(plane, block, plane_bsize, tx_size, args); @@ -677,10 +691,16 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, } } -void vp9_get_entropy_contexts(TX_SIZE tx_size, - ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT 
t_left[16], - const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left, - int num_4x4_w, int num_4x4_h) { +void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[16], + ENTROPY_CONTEXT t_left[16]) { + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const ENTROPY_CONTEXT *const above = pd->above_context; + const ENTROPY_CONTEXT *const left = pd->left_context; + int i; switch (tx_size) { case TX_4X4: @@ -710,49 +730,35 @@ void vp9_get_entropy_contexts(TX_SIZE tx_size, } } -static void init_rdcost_stack(MACROBLOCK *x, const int64_t ref_rdcost, - struct rdcost_block_args *arg) { - vpx_memset(arg, 0, sizeof(struct rdcost_block_args)); - arg->x = x; - arg->best_rd = ref_rdcost; -} - static void txfm_rd_in_plane(MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, int64_t *sse, int64_t ref_best_rd, int plane, BLOCK_SIZE bsize, TX_SIZE tx_size) { - struct rdcost_block_args rd_stack; MACROBLOCKD *const xd = &x->e_mbd; struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); - const int num_4x4_w = num_4x4_blocks_wide_lookup[bs]; - const int num_4x4_h = num_4x4_blocks_high_lookup[bs]; - const scan_order *so; + struct rdcost_block_args args = { 0 }; + args.x = x; + args.best_rd = ref_best_rd; - init_rdcost_stack(x, ref_best_rd, &rd_stack); if (plane == 0) xd->mi_8x8[0]->mbmi.tx_size = tx_size; - vp9_get_entropy_contexts(tx_size, rd_stack.t_above, rd_stack.t_left, - pd->above_context, pd->left_context, - num_4x4_w, num_4x4_h); + vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left); - so = get_scan(xd, tx_size, pd->plane_type, 0); - rd_stack.scan = so->scan; - rd_stack.nb = so->neighbors; + args.so = get_scan(xd, tx_size, pd->plane_type, 0); vp9_foreach_transformed_block_in_plane(xd, bsize, plane, - block_rd_txfm, &rd_stack); - if (rd_stack.skip) { + block_rd_txfm, &args); + if (args.skip) { *rate = INT_MAX; *distortion = INT64_MAX; *sse = INT64_MAX; *skippable = 0; } else { - *distortion = rd_stack.this_dist; - *rate = rd_stack.this_rate; - *sse = rd_stack.this_sse; + *distortion = args.this_dist; + *rate = args.this_rate; + *sse = args.this_sse; *skippable = vp9_is_skippable_in_plane(x, bsize, plane); } } @@ -787,7 +793,10 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; vp9_prob skip_prob = vp9_get_skip_prob(cm, xd); - int64_t rd[TX_SIZES][2]; + int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}}; int n, m; int s0, s1; const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode]; @@ -851,6 +860,11 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, } } +static int64_t scaled_rd_cost(int rdmult, int rddiv, + int rate, int64_t dist, double scale) { + return (int64_t) (RDCOST(rdmult, rddiv, rate, dist) * scale); +} + static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, int (*r)[2], int *rate, int64_t *d, int64_t *distortion, @@ -862,7 +876,10 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; vp9_prob skip_prob = vp9_get_skip_prob(cm, xd); - int64_t rd[TX_SIZES][2]; + 
int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}}; int n, m; int s0, s1; double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00}; @@ -885,10 +902,13 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, r[n][1] += vp9_cost_one(tx_probs[m]); } if (s[n]) { - rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]) * scale; + rd[n][0] = rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, s1, d[n], + scale); } else { - rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]) * scale; - rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]) * scale; + rd[n][0] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][0] + s0, d[n], + scale); + rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][1] + s0, d[n], + scale); } if (rd[n][1] < best_rd) { best_rd = rd[n][1]; @@ -915,27 +935,23 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, } } -static void super_block_yrd(VP9_COMP *cpi, - MACROBLOCK *x, int *rate, int64_t *distortion, - int *skip, int64_t *psse, BLOCK_SIZE bs, - int64_t txfm_cache[TX_MODES], - int64_t ref_best_rd) { +static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, + int64_t *distortion, int *skip, + int64_t *psse, BLOCK_SIZE bs, + int64_t txfm_cache[TX_MODES], + int64_t ref_best_rd) { int r[TX_SIZES][2], s[TX_SIZES]; int64_t d[TX_SIZES], sse[TX_SIZES]; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - const int b_inter_mode = is_inter_block(mbmi); const TX_SIZE max_tx_size = max_txsize_lookup[bs]; TX_SIZE tx_size; - assert(bs == mbmi->sb_type); - if (b_inter_mode) - vp9_subtract_sby(x, bs); - if (cpi->sf.tx_size_search_method == USE_LARGESTALL || - (cpi->sf.tx_size_search_method != USE_FULL_RD && - !b_inter_mode)) { + vp9_subtract_plane(x, bs, 0); + + if (cpi->sf.tx_size_search_method == USE_LARGESTALL) { vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, ref_best_rd, bs); @@ -944,8 +960,7 @@ static void super_block_yrd(VP9_COMP *cpi, return; } - if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER && - b_inter_mode) { + if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER) { for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) model_rd_for_sb_y_tx(cpi, bs, tx_size, x, xd, &r[tx_size][0], &d[tx_size], &s[tx_size]); @@ -963,6 +978,36 @@ static void super_block_yrd(VP9_COMP *cpi, *psse = sse[mbmi->tx_size]; } +static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, + int64_t *distortion, int *skip, + int64_t *psse, BLOCK_SIZE bs, + int64_t txfm_cache[TX_MODES], + int64_t ref_best_rd) { + int64_t sse[TX_SIZES]; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; + + assert(bs == mbmi->sb_type); + if (cpi->sf.tx_size_search_method != USE_FULL_RD) { + vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); + choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, + ref_best_rd, bs); + } else { + int r[TX_SIZES][2], s[TX_SIZES]; + int64_t d[TX_SIZES]; + TX_SIZE tx_size; + for (tx_size = TX_4X4; tx_size <= max_txsize_lookup[bs]; ++tx_size) + txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size], + &s[tx_size], &sse[tx_size], + ref_best_rd, 0, bs, tx_size); + choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, + skip, txfm_cache, bs); + } + if (psse) + *psse = sse[mbmi->tx_size]; +} + + static int conditional_skipintra(MB_PREDICTION_MODE mode, MB_PREDICTION_MODE best_intra_mode) 
{ if (mode == D117_PRED && @@ -1064,7 +1109,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, so = &vp9_scan_orders[TX_4X4][tx_type]; if (tx_type != DCT_DCT) - vp9_short_fht4x4(src_diff, coeff, 8, tx_type); + vp9_fht4x4(src_diff, coeff, 8, tx_type); else x->fwd_txm4x4(src_diff, coeff, 8); @@ -1223,8 +1268,8 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, } mic->mbmi.mode = mode; - super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL, - bsize, local_tx_cache, best_rd); + intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, + &s, NULL, bsize, local_tx_cache, best_rd); if (this_rate_tokenonly == INT_MAX) continue; @@ -1259,7 +1304,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, return best_rd; } -static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x, +static void super_block_uvrd(MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, int64_t *sse, BLOCK_SIZE bsize, int64_t ref_best_rd) { @@ -1273,8 +1318,11 @@ static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x, if (ref_best_rd < 0) goto term; - if (is_inter_block(mbmi)) - vp9_subtract_sbuv(x, bsize); + if (is_inter_block(mbmi)) { + int plane; + for (plane = 1; plane < MAX_MB_PLANE; ++plane) + vp9_subtract_plane(x, bsize, plane); + } *rate = 0; *distortion = 0; @@ -1306,6 +1354,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize, TX_SIZE max_tx_size) { + MACROBLOCKD *xd = &x->e_mbd; MB_PREDICTION_MODE mode; MB_PREDICTION_MODE mode_selected = DC_PRED; int64_t best_rd = INT64_MAX, this_rd; @@ -1316,9 +1365,9 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode))) continue; - x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode; + xd->mi_8x8[0]->mbmi.uv_mode = mode; - super_block_uvrd(cpi, x, &this_rate_tokenonly, + super_block_uvrd(x, &this_rate_tokenonly, &this_distortion, &s, &this_sse, bsize, best_rd); if (this_rate_tokenonly == INT_MAX) continue; @@ -1336,7 +1385,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, if (!x->select_txfm_size) { int i; struct macroblock_plane *const p = x->plane; - struct macroblockd_plane *const pd = x->e_mbd.plane; + struct macroblockd_plane *const pd = xd->plane; for (i = 1; i < MAX_MB_PLANE; ++i) { p[i].coeff = ctx->coeff_pbuf[i][2]; p[i].qcoeff = ctx->qcoeff_pbuf[i][2]; @@ -1357,25 +1406,21 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, } } - x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode_selected; + xd->mi_8x8[0]->mbmi.uv_mode = mode_selected; return best_rd; } -static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x, +static int64_t rd_sbuv_dcpred(const VP9_COMMON *cm, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize) { - int64_t this_rd; - int64_t this_sse; + int64_t unused; x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED; - super_block_uvrd(cpi, x, rate_tokenonly, distortion, - skippable, &this_sse, bsize, INT64_MAX); - *rate = *rate_tokenonly + - x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED]; - this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion); - - return this_rd; + super_block_uvrd(x, rate_tokenonly, distortion, + skippable, &unused, bsize, INT64_MAX); + *rate = *rate_tokenonly + x->intra_uv_mode_cost[cm->frame_type][DC_PRED]; + return RDCOST(x->rdmult, x->rddiv, *rate, *distortion); 
} static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, @@ -1388,8 +1433,8 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, // Use an estimated rd for uv_intra based on DC_PRED if the // appropriate speed flag is set. if (cpi->sf.use_uv_intra_rd_estimate) { - rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, - bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); + rd_sbuv_dcpred(&cpi->common, x, rate_uv, rate_uv_tokenonly, dist_uv, + skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); // Else do a proper rd search for each possible transform size that may // be considered in the main rd loop. } else { @@ -1403,8 +1448,7 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode, int mode_context) { MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; - const int segment_id = xd->mi_8x8[0]->mbmi.segment_id; + const int segment_id = x->e_mbd.mi_8x8[0]->mbmi.segment_id; // Don't account for mode here if segment skip is enabled. if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) { @@ -1429,7 +1473,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int *rate_mv); static int labels2mode(MACROBLOCK *x, int i, - MB_PREDICTION_MODE this_mode, + MB_PREDICTION_MODE mode, int_mv *this_mv, int_mv *this_second_mv, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int_mv seg_mvs[MAX_REF_FRAMES], @@ -1439,23 +1483,18 @@ static int labels2mode(MACROBLOCK *x, int i, MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mic = xd->mi_8x8[0]; MB_MODE_INFO *mbmi = &mic->mbmi; - int cost = 0, thismvcost = 0; + int thismvcost = 0; int idx, idy; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; const int has_second_rf = has_second_ref(mbmi); - /* We have to be careful retrieving previously-encoded motion vectors. - Ones from this macroblock have to be pulled from the BLOCKD array - as they have not yet made it to the bmi array in our MB_MODE_INFO. 
*/ - MB_PREDICTION_MODE m; - // the only time we should do costing for new motion vector or mode // is when we are on a new label (jbb May 08, 2007) - switch (m = this_mode) { + switch (mode) { case NEWMV: this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int; - thismvcost = vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv, + thismvcost += vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv, mvjcost, mvcost, MV_COST_WEIGHT_SUB); if (has_second_rf) { this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int; @@ -1467,14 +1506,12 @@ static int labels2mode(MACROBLOCK *x, int i, case NEARESTMV: this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int; if (has_second_rf) - this_second_mv->as_int = - frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int; + this_second_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int; break; case NEARMV: this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int; if (has_second_rf) - this_second_mv->as_int = - frame_mv[NEARMV][mbmi->ref_frame[1]].as_int; + this_second_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[1]].as_int; break; case ZEROMV: this_mv->as_int = 0; @@ -1485,22 +1522,19 @@ static int labels2mode(MACROBLOCK *x, int i, break; } - cost = cost_mv_ref(cpi, this_mode, - mbmi->mode_context[mbmi->ref_frame[0]]); - mic->bmi[i].as_mv[0].as_int = this_mv->as_int; if (has_second_rf) mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int; - mic->bmi[i].as_mode = m; + mic->bmi[i].as_mode = mode; for (idy = 0; idy < num_4x4_blocks_high; ++idy) for (idx = 0; idx < num_4x4_blocks_wide; ++idx) vpx_memcpy(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i])); - cost += thismvcost; - return cost; + return cost_mv_ref(cpi, mode, mbmi->mode_context[mbmi->ref_frame[0]]) + + thismvcost; } static int64_t encode_inter_mb_segment(VP9_COMP *cpi, @@ -1604,13 +1638,11 @@ typedef struct { int mvthresh; } BEST_SEG_INFO; -static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) { - int r = 0; - r |= (mv->as_mv.row >> 3) < x->mv_row_min; - r |= (mv->as_mv.row >> 3) > x->mv_row_max; - r |= (mv->as_mv.col >> 3) < x->mv_col_min; - r |= (mv->as_mv.col >> 3) > x->mv_col_max; - return r; +static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) { + return (mv->row >> 3) < x->mv_row_min || + (mv->row >> 3) > x->mv_row_max || + (mv->col >> 3) < x->mv_col_min || + (mv->col >> 3) > x->mv_col_max; } static INLINE void mi_buf_shift(MACROBLOCK *x, int i) { @@ -1645,14 +1677,15 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi_buf, int filter_idx, int_mv seg_mvs[4][MAX_REF_FRAMES], int mi_row, int mi_col) { - int i, br = 0, idx, idy; + int k, br = 0, idx, idy; int64_t bd = 0, block_sse = 0; MB_PREDICTION_MODE this_mode; + MACROBLOCKD *xd = &x->e_mbd; VP9_COMMON *cm = &cpi->common; - MODE_INFO *mi = x->e_mbd.mi_8x8[0]; + MODE_INFO *mi = xd->mi_8x8[0]; MB_MODE_INFO *const mbmi = &mi->mbmi; struct macroblock_plane *const p = &x->plane[0]; - struct macroblockd_plane *const pd = &x->e_mbd.plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; const int label_count = 4; int64_t this_segment_rd = 0; int label_mv_thresh; @@ -1660,7 +1693,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize = mbmi->sb_type; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; - vp9_variance_fn_ptr_t *v_fn_ptr; + vp9_variance_fn_ptr_t *v_fn_ptr = &cpi->fn_ptr[bsize]; ENTROPY_CONTEXT t_above[2], t_left[2]; 
BEST_SEG_INFO *bsi = bsi_buf + filter_idx; int mode_idx; @@ -1670,8 +1703,6 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, vpx_memcpy(t_above, pd->above_context, sizeof(t_above)); vpx_memcpy(t_left, pd->left_context, sizeof(t_left)); - v_fn_ptr = &cpi->fn_ptr[bsize]; - // 64 makes this threshold really big effectively // making it so that we very rarely check mvs on // segments. setting this to 1 would make mv thresh @@ -1687,20 +1718,17 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; MB_PREDICTION_MODE mode_selected = ZEROMV; int64_t best_rd = INT64_MAX; - i = idy * 2 + idx; - - frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0; - vp9_append_sub8x8_mvs_for_idx(cm, &x->e_mbd, tile, - i, 0, mi_row, mi_col, - &frame_mv[NEARESTMV][mbmi->ref_frame[0]], - &frame_mv[NEARMV][mbmi->ref_frame[0]]); - if (has_second_rf) { - frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0; - vp9_append_sub8x8_mvs_for_idx(cm, &x->e_mbd, tile, - i, 1, mi_row, mi_col, - &frame_mv[NEARESTMV][mbmi->ref_frame[1]], - &frame_mv[NEARMV][mbmi->ref_frame[1]]); + const int i = idy * 2 + idx; + int ref; + + for (ref = 0; ref < 1 + has_second_rf; ++ref) { + const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; + frame_mv[ZEROMV][frame].as_int = 0; + vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, i, ref, mi_row, mi_col, + &frame_mv[NEARESTMV][frame], + &frame_mv[NEARMV][frame]); } + // search for the best motion vector on this segment for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { const struct buf_2d orig_src = x->plane[0].src; @@ -1829,28 +1857,28 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, 0, v_fn_ptr, &bsi->ref_mv->as_mv, - new_mv); + &new_mv->as_mv); } // Should we do a full search (best quality only) if (cpi->oxcf.mode == MODE_BESTQUALITY || cpi->oxcf.mode == MODE_SECONDPASS_BEST) { + int_mv *const best_mv = &mi->bmi[i].as_mv[0]; /* Check if mvp_full is within the range. 
*/ clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); - thissme = cpi->full_search_sad(x, &mvp_full, sadpb, 16, v_fn_ptr, x->nmvjointcost, x->mvcost, - &bsi->ref_mv->as_mv, i); - + &bsi->ref_mv->as_mv, + &best_mv->as_mv); if (thissme < bestsme) { bestsme = thissme; - new_mv->as_int = mi->bmi[i].as_mv[0].as_int; + new_mv->as_int = best_mv->as_int; } else { - /* The full search result is actually worse so re-instate the - * previous best vector */ - mi->bmi[i].as_mv[0].as_int = new_mv->as_int; + // The full search result is actually worse so re-instate the + // previous best vector + best_mv->as_int = new_mv->as_int; } } @@ -1928,10 +1956,9 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, } // Trap vectors that reach beyond the UMV borders - if (mv_check_bounds(x, &mode_mv[this_mode])) - continue; - if (has_second_rf && - mv_check_bounds(x, &second_mode_mv[this_mode])) + if (mv_check_bounds(x, &mode_mv[this_mode].as_mv) || + (has_second_rf && + mv_check_bounds(x, &second_mode_mv[this_mode].as_mv))) continue; if (filter_idx > 0) { @@ -2042,8 +2069,8 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, bsi->sse = block_sse; // update the coding decisions - for (i = 0; i < 4; ++i) - bsi->modes[i] = mi->bmi[i].as_mode; + for (k = 0; k < 4; ++k) + bsi->modes[k] = mi->bmi[k].as_mode; } static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, @@ -2356,7 +2383,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int sadpb = x->sadperbit16; MV mvp_full; int ref = mbmi->ref_frame[0]; - int_mv ref_mv = mbmi->ref_mvs[ref][0]; + MV ref_mv = mbmi->ref_mvs[ref][0].as_mv; int tmp_col_min = x->mv_col_min; int tmp_col_max = x->mv_col_max; @@ -2366,10 +2393,10 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); - int_mv pred_mv[3]; - pred_mv[0] = mbmi->ref_mvs[ref][0]; - pred_mv[1] = mbmi->ref_mvs[ref][1]; - pred_mv[2] = x->pred_mv[ref]; + MV pred_mv[3]; + pred_mv[0] = mbmi->ref_mvs[ref][0].as_mv; + pred_mv[1] = mbmi->ref_mvs[ref][1].as_mv; + pred_mv[2] = x->pred_mv[ref].as_mv; if (scaled_ref_frame) { int i; @@ -2382,26 +2409,18 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); } - vp9_set_mv_search_range(x, &ref_mv.as_mv); + vp9_set_mv_search_range(x, &ref_mv); - // Adjust search parameters based on small partitions' result. - if (x->fast_ms) { - // adjust search range - step_param = 6; - if (x->fast_ms > 1) - step_param = 8; + // Work out the size of the first step in the mv step search. + // 0 here is maximum length first step. 1 is MAX >> 1 etc. + if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) { + // Take wtd average of the step_params based on the last frame's + // max mv magnitude and that based on the best ref mvs of the current + // block for the given reference. + step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) + + cpi->mv_step_param) >> 1; } else { - // Work out the size of the first step in the mv step search. - // 0 here is maximum length first step. 1 is MAX >> 1 etc. - if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) { - // Take wtd average of the step_params based on the last frame's - // max mv magnitude and that based on the best ref mvs of the current - // block for the given reference. 
- step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) + - cpi->mv_step_param) >> 1; - } else { - step_param = cpi->mv_step_param; - } + step_param = cpi->mv_step_param; } if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 && @@ -2435,7 +2454,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } } - mvp_full = pred_mv[x->mv_best_ref_index[ref]].as_mv; + mvp_full = pred_mv[x->mv_best_ref_index[ref]]; mvp_full.col >>= 3; mvp_full.row >>= 3; @@ -2443,23 +2462,27 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, // Further step/diamond searches as necessary further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; - if (cpi->sf.search_method == HEX) { + if (cpi->sf.search_method == FAST_HEX) { + bestsme = vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, + &cpi->fn_ptr[bsize], 1, + &ref_mv, &tmp_mv->as_mv); + } else if (cpi->sf.search_method == HEX) { bestsme = vp9_hex_search(x, &mvp_full, step_param, sadpb, 1, &cpi->fn_ptr[bsize], 1, - &ref_mv.as_mv, &tmp_mv->as_mv); + &ref_mv, &tmp_mv->as_mv); } else if (cpi->sf.search_method == SQUARE) { bestsme = vp9_square_search(x, &mvp_full, step_param, sadpb, 1, &cpi->fn_ptr[bsize], 1, - &ref_mv.as_mv, &tmp_mv->as_mv); + &ref_mv, &tmp_mv->as_mv); } else if (cpi->sf.search_method == BIGDIA) { bestsme = vp9_bigdia_search(x, &mvp_full, step_param, sadpb, 1, &cpi->fn_ptr[bsize], 1, - &ref_mv.as_mv, &tmp_mv->as_mv); + &ref_mv, &tmp_mv->as_mv); } else { bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, 1, &cpi->fn_ptr[bsize], - &ref_mv.as_mv, tmp_mv); + &ref_mv, &tmp_mv->as_mv); } x->mv_col_min = tmp_col_min; @@ -2469,7 +2492,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int dis; /* TODO: use dis in distortion calculation later. 
*/ - cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv, + cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], @@ -2478,7 +2501,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref]); } - *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv, + *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); if (cpi->sf.adaptive_motion_search && cpi->common.show_frame) @@ -2705,6 +2728,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int_mv tmp_mv; single_motion_search(cpi, x, tile, bsize, mi_row, mi_col, &tmp_mv, &rate_mv); + if (tmp_mv.as_int == INVALID_MV) + return INT64_MAX; *rate2 += rate_mv; frame_mv[refs[0]].as_int = xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int; @@ -2717,7 +2742,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[refs[0]].as_int == 0 && !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) && (num_refs == 1 || frame_mv[refs[1]].as_int == 0)) { - int rfc = mbmi->mode_context[mbmi->ref_frame[0]]; + int rfc = mbmi->mode_context[refs[0]]; int c1 = cost_mv_ref(cpi, NEARMV, rfc); int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); int c3 = cost_mv_ref(cpi, ZEROMV, rfc); @@ -2732,17 +2757,17 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, assert(this_mode == ZEROMV); if (num_refs == 1) { if ((c3 >= c2 && - mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) || + mode_mv[NEARESTMV][refs[0]].as_int == 0) || (c3 >= c1 && - mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0)) + mode_mv[NEARMV][refs[0]].as_int == 0)) return INT64_MAX; } else { if ((c3 >= c2 && - mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 && - mode_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) || + mode_mv[NEARESTMV][refs[0]].as_int == 0 && + mode_mv[NEARESTMV][refs[1]].as_int == 0) || (c3 >= c1 && - mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 && - mode_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0)) + mode_mv[NEARMV][refs[0]].as_int == 0 && + mode_mv[NEARMV][refs[1]].as_int == 0)) return INT64_MAX; } } @@ -2754,7 +2779,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (this_mode != NEWMV) clamp_mv2(&cur_mv[i].as_mv, xd); - if (mv_check_bounds(x, &cur_mv[i])) + if (mv_check_bounds(x, &cur_mv[i].as_mv)) return INT64_MAX; mbmi->mv[i].as_int = cur_mv[i].as_int; } @@ -2773,8 +2798,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other * words if you present them in that order, the second one is always known * if the first is known */ - *rate2 += cost_mv_ref(cpi, this_mode, - mbmi->mode_context[mbmi->ref_frame[0]]); + *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]); if (!(*mode_excluded)) *mode_excluded = is_comp_pred ? 
cm->reference_mode == SINGLE_REFERENCE @@ -2910,33 +2934,26 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (cm->interp_filter == SWITCHABLE) *rate2 += get_switchable_rate(x); - if (!is_comp_pred && cpi->enable_encode_breakout) { + if (!is_comp_pred) { if (cpi->active_map_enabled && x->active_ptr[0] == 0) x->skip = 1; - else if (x->encode_breakout) { + else if (cpi->allow_encode_breakout && x->encode_breakout) { const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]); const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); unsigned int var, sse; // Skipping threshold for ac. unsigned int thresh_ac; - // The encode_breakout input - unsigned int encode_breakout = x->encode_breakout << 4; - unsigned int max_thresh = 36000; - + // Set a maximum for threshold to avoid big PSNR loss in low bitrate case. // Use extreme low threshold for static frames to limit skipping. - if (cpi->enable_encode_breakout == 2) - max_thresh = 128; + const unsigned int max_thresh = (cpi->allow_encode_breakout == + ENCODE_BREAKOUT_LIMITED) ? 128 : 36000; + // The encode_breakout input + const unsigned int min_thresh = + MIN(((unsigned int)x->encode_breakout << 4), max_thresh); // Calculate threshold according to dequant value. thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9; - - // Use encode_breakout input if it is bigger than internal threshold. - if (thresh_ac < encode_breakout) - thresh_ac = encode_breakout; - - // Set a maximum for threshold to avoid big PSNR loss in low bitrate case. - if (thresh_ac > max_thresh) - thresh_ac = max_thresh; + thresh_ac = clamp(thresh_ac, min_thresh, max_thresh); var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, @@ -2999,8 +3016,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int64_t rdcosty = INT64_MAX; // Y cost and distortion - super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse, - bsize, txfm_cache, ref_best_rd); + inter_super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse, + bsize, txfm_cache, ref_best_rd); if (*rate_y == INT_MAX) { *rate2 = INT_MAX; @@ -3015,7 +3032,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion); rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse)); - super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv, + super_block_uvrd(x, rate_uv, distortion_uv, &skippable_uv, &sseuv, bsize, ref_best_rd - rdcosty); if (*rate_uv == INT_MAX) { *rate2 = INT_MAX; @@ -3123,10 +3140,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { - VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - const struct segmentation *seg = &cm->seg; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; + const struct segmentation *const seg = &cm->seg; const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); MB_PREDICTION_MODE this_mode; MV_REFERENCE_FRAME ref_frame, second_ref_frame; @@ -3162,12 +3179,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, const int bws = num_8x8_blocks_wide_lookup[bsize] / 2; const int bhs = num_8x8_blocks_high_lookup[bsize] / 2; int best_skip2 = 0; + int mode_skip_mask = 0; + const int mode_skip_start = cpi->sf.mode_skip_start + 1; + const int *const 
rd_threshes = cpi->rd_threshes[segment_id][bsize]; + const int *const rd_thresh_freq_fact = cpi->rd_thresh_freq_fact[bsize]; + const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags; x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; - // Everywhere the flag is set the error is much higher than its neighbors. - ctx->modes_with_high_error = 0; - estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp, &comp_mode_p); @@ -3195,16 +3214,72 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[ZEROMV][ref_frame].as_int = 0; } - cpi->ref_frame_mask = 0; - for (ref_frame = LAST_FRAME; - ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) { - int i; - for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { - if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) { - cpi->ref_frame_mask |= (1 << ref_frame); - break; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + // All modes from vp9_mode_order that use this frame as any ref + static const int ref_frame_mask_all[] = { + 0x0, 0x123291, 0x25c444, 0x39b722 + }; + // Fixed mv modes (NEARESTMV, NEARMV, ZEROMV) from vp9_mode_order that use + // this frame as their primary ref + static const int ref_frame_mask_fixedmv[] = { + 0x0, 0x121281, 0x24c404, 0x080102 + }; + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) { + // Skip modes for missing references + mode_skip_mask |= ref_frame_mask_all[ref_frame]; + } else if (cpi->sf.reference_masking) { + for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + // Skip fixed mv modes for poor references + if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) { + mode_skip_mask |= ref_frame_mask_fixedmv[ref_frame]; + break; + } } } + // If the segment reference frame feature is enabled.... + // then do nothing if the current ref frame is not allowed.. + if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + mode_skip_mask |= ref_frame_mask_all[ref_frame]; + } + } + + // If the segment skip feature is enabled.... + // then do nothing if the current mode is not allowed.. + if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { + const int inter_non_zero_mode_mask = 0x1F7F7; + mode_skip_mask |= inter_non_zero_mode_mask; + } + + // Disable this drop out case if the ref frame + // segment level feature is enabled for this segment. This is to + // prevent the possibility that we end up unable to pick any mode. + if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + // unless ARNR filtering is enabled in which case we want + // an unfiltered alternative. We allow near/nearest as well + // because they may result in zero-zero MVs but be cheaper. + if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { + const int altref_zero_mask = + ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA)); + mode_skip_mask |= altref_zero_mask; + if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0) + mode_skip_mask |= (1 << THR_NEARA); + if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0) + mode_skip_mask |= (1 << THR_NEARESTA); + } + } + + // TODO(JBB): This is to make up for the fact that we don't have sad + // functions that work when the block size reads outside the umv. 
We + // should fix this either by making the motion search just work on + // a representative block in the boundary ( first ) and then implement a + // function that does sads when inside the border.. + if ((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) { + const int new_modes_mask = + (1 << THR_NEWMV) | (1 << THR_NEWG) | (1 << THR_NEWA) | + (1 << THR_COMP_NEWLA) | (1 << THR_COMP_NEWGA); + mode_skip_mask |= new_modes_mask; } for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { @@ -3218,109 +3293,95 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t tx_cache[TX_MODES]; int i; int this_skip2 = 0; - int64_t total_sse = INT_MAX; + int64_t total_sse = INT64_MAX; int early_term = 0; - for (i = 0; i < TX_MODES; ++i) - tx_cache[i] = INT64_MAX; - - x->skip = 0; - this_mode = vp9_mode_order[mode_index].mode; - ref_frame = vp9_mode_order[mode_index].ref_frame[0]; - second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; - // Look at the reference frame of the best mode so far and set the // skip mask to look at a subset of the remaining modes. - if (mode_index > cpi->sf.mode_skip_start) { - if (mode_index == (cpi->sf.mode_skip_start + 1)) { - switch (vp9_mode_order[best_mode_index].ref_frame[0]) { - case INTRA_FRAME: - cpi->mode_skip_mask = 0; - break; - case LAST_FRAME: - cpi->mode_skip_mask = LAST_FRAME_MODE_MASK; - break; - case GOLDEN_FRAME: - cpi->mode_skip_mask = GOLDEN_FRAME_MODE_MASK; - break; - case ALTREF_FRAME: - cpi->mode_skip_mask = ALT_REF_MODE_MASK; - break; - case NONE: - case MAX_REF_FRAMES: - assert(0 && "Invalid Reference frame"); - } + if (mode_index == mode_skip_start) { + switch (vp9_mode_order[best_mode_index].ref_frame[0]) { + case INTRA_FRAME: + break; + case LAST_FRAME: + mode_skip_mask |= LAST_FRAME_MODE_MASK; + break; + case GOLDEN_FRAME: + mode_skip_mask |= GOLDEN_FRAME_MODE_MASK; + break; + case ALTREF_FRAME: + mode_skip_mask |= ALT_REF_MODE_MASK; + break; + case NONE: + case MAX_REF_FRAMES: + assert(0 && "Invalid Reference frame"); } - if (cpi->mode_skip_mask & ((int64_t)1 << mode_index)) - continue; } - - // Skip if the current reference frame has been masked off - if (cpi->ref_frame_mask & (1 << ref_frame) && this_mode != NEWMV) + if (mode_skip_mask & (1 << mode_index)) continue; // Test best rd so far against threshold for trying this mode. - if ((best_rd < ((int64_t)cpi->rd_threshes[segment_id][bsize][mode_index] * - cpi->rd_thresh_freq_fact[bsize][mode_index] >> 5)) || - cpi->rd_threshes[segment_id][bsize][mode_index] == INT_MAX) - continue; - - // Do not allow compound prediction if the segment level reference - // frame feature is in use as in this case there can only be one reference. - if ((second_ref_frame > INTRA_FRAME) && - vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) - continue; - - // Skip some checking based on small partitions' result. 
- if (x->fast_ms > 1 && !ref_frame) - continue; - if (x->fast_ms > 2 && ref_frame != x->subblock_ref) - continue; - - mbmi->ref_frame[0] = ref_frame; - mbmi->ref_frame[1] = second_ref_frame; + if (best_rd < ((int64_t)rd_threshes[mode_index] * + rd_thresh_freq_fact[mode_index] >> 5) || + rd_threshes[mode_index] == INT_MAX) + continue; - if (!(ref_frame == INTRA_FRAME - || (cpi->ref_frame_flags & flag_list[ref_frame]))) { - continue; - } - if (!(second_ref_frame == NONE - || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) { - continue; - } + this_mode = vp9_mode_order[mode_index].mode; + ref_frame = vp9_mode_order[mode_index].ref_frame[0]; + second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) - if (vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) - continue; - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) - if (ref_frame != best_inter_ref_frame && - second_ref_frame != best_inter_ref_frame) + if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && + vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) + continue; + if ((mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) && + ref_frame != best_inter_ref_frame && + second_ref_frame != best_inter_ref_frame) + continue; + mode_excluded = mode_excluded ? + mode_excluded : cm->reference_mode == SINGLE_REFERENCE; + } else { + if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) + mode_excluded = mode_excluded ? + mode_excluded : cm->reference_mode == COMPOUND_REFERENCE; + } + + if (ref_frame == INTRA_FRAME) { + // Disable intra modes other than DC_PRED for blocks with low variance + // Threshold for intra skipping based on source variance + // TODO(debargha): Specialize the threshold for super block sizes + static const unsigned int skip_intra_var_thresh[BLOCK_SIZES] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + }; + if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && + this_mode != DC_PRED && + x->source_variance < skip_intra_var_thresh[bsize]) + continue; + // Only search the oblique modes if the best so far is + // one of the neighboring directional modes + if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && + (this_mode >= D45_PRED && this_mode <= TM_PRED)) { + if (vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME) continue; + } + if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra(this_mode, best_intra_mode)) + continue; + } } - set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + mbmi->mode = this_mode; mbmi->uv_mode = DC_PRED; - + mbmi->ref_frame[0] = ref_frame; + mbmi->ref_frame[1] = second_ref_frame; // Evaluate all sub-pel filters irrespective of whether we can use // them for this frame. mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter; + x->skip = 0; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter); - if (comp_pred) { - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) - continue; - - mode_excluded = mode_excluded ? mode_excluded - : cm->reference_mode == SINGLE_REFERENCE; - } else { - if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) - mode_excluded = mode_excluded ? - mode_excluded : cm->reference_mode == COMPOUND_REFERENCE; - } - // Select prediction reference frames. 
for (i = 0; i < MAX_MB_PLANE; i++) { xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; @@ -3328,46 +3389,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; } - // If the segment reference frame feature is enabled.... - // then do nothing if the current ref frame is not allowed.. - if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && - vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != - (int)ref_frame) { - continue; - // If the segment skip feature is enabled.... - // then do nothing if the current mode is not allowed.. - } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) && - (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) { - continue; - // Disable this drop out case if the ref frame - // segment level feature is enabled for this segment. This is to - // prevent the possibility that we end up unable to pick any mode. - } else if (!vp9_segfeature_active(seg, segment_id, - SEG_LVL_REF_FRAME)) { - // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, - // unless ARNR filtering is enabled in which case we want - // an unfiltered alternative. We allow near/nearest as well - // because they may result in zero-zero MVs but be cheaper. - if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { - if ((this_mode != ZEROMV && - !(this_mode == NEARMV && - frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) && - !(this_mode == NEARESTMV && - frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) || - ref_frame != ALTREF_FRAME) { - continue; - } - } - } - // TODO(JBB): This is to make up for the fact that we don't have sad - // functions that work when the block size reads outside the umv. We - // should fix this either by making the motion search just work on - // a representative block in the boundary ( first ) and then implement a - // function that does sads when inside the border.. 
- if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) && - this_mode == NEWMV) { - continue; - } + for (i = 0; i < TX_MODES; ++i) + tx_cache[i] = INT64_MAX; #ifdef MODE_TEST_HIT_STATS // TEST/DEBUG CODE @@ -3375,34 +3398,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, cpi->mode_test_hits[bsize]++; #endif - if (ref_frame == INTRA_FRAME) { TX_SIZE uv_tx; - // Disable intra modes other than DC_PRED for blocks with low variance - // Threshold for intra skipping based on source variance - // TODO(debargha): Specialize the threshold for super block sizes - static const unsigned int skip_intra_var_thresh[BLOCK_SIZES] = { - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - }; - if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && - this_mode != DC_PRED && - x->source_variance < skip_intra_var_thresh[mbmi->sb_type]) - continue; - // Only search the oblique modes if the best so far is - // one of the neighboring directional modes - if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && - (this_mode >= D45_PRED && this_mode <= TM_PRED)) { - if (vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME) - continue; - } - mbmi->mode = this_mode; - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { - if (conditional_skipintra(mbmi->mode, best_intra_mode)) - continue; - } - - super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, - bsize, tx_cache, best_rd); + intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, + bsize, tx_cache, best_rd); if (rate_y == INT_MAX) continue; @@ -3424,8 +3423,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; } else { - mbmi->mode = this_mode; - compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME); this_rd = handle_inter_mode(cpi, x, tile, bsize, tx_cache, &rate2, &distortion2, &skippable, @@ -3437,14 +3434,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, single_newmv, &total_sse, best_rd); if (this_rd == INT64_MAX) continue; - } - if (cm->reference_mode == REFERENCE_MODE_SELECT) - rate2 += compmode_cost; + compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred); + + if (cm->reference_mode == REFERENCE_MODE_SELECT) + rate2 += compmode_cost; + } // Estimate the reference frame signaling cost and add it // to the rolling cost variable. - if (second_ref_frame > INTRA_FRAME) { + if (comp_pred) { rate2 += ref_costs_comp[ref_frame]; } else { rate2 += ref_costs_single[ref_frame]; @@ -3552,7 +3551,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // TODO(debargha): enhance this test with a better distortion prediction // based on qp, activity mask and history - if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) && + if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) && (mode_index > MIN_EARLY_TERM_INDEX)) { const int qstep = xd->plane[0].dequant[1]; // TODO(debargha): Enhance this by specializing for each mode_index @@ -3662,17 +3661,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } - // Flag all modes that have a distortion thats > 2x the best we found at - // this level. 
- for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) { - if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV) - continue; - - if (mode_distortions[mode_index] > 2 * *returndistortion) { - ctx->modes_with_high_error |= (1 << mode_index); - } - } - assert((cm->interp_filter == SWITCHABLE) || (cm->interp_filter == best_mbmode.interp_filter) || !is_inter_block(&best_mbmode)); @@ -3787,6 +3775,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int_mv seg_mvs[4][MAX_REF_FRAMES]; b_mode_info best_bmodes[4]; int best_skip2 = 0; + int ref_frame_mask = 0; + int mode_skip_mask = 0; x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4); @@ -3822,13 +3812,12 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[ZEROMV][ref_frame].as_int = 0; } - cpi->ref_frame_mask = 0; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) { int i; for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) { - cpi->ref_frame_mask |= (1 << ref_frame); + ref_frame_mask |= (1 << ref_frame); break; } } @@ -3861,23 +3850,23 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, if (mode_index == 3) { switch (vp9_ref_order[best_mode_index].ref_frame[0]) { case INTRA_FRAME: - cpi->mode_skip_mask = 0; + mode_skip_mask = 0; break; case LAST_FRAME: - cpi->mode_skip_mask = 0x0010; + mode_skip_mask = 0x0010; break; case GOLDEN_FRAME: - cpi->mode_skip_mask = 0x0008; + mode_skip_mask = 0x0008; break; case ALTREF_FRAME: - cpi->mode_skip_mask = 0x0000; + mode_skip_mask = 0x0000; break; case NONE: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); } } - if (cpi->mode_skip_mask & ((int64_t)1 << mode_index)) + if (mode_skip_mask & (1 << mode_index)) continue; } @@ -4137,11 +4126,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, if (tmp_rd == INT64_MAX) continue; } else { - if (cm->interp_filter == SWITCHABLE) { - int rs = get_switchable_rate(x); - tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0); - } - tmp_rd = tmp_best_rdu; total_sse = tmp_best_sse; rate = tmp_best_rate; rate_y = tmp_best_ratey; @@ -4173,7 +4157,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // then dont bother looking at UV vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8); - super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable, + super_block_uvrd(x, &rate_uv, &distortion_uv, &uv_skippable, &uv_sse, BLOCK_8X8, tmp_best_rdu); if (rate_uv == INT_MAX) continue; @@ -4392,7 +4376,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, if (best_rd == INT64_MAX && bsize < BLOCK_8X8) { *returnrate = INT_MAX; - *returndistortion = INT_MAX; + *returndistortion = INT64_MAX; return best_rd; } diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index 96cea4216..6b85d67f8 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -80,10 +80,10 @@ void vp9_init_me_luts(); void vp9_set_mbmode_and_mvs(MACROBLOCKD *xd, MB_PREDICTION_MODE mode, const MV *mv); -void vp9_get_entropy_contexts(TX_SIZE tx_size, - ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16], - const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left, - int num_4x4_w, int num_4x4_h); +void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[16], + ENTROPY_CONTEXT 
t_left[16]); #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c index 0766b5107..4e6efaeb9 100644 --- a/vp9/encoder/vp9_resize.c +++ b/vp9/encoder/vp9_resize.c @@ -14,6 +14,7 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> + #include "vp9/common/vp9_common.h" #include "vp9/encoder/vp9_resize.h" @@ -24,9 +25,6 @@ #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1) #define INTERP_PRECISION_BITS 32 -#define ROUND_POWER_OF_TWO(value, n) \ - (((value) + (1 << ((n) - 1))) >> (n)) - typedef int16_t interp_kernel[INTERP_TAPS]; // Filters for interpolation (0.5-band) - note this also filters integer pels. diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad.c index 58c5df47e..58c5df47e 100644 --- a/vp9/encoder/vp9_sad_c.c +++ b/vp9/encoder/vp9_sad.c diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index e822e4c64..502e4b678 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -20,7 +20,6 @@ #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mcomp.h" #include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/encoder/vp9_psnr.h" #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_segmentation.h" @@ -29,7 +28,6 @@ #include "vpx_scale/vpx_scale.h" #define ALT_REF_MC_ENABLED 1 // dis/enable MC in AltRef filtering -#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, uint8_t *y_mb_ptr, @@ -134,17 +132,16 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, int sadpb = x->sadperbit16; int bestsme = INT_MAX; - int_mv best_ref_mv1; - int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ - int_mv *ref_mv; + MV best_ref_mv1 = {0, 0}; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + MV *ref_mv = &x->e_mbd.mi_8x8[0]->bmi[0].as_mv[0].as_mv; // Save input state struct buf_2d src = x->plane[0].src; struct buf_2d pre = xd->plane[0].pre[0]; - best_ref_mv1.as_int = 0; - best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >> 3; - best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >> 3; + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; // Setup frame pointers x->plane[0].src.buf = arf_frame_buf; @@ -161,21 +158,17 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, /*cpi->sf.search_method == HEX*/ // Ignore mv costing by sending NULL pointer instead of cost arrays - ref_mv = &x->e_mbd.mi_8x8[0]->bmi[0].as_mv[0]; - bestsme = vp9_hex_search(x, &best_ref_mv1_full.as_mv, - step_param, sadpb, 1, - &cpi->fn_ptr[BLOCK_16X16], - 0, &best_ref_mv1.as_mv, &ref_mv->as_mv); + vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1, + &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv); -#if ALT_REF_SUBPEL_ENABLED // Try sub-pixel MC? 
// if (bestsme > error_thresh && bestsme < INT_MAX) { int distortion; unsigned int sse; // Ignore mv costing by sending NULL pointer instead of cost array - bestsme = cpi->find_fractional_mv_step(x, &ref_mv->as_mv, - &best_ref_mv1.as_mv, + bestsme = cpi->find_fractional_mv_step(x, ref_mv, + &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], @@ -183,7 +176,6 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, NULL, NULL, &distortion, &sse); } -#endif // Restore input state x->plane[0].src = src; @@ -523,11 +515,16 @@ void vp9_configure_arnr_filter(VP9_COMP *cpi, cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd; // Adjust the strength based on active max q - q = ((int)vp9_convert_qindex_to_q(cpi->rc.active_worst_quality) >> 1); - if (q > 8) { + if (cpi->common.current_video_frame > 1) + q = ((int)vp9_convert_qindex_to_q( + cpi->rc.avg_frame_qindex[INTER_FRAME])); + else + q = ((int)vp9_convert_qindex_to_q( + cpi->rc.avg_frame_qindex[KEY_FRAME])); + if (q > 16) { cpi->active_arnr_strength = cpi->oxcf.arnr_strength; } else { - cpi->active_arnr_strength = cpi->oxcf.arnr_strength - (8 - q); + cpi->active_arnr_strength = cpi->oxcf.arnr_strength - ((16 - q) / 2); if (cpi->active_arnr_strength < 0) cpi->active_arnr_strength = 0; } diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index ed1301a8a..7ae110707 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -23,8 +23,8 @@ static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2]; const TOKENVALUE *vp9_dct_value_tokens_ptr; -static int dct_value_cost[DCT_MAX_VALUE * 2]; -const int *vp9_dct_value_cost_ptr; +static int16_t dct_value_cost[DCT_MAX_VALUE * 2]; +const int16_t *vp9_dct_value_cost_ptr; // Array indices are identical to previously-existing CONTEXT_NODE indices const vp9_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = { @@ -160,7 +160,6 @@ struct tokenize_b_args { VP9_COMP *cpi; MACROBLOCKD *xd; TOKENEXTRA **tp; - TX_SIZE tx_size; uint8_t *token_cache; }; @@ -188,6 +187,18 @@ static INLINE void add_token(TOKENEXTRA **t, const vp9_prob *context_tree, ++counts[token]; } +static INLINE void add_token_no_extra(TOKENEXTRA **t, + const vp9_prob *context_tree, + uint8_t token, + uint8_t skip_eob_node, + unsigned int *counts) { + (*t)->token = token; + (*t)->context_tree = context_tree; + (*t)->skip_eob_node = skip_eob_node; + (*t)++; + ++counts[token]; +} + static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { struct tokenize_b_args* const args = arg; @@ -199,17 +210,22 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, struct macroblockd_plane *pd = &xd->plane[plane]; MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; int pt; /* near block/prev token context index */ - int c = 0; + int c; TOKENEXTRA *t = *tp; /* store tokens starting here */ int eob = p->eobs[block]; const PLANE_TYPE type = pd->plane_type; - const int16_t *qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block); + const int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); const int segment_id = mbmi->segment_id; const int16_t *scan, *nb; const scan_order *so; - vp9_coeff_count *const counts = cpi->coef_counts[tx_size]; - vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size]; const int ref = is_inter_block(mbmi); + unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] = + cpi->coef_counts[tx_size][type][ref]; + vp9_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] = + 
cpi->common.fc.coef_probs[tx_size][type][ref]; + unsigned int (*const eob_branch)[COEFF_CONTEXTS] = + cpi->common.counts.eob_branch[tx_size][type][ref]; + const uint8_t *const band = get_band_translate(tx_size); const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size); @@ -225,27 +241,26 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, while (c < eob) { int v = 0; int skip_eob = 0; - v = qcoeff_ptr[scan[c]]; + v = qcoeff[scan[c]]; while (!v) { - add_token(&t, coef_probs[type][ref][band[c]][pt], 0, ZERO_TOKEN, skip_eob, - counts[type][ref][band[c]][pt]); - - cpi->common.counts.eob_branch[tx_size][type][ref][band[c]][pt] += - !skip_eob; + add_token_no_extra(&t, coef_probs[band[c]][pt], ZERO_TOKEN, skip_eob, + counts[band[c]][pt]); + eob_branch[band[c]][pt] += !skip_eob; skip_eob = 1; token_cache[scan[c]] = 0; ++c; pt = get_coef_context(nb, token_cache, c); - v = qcoeff_ptr[scan[c]]; + v = qcoeff[scan[c]]; } - add_token(&t, coef_probs[type][ref][band[c]][pt], - vp9_dct_value_tokens_ptr[v].extra, - vp9_dct_value_tokens_ptr[v].token, skip_eob, - counts[type][ref][band[c]][pt]); - cpi->common.counts.eob_branch[tx_size][type][ref][band[c]][pt] += !skip_eob; + add_token(&t, coef_probs[band[c]][pt], + vp9_dct_value_tokens_ptr[v].extra, + (uint8_t)vp9_dct_value_tokens_ptr[v].token, + (uint8_t)skip_eob, + counts[band[c]][pt]); + eob_branch[band[c]][pt] += !skip_eob; token_cache[scan[c]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[v].token]; @@ -253,9 +268,9 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, pt = get_coef_context(nb, token_cache, c); } if (c < seg_eob) { - add_token(&t, coef_probs[type][ref][band[c]][pt], 0, EOB_TOKEN, 0, - counts[type][ref][band[c]][pt]); - ++cpi->common.counts.eob_branch[tx_size][type][ref][band[c]][pt]; + add_token_no_extra(&t, coef_probs[band[c]][pt], EOB_TOKEN, 0, + counts[band[c]][pt]); + ++eob_branch[band[c]][pt]; } *tp = t; @@ -299,8 +314,8 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, const int ctx = vp9_get_skip_context(xd); const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); - struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size, cpi->mb.token_cache}; - if (mbmi->skip_coeff) { + struct tokenize_b_args arg = {cpi, xd, t, cpi->mb.token_cache}; + if (mbmi->skip) { if (!dry_run) cm->counts.skip[ctx][1] += skip_inc; reset_skip_context(xd, bsize); diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h index ea86240be..063c0bafe 100644 --- a/vp9/encoder/vp9_tokenize.h +++ b/vp9/encoder/vp9_tokenize.h @@ -47,7 +47,7 @@ struct VP9_COMP; void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize); -extern const int *vp9_dct_value_cost_ptr; +extern const int16_t *vp9_dct_value_cost_ptr; /* TODO: The Token field should be broken out into a separate char array to * improve cache locality, since it's needed for costing when the rest of the * fields are not. 
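The tokenize_b() hunk above binds the loop-invariant part of the coefficient tables once (counts, coef_probs and eob_branch are all pre-indexed by [tx_size][type][ref] before the per-coefficient loop), so the hot path only indexes by band and context. A minimal standalone sketch of that hoisting pattern follows; the dimensions and names are made up for illustration and are not the real vp9 definitions.

#include <stdio.h>

#define TX_SIZES 4
#define PLANE_TYPES 2
#define REF_TYPES 2
#define BANDS 6
#define CONTEXTS 6
#define TOKENS 12

/* Global counter table shaped like cpi->coef_counts (hypothetical layout). */
static unsigned int counts[TX_SIZES][PLANE_TYPES][REF_TYPES][BANDS][CONTEXTS][TOKENS];

/* Before: every token re-derives the full six-level index. */
static void count_token_slow(int tx, int type, int ref,
                             int band, int ctx, int tok) {
  ++counts[tx][type][ref][band][ctx][tok];
}

/* After: the caller binds the invariant [tx][type][ref] part once and the
 * per-token work only indexes by [band][ctx][tok]. */
static void count_token_fast(unsigned int (*c)[CONTEXTS][TOKENS],
                             int band, int ctx, int tok) {
  ++c[band][ctx][tok];
}

int main(void) {
  int tx = 1, type = 0, ref = 1;
  /* Bind the loop-invariant part of the index once, as tokenize_b() now
   * does for counts, coef_probs and eob_branch. */
  unsigned int (*bound)[CONTEXTS][TOKENS] = counts[tx][type][ref];
  int i;

  for (i = 0; i < 10; ++i)
    count_token_fast(bound, i % BANDS, i % CONTEXTS, i % TOKENS);
  count_token_slow(tx, type, ref, 0, 0, 0);

  printf("counts[1][0][1][0][0][0] = %u\n", counts[1][0][1][0][0][0]);
  return 0;
}

Binding the pointer once is also what lets add_token() and add_token_no_extra() take a flat probs/counts argument instead of re-deriving the six-level index for every token, which is the likely motivation for the restructuring.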
diff --git a/vp9/encoder/vp9_vaq.c b/vp9/encoder/vp9_vaq.c index 1f9cb8709..600029b19 100644 --- a/vp9/encoder/vp9_vaq.c +++ b/vp9/encoder/vp9_vaq.c @@ -19,8 +19,8 @@ #include "vp9/encoder/vp9_segmentation.h" #include "vp9/common/vp9_systemdependent.h" -#define ENERGY_MIN (-3) -#define ENERGY_MAX (3) +#define ENERGY_MIN (-1) +#define ENERGY_MAX (1) #define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1) #define ENERGY_IN_BOUNDS(energy)\ assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX) @@ -44,7 +44,7 @@ unsigned int vp9_vaq_segment_id(int energy) { double vp9_vaq_rdmult_ratio(int energy) { ENERGY_IN_BOUNDS(energy); - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); return RDMULT_RATIO(energy); } @@ -52,7 +52,7 @@ double vp9_vaq_rdmult_ratio(int energy) { double vp9_vaq_inv_q_ratio(int energy) { ENERGY_IN_BOUNDS(energy); - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); return Q_RATIO(-energy); } @@ -63,9 +63,9 @@ void vp9_vaq_init() { assert(ENERGY_SPAN <= MAX_SEGMENTS); - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); - base_ratio = 1.8; + base_ratio = 1.5; for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) { Q_RATIO(i) = pow(base_ratio, i/3.0); @@ -75,35 +75,39 @@ void vp9_vaq_init() { void vp9_vaq_frame_setup(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; struct segmentation *seg = &cm->seg; - int base_q = vp9_convert_qindex_to_q(cm->base_qindex); - int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + - cm->y_dc_delta_q); + const double base_q = vp9_convert_qindex_to_q(cm->base_qindex); + const int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + + cm->y_dc_delta_q); int i; - vp9_enable_segmentation((VP9_PTR)cpi); - vp9_clearall_segfeatures(seg); + if (cm->frame_type == KEY_FRAME || + cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + vp9_enable_segmentation((VP9_PTR)cpi); + vp9_clearall_segfeatures(seg); - seg->abs_delta = SEGMENT_DELTADATA; + seg->abs_delta = SEGMENT_DELTADATA; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); - for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) { - int qindex_delta, segment_rdmult; + for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) { + int qindex_delta, segment_rdmult; - if (Q_RATIO(i) == 1) { - // No need to enable SEG_LVL_ALT_Q for this segment - RDMULT_RATIO(i) = 1; - continue; - } + if (Q_RATIO(i) == 1) { + // No need to enable SEG_LVL_ALT_Q for this segment + RDMULT_RATIO(i) = 1; + continue; + } - qindex_delta = vp9_compute_qdelta(cpi, base_q, base_q * Q_RATIO(i)); - vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta); - vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q); + qindex_delta = vp9_compute_qdelta(cpi, base_q, base_q * Q_RATIO(i)); + vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta); + vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q); - segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta + - cm->y_dc_delta_q); + segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta + + cm->y_dc_delta_q); - RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult; + RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult; + } } } @@ -137,11 +141,8 @@ int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { double energy; unsigned int var = block_variance(cpi, x, bs); - vp9_clear_system_state(); // __asm emms; - - // if (var <= 1000) - // return 0; + vp9_clear_system_state(); - energy = 0.9*(logf(var + 1) - 10.0); - return 
clamp(round(energy), ENERGY_MIN, ENERGY_MAX); + energy = 0.9 * (log(var + 1.0) - 10.0); + return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX); } diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance.c index 8bc385089..8bc385089 100644 --- a/vp9/encoder/vp9_variance_c.c +++ b/vp9/encoder/vp9_variance.c diff --git a/vp9/encoder/vp9_write_bit_buffer.h b/vp9/encoder/vp9_write_bit_buffer.h index 5958b4806..1795e05e4 100644 --- a/vp9/encoder/vp9_write_bit_buffer.h +++ b/vp9/encoder/vp9_write_bit_buffer.h @@ -29,7 +29,7 @@ static size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb) { } static void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) { - const int off = wb->bit_offset; + const int off = (int)wb->bit_offset; const int p = off / CHAR_BIT; const int q = CHAR_BIT - 1 - off % CHAR_BIT; if (q == CHAR_BIT -1) { diff --git a/vp9/encoder/vp9_writer.c b/vp9/encoder/vp9_writer.c index 3d13d07b6..fda1b390e 100644 --- a/vp9/encoder/vp9_writer.c +++ b/vp9/encoder/vp9_writer.c @@ -12,11 +12,6 @@ #include "vp9/encoder/vp9_writer.h" #include "vp9/common/vp9_entropy.h" -#if defined(SECTIONBITS_OUTPUT) -unsigned __int64 Sectionbits[500]; - -#endif - #ifdef ENTROPY_STATS unsigned int active_section = 0; #endif diff --git a/vp9/encoder/vp9_writer.h b/vp9/encoder/vp9_writer.h index 62f555c99..defeec377 100644 --- a/vp9/encoder/vp9_writer.h +++ b/vp9/encoder/vp9_writer.h @@ -44,17 +44,6 @@ static void vp9_write(vp9_writer *br, int bit, int probability) { unsigned int lowvalue = br->lowvalue; register unsigned int shift; -#ifdef ENTROPY_STATS -#if defined(SECTIONBITS_OUTPUT) - - if (bit) - Sectionbits[active_section] += vp9_prob_cost[255 - probability]; - else - Sectionbits[active_section] += vp9_prob_cost[probability]; - -#endif -#endif - split = 1 + (((range - 1) * probability) >> 8); range = split; diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vp9/encoder/x86/vp9_dct_avx2.c index ea031fb07..b5269ed03 100644 --- a/vp9/encoder/x86/vp9_dct_avx2.c +++ b/vp9/encoder/x86/vp9_dct_avx2.c @@ -16,7 +16,7 @@ void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we tranpose the columns (that + // as the first pass results are transposed, we transpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). int pass; @@ -46,7 +46,7 @@ void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) { in3 = _mm_slli_epi16(in3, 4); // if (i == 0 && input[0]) input[0] += 1; { - // The mask will only contain wether the first value is zero, all + // The mask will only contain whether the first value is zero, all // other comparison will fail as something shifted by 4 (above << 4) // can never be equal to one. 
To increment in the non-zero case, we // add the mask and one for the first element: @@ -59,7 +59,7 @@ void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) { } // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { - // Transform 1/2: Add/substract + // Transform 1/2: Add/subtract const __m128i r0 = _mm_add_epi16(in0, in3); const __m128i r1 = _mm_add_epi16(in1, in2); const __m128i r2 = _mm_sub_epi16(in1, in2); @@ -244,32 +244,36 @@ void fadst4_avx2(__m128i *in) { transpose_4x4_avx2(in); } -void vp9_short_fht4x4_avx2(const int16_t *input, int16_t *output, - int stride, int tx_type) { +void vp9_fht4x4_avx2(const int16_t *input, int16_t *output, + int stride, int tx_type) { __m128i in[4]; - load_buffer_4x4_avx2(input, in, stride); + switch (tx_type) { - case 0: // DCT_DCT - fdct4_avx2(in); - fdct4_avx2(in); + case DCT_DCT: + vp9_fdct4x4_avx2(input, output, stride); break; - case 1: // ADST_DCT + case ADST_DCT: + load_buffer_4x4_avx2(input, in, stride); fadst4_avx2(in); fdct4_avx2(in); + write_buffer_4x4_avx2(output, in); break; - case 2: // DCT_ADST + case DCT_ADST: + load_buffer_4x4_avx2(input, in, stride); fdct4_avx2(in); fadst4_avx2(in); + write_buffer_4x4_avx2(output, in); break; - case 3: // ADST_ADST + case ADST_ADST: + load_buffer_4x4_avx2(input, in, stride); fadst4_avx2(in); fadst4_avx2(in); + write_buffer_4x4_avx2(output, in); break; default: assert(0); break; } - write_buffer_4x4_avx2(output, in); } void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) { @@ -313,7 +317,7 @@ void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) { for (pass = 0; pass < 2; pass++) { // To store results of each pass before the transpose. __m128i res0, res1, res2, res3, res4, res5, res6, res7; - // Add/substract + // Add/subtract const __m128i q0 = _mm_add_epi16(in0, in7); const __m128i q1 = _mm_add_epi16(in1, in6); const __m128i q2 = _mm_add_epi16(in2, in5); @@ -324,7 +328,7 @@ void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) { const __m128i q7 = _mm_sub_epi16(in0, in7); // Work on first four results { - // Add/substract + // Add/subtract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); @@ -386,7 +390,7 @@ void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) { // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/substract + // Add/subtract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); @@ -1028,40 +1032,46 @@ void fadst8_avx2(__m128i *in) { array_transpose_8x8_avx2(in, in); } -void vp9_short_fht8x8_avx2(const int16_t *input, int16_t *output, - int stride, int tx_type) { +void vp9_fht8x8_avx2(const int16_t *input, int16_t *output, + int stride, int tx_type) { __m128i in[8]; - load_buffer_8x8_avx2(input, in, stride); + switch (tx_type) { - case 0: // DCT_DCT - fdct8_avx2(in); - fdct8_avx2(in); + case DCT_DCT: + vp9_fdct8x8_avx2(input, output, stride); break; - case 1: // ADST_DCT + case ADST_DCT: + load_buffer_8x8_avx2(input, in, stride); fadst8_avx2(in); fdct8_avx2(in); + right_shift_8x8_avx2(in, 1); + write_buffer_8x8_avx2(output, in, 8); break; - case 2: // DCT_ADST + case DCT_ADST: + load_buffer_8x8_avx2(input, in, stride); fdct8_avx2(in); fadst8_avx2(in); + right_shift_8x8_avx2(in, 1); + write_buffer_8x8_avx2(output, in, 8); break; - case 3: // ADST_ADST + case ADST_ADST: 
+ load_buffer_8x8_avx2(input, in, stride); fadst8_avx2(in); fadst8_avx2(in); + right_shift_8x8_avx2(in, 1); + write_buffer_8x8_avx2(output, in, 8); break; default: assert(0); break; } - right_shift_8x8_avx2(in, 1); - write_buffer_8x8_avx2(output, in, 8); } void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we tranpose the columns (that + // as the first pass results are transposed, we transpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). int pass; @@ -1218,7 +1228,7 @@ void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { } // Work on the first eight values; fdct8(input, even_results); { - // Add/substract + // Add/subtract const __m128i q0 = _mm_add_epi16(input0, input7); const __m128i q1 = _mm_add_epi16(input1, input6); const __m128i q2 = _mm_add_epi16(input2, input5); @@ -1229,7 +1239,7 @@ void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { const __m128i q7 = _mm_sub_epi16(input0, input7); // Work on first four results { - // Add/substract + // Add/subtract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); @@ -1293,7 +1303,7 @@ void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/substract + // Add/subtract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); @@ -2534,36 +2544,39 @@ void fadst16_avx2(__m128i *in0, __m128i *in1) { array_transpose_16x16_avx2(in0, in1); } -void vp9_short_fht16x16_avx2(const int16_t *input, int16_t *output, - int stride, int tx_type) { +void vp9_fht16x16_avx2(const int16_t *input, int16_t *output, + int stride, int tx_type) { __m128i in0[16], in1[16]; - load_buffer_16x16_avx2(input, in0, in1, stride); + switch (tx_type) { - case 0: // DCT_DCT - fdct16_avx2(in0, in1); - right_shift_16x16_avx2(in0, in1); - fdct16_avx2(in0, in1); + case DCT_DCT: + vp9_fdct16x16_avx2(input, output, stride); break; - case 1: // ADST_DCT + case ADST_DCT: + load_buffer_16x16_avx2(input, in0, in1, stride); fadst16_avx2(in0, in1); right_shift_16x16_avx2(in0, in1); fdct16_avx2(in0, in1); + write_buffer_16x16_avx2(output, in0, in1, 16); break; - case 2: // DCT_ADST + case DCT_ADST: + load_buffer_16x16_avx2(input, in0, in1, stride); fdct16_avx2(in0, in1); right_shift_16x16_avx2(in0, in1); fadst16_avx2(in0, in1); + write_buffer_16x16_avx2(output, in0, in1, 16); break; - case 3: // ADST_ADST + case ADST_ADST: + load_buffer_16x16_avx2(input, in0, in1, stride); fadst16_avx2(in0, in1); right_shift_16x16_avx2(in0, in1); fadst16_avx2(in0, in1); + write_buffer_16x16_avx2(output, in0, in1, 16); break; default: assert(0); break; } - write_buffer_16x16_avx2(output, in0, in1, 16); } #define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2 diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c index c876cc273..f3735ebd3 100644 --- a/vp9/encoder/x86/vp9_dct_sse2.c +++ b/vp9/encoder/x86/vp9_dct_sse2.c @@ -16,7 +16,7 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { // The 2D 
transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we tranpose the columns (that + // as the first pass results are transposed, we transpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). int pass; @@ -47,7 +47,7 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { in1 = _mm_slli_epi16(in1, 4); // if (i == 0 && input[0]) input[0] += 1; { - // The mask will only contain wether the first value is zero, all + // The mask will only contain whether the first value is zero, all // other comparison will fail as something shifted by 4 (above << 4) // can never be equal to one. To increment in the non-zero case, we // add the mask and one for the first element: @@ -60,7 +60,7 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { } // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { - // Transform 1/2: Add/substract + // Transform 1/2: Add/subtract const __m128i r0 = _mm_add_epi16(in0, in1); const __m128i r1 = _mm_sub_epi16(in0, in1); const __m128i r2 = _mm_unpacklo_epi64(r0, r1); @@ -242,32 +242,36 @@ void fadst4_sse2(__m128i *in) { transpose_4x4(in); } -void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output, - int stride, int tx_type) { +void vp9_fht4x4_sse2(const int16_t *input, int16_t *output, + int stride, int tx_type) { __m128i in[4]; - load_buffer_4x4(input, in, stride); + switch (tx_type) { - case 0: // DCT_DCT - fdct4_sse2(in); - fdct4_sse2(in); + case DCT_DCT: + vp9_fdct4x4_sse2(input, output, stride); break; - case 1: // ADST_DCT + case ADST_DCT: + load_buffer_4x4(input, in, stride); fadst4_sse2(in); fdct4_sse2(in); + write_buffer_4x4(output, in); break; - case 2: // DCT_ADST + case DCT_ADST: + load_buffer_4x4(input, in, stride); fdct4_sse2(in); fadst4_sse2(in); + write_buffer_4x4(output, in); break; - case 3: // ADST_ADST + case ADST_ADST: + load_buffer_4x4(input, in, stride); fadst4_sse2(in); fadst4_sse2(in); + write_buffer_4x4(output, in); break; - default: - assert(0); - break; + default: + assert(0); + break; } - write_buffer_4x4(output, in); } void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { @@ -311,7 +315,7 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { for (pass = 0; pass < 2; pass++) { // To store results of each pass before the transpose. 
__m128i res0, res1, res2, res3, res4, res5, res6, res7; - // Add/substract + // Add/subtract const __m128i q0 = _mm_add_epi16(in0, in7); const __m128i q1 = _mm_add_epi16(in1, in6); const __m128i q2 = _mm_add_epi16(in2, in5); @@ -322,7 +326,7 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { const __m128i q7 = _mm_sub_epi16(in0, in7); // Work on first four results { - // Add/substract + // Add/subtract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); @@ -384,7 +388,7 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/substract + // Add/subtract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); @@ -1026,40 +1030,46 @@ void fadst8_sse2(__m128i *in) { array_transpose_8x8(in, in); } -void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output, - int stride, int tx_type) { +void vp9_fht8x8_sse2(const int16_t *input, int16_t *output, + int stride, int tx_type) { __m128i in[8]; - load_buffer_8x8(input, in, stride); + switch (tx_type) { - case 0: // DCT_DCT - fdct8_sse2(in); - fdct8_sse2(in); + case DCT_DCT: + vp9_fdct8x8_sse2(input, output, stride); break; - case 1: // ADST_DCT + case ADST_DCT: + load_buffer_8x8(input, in, stride); fadst8_sse2(in); fdct8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); break; - case 2: // DCT_ADST + case DCT_ADST: + load_buffer_8x8(input, in, stride); fdct8_sse2(in); fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); break; - case 3: // ADST_ADST + case ADST_ADST: + load_buffer_8x8(input, in, stride); fadst8_sse2(in); fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); break; default: assert(0); break; } - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); } void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we tranpose the columns (that + // as the first pass results are transposed, we transpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). 
int pass; @@ -1216,7 +1226,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { } // Work on the first eight values; fdct8(input, even_results); { - // Add/substract + // Add/subtract const __m128i q0 = _mm_add_epi16(input0, input7); const __m128i q1 = _mm_add_epi16(input1, input6); const __m128i q2 = _mm_add_epi16(input2, input5); @@ -1227,7 +1237,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { const __m128i q7 = _mm_sub_epi16(input0, input7); // Work on first four results { - // Add/substract + // Add/subtract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); @@ -1291,7 +1301,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/substract + // Add/subtract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); @@ -2532,36 +2542,39 @@ void fadst16_sse2(__m128i *in0, __m128i *in1) { array_transpose_16x16(in0, in1); } -void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output, - int stride, int tx_type) { +void vp9_fht16x16_sse2(const int16_t *input, int16_t *output, + int stride, int tx_type) { __m128i in0[16], in1[16]; - load_buffer_16x16(input, in0, in1, stride); + switch (tx_type) { - case 0: // DCT_DCT - fdct16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fdct16_sse2(in0, in1); + case DCT_DCT: + vp9_fdct16x16_sse2(input, output, stride); break; - case 1: // ADST_DCT + case ADST_DCT: + load_buffer_16x16(input, in0, in1, stride); fadst16_sse2(in0, in1); right_shift_16x16(in0, in1); fdct16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); break; - case 2: // DCT_ADST + case DCT_ADST: + load_buffer_16x16(input, in0, in1, stride); fdct16_sse2(in0, in1); right_shift_16x16(in0, in1); fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); break; - case 3: // ADST_ADST + case ADST_ADST: + load_buffer_16x16(input, in0, in1, stride); fadst16_sse2(in0, in1); right_shift_16x16(in0, in1); fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); break; default: assert(0); break; } - write_buffer_16x16(output, in0, in1, 16); } #define FDCT32x32_2D vp9_fdct32x32_rd_sse2 diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3.asm index db306603b..48ccef8cc 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm @@ -188,7 +188,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmaxsw m8, m7 pshuflw m7, m8, 0x1 pmaxsw m8, m7 - pextrw [r2], m8, 0 + pextrw r6, m8, 0 + mov [r2], r6 RET ; skip-block, i.e. just write all zeroes @@ -214,5 +215,5 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %endmacro INIT_XMM ssse3 -QUANTIZE_FN b, 6 +QUANTIZE_FN b, 7 QUANTIZE_FN b_32x32, 7 diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c new file mode 100644 index 000000000..b8bfa8900 --- /dev/null +++ b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c @@ -0,0 +1,641 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <immintrin.h> // AVX2 +#include "vpx_ports/mem.h" +#include "vp9/encoder/vp9_variance.h" + +DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, + 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, + 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, + 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, + 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, + 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, + 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, + 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, + 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15 +}; + +unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + int height, + unsigned int *sse) { + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + __m256i zero_reg; + int i, sum; + sum_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_set1_epi16(0); + zero_reg = _mm256_set1_epi16(0); + + if (x_offset == 0) { + // x_offset = 0 and y_offset = 0 + if (y_offset == 0) { + for (i = 0; i < height ; i++) { + // load source and destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + + // expend each byte to 2 bytes + exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg); + + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + + // calculate sum + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, 
exp_src_hi); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + + src+= src_stride; + dst+= dst_stride; + } + // x_offset = 0 and y_offset = 8 + } else if (y_offset == 8) { + __m256i src_next_reg; + for (i = 0; i < height ; i++) { + // load source + next source + destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) + (src + src_stride)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + // average between current and next stride source + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + + // expend each byte to 2 bytes + exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg); + + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + + // calculate sum + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + + src+= src_stride; + dst+= dst_stride; + } + // x_offset = 0 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg; +#if (ARCH_X86_64) + int64_t y_offset64; + y_offset64 = y_offset; + y_offset64 <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset64)); +#else + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); +#endif + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height ; i++) { + // load current and next source + destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) + (src + src_stride)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + + // merge current and next source + exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg); + + // filter the source + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); + + // add 8 to the source + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); + + // divide by 16 + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + + // expand each byte to 2 byte in the destination + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + + // calculate sum + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + + src+= src_stride; + dst+= dst_stride; + } + } + // x_offset = 8 and y_offset = 0 + } else if (x_offset == 8) { + if (y_offset == 0) { + __m256i src_next_reg; + for (i = 0; i < height ; i++) { + // load 
source and another source starting from the next + // following byte + destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + + // average between source and the next byte following source + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + + // expand each byte to 2 bytes + exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg); + + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + + // calculate sum + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + + src+= src_stride; + dst+= dst_stride; + } + // x_offset = 8 and y_offset = 8 + } else if (y_offset == 8) { + __m256i src_next_reg, src_avg; + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + + // average between source and the next byte following source + src_avg = _mm256_avg_epu8(src_reg, src_next_reg); + for (i = 0; i < height ; i++) { + src+= src_stride; + // load source and another source starting from the next + // following byte + destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + // average between source and the next byte following source + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + + // expand each byte to 2 bytes + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + // average between previous average to current average + src_avg = _mm256_avg_epu8(src_avg, src_reg); + // expand each byte to 2 bytes + exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg); + + // save current source average + src_avg = src_reg; + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + + // calculate sum + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + + dst+= dst_stride; + } + // x_offset = 8 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg, src_avg; +#if (ARCH_X86_64) + int64_t y_offset64; + y_offset64 = y_offset; + y_offset64 <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset64)); +#else + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); +#endif + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = 
_mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + // average between source and the next byte following source + src_avg = _mm256_avg_epu8(src_reg, src_next_reg); + for (i = 0; i < height ; i++) { + src+= src_stride; + // load source and another source starting from the next + // following byte + destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + // average between source and the next byte following source + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + + // merge previous average and current average + exp_src_lo = _mm256_unpacklo_epi8(src_avg, src_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_avg, src_reg); + + // filter the source + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); + + // add 8 to the source + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); + + // divide the source by 16 + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + + // expand each byte to 2 bytes + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + // save current source average + src_avg = src_reg; + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + + // calculate sum + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + + dst+= dst_stride; + } + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + __m256i filter, pw8, src_next_reg; +#if (ARCH_X86_64) + int64_t x_offset64; + x_offset64 = x_offset; + x_offset64 <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset64)); +#else + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); +#endif + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height ; i++) { + // load source and another source starting from the next + // following byte + destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + + // merge current and next source + exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg); + + // filter the source + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); + + // add 8 to source + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); + + // divide the source by 16 + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + + // expand each byte to 2 bytes + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + + // calculate sum 
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + + src+= src_stride; + dst+= dst_stride; + } + // x_offset = bilin interpolation and y_offset = 8 + } else if (y_offset == 8) { + __m256i filter, pw8, src_next_reg, src_pack; +#if (ARCH_X86_64) + int64_t x_offset64; + x_offset64 = x_offset; + x_offset64 <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset64)); +#else + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); +#endif + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + + // merge current and next stride source + exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg); + + // filter the source + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); + + // add 8 to source + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); + + // divide source by 16 + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height ; i++) { + src+= src_stride; + + // load source and another source starting from the next + // following byte + destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + + // merge current and next stride source + exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg); + + // filter the source + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); + + // add 8 to source + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); + + // divide source by 16 + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + + // convert each 16 bit to 8 bit to each low and high lane source + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + + // expand each byte to 2 bytes + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg); + + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + + // calculate sum + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + sse_reg = 
_mm256_add_epi32(sse_reg, exp_src_hi); + + // save previous pack + src_pack = src_reg; + dst+= dst_stride; + } + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; +#if (ARCH_X86_64) + int64_t x_offset64, y_offset64; + x_offset64 = x_offset; + x_offset64 <<= 5; + y_offset64 = y_offset; + y_offset64 <<= 5; + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset64)); + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset64)); +#else + x_offset <<= 5; + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + y_offset <<= 5; + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); +#endif + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + // merge current and next stride source + exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg); + + // filter the source + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, xfilter); + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, xfilter); + + // add 8 to the source + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); + + // divide the source by 16 + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height ; i++) { + src+= src_stride; + // load source and another source starting from the next + // following byte + destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + + // merge current and next stride source + exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg); + + // filter the source + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, xfilter); + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, xfilter); + + // add 8 to source + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); + + // divide source by 16 + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + + // convert each 16 bit to 8 bit to each low and high lane source + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + + // merge previous pack to current pack source + exp_src_lo = _mm256_unpacklo_epi8(src_pack, src_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_pack, src_reg); + + // filter the source + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, yfilter); + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, yfilter); + + // expand each byte to 2 bytes + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + // add 8 to source + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); + + // divide source by 16 + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + 
+ // caculate sum + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + + src_pack = src_reg; + dst+= dst_stride; + } + } + } + // sum < 0 + res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); + // save the next 8 bytes of each lane of sse + sse_reg_hi = _mm256_srli_si256(sse_reg, 8); + // merge the result of sum < 0 with sum to add sign to the next 16 bits + sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); + sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); + // add each 8 bytes from every lane of sse and sum + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); + sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); + + // save the next 4 bytes of each lane sse + sse_reg_hi = _mm256_srli_si256(sse_reg, 4); + // save the next 8 bytes of each lane of sum + sum_reg_hi = _mm256_srli_si256(sum_reg, 8); + + // add the first 4 bytes to the next 4 bytes sse + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); + // add the first 8 bytes to the next 8 bytes + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); + // extract the low lane and the high lane and add the results + *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + + _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); + sum_reg_hi = _mm256_srli_si256(sum_reg, 4); + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); + sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); + return sum; +} diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c index c9b90d52d..02007a3bd 100644 --- a/vp9/encoder/x86/vp9_variance_avx2.c +++ b/vp9/encoder/x86/vp9_variance_avx2.c @@ -42,6 +42,18 @@ void vp9_get32x32var_avx2 int *Sum ); +unsigned int vp9_sub_pixel_variance32xh_avx2 +( + const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + int height, + unsigned int *sse +); + static void variance_avx2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, int w, int h, unsigned int *sse, int *sum, @@ -155,3 +167,43 @@ unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, *sse = var; return (var - (((int64_t)avg * avg) >> 11)); } + +unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse_ptr) { + // processing 32 elements in parallel + unsigned int sse; + int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, + y_offset, dst, dst_stride, + 64, &sse); + // processing the next 32 elements in parallel + unsigned int sse2; + int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride, + x_offset, y_offset, + dst + 32, dst_stride, + 64, &sse2); + se += se2; + sse += sse2; + *sse_ptr = sse; + return sse - (((int64_t)se * se) >> 12); +} + +unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse_ptr) { + // processing 32 element in parallel + unsigned int sse; + int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, + y_offset, dst, dst_stride, + 32, &sse); + *sse_ptr = sse; + return sse - (((int64_t)se * se) 
>> 10); +} diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index c691411bf..9fb611504 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -23,7 +23,8 @@ VP9_COMMON_SRCS-yes += common/vp9_entropymode.c VP9_COMMON_SRCS-yes += common/vp9_entropymv.c VP9_COMMON_SRCS-yes += common/vp9_filter.c VP9_COMMON_SRCS-yes += common/vp9_filter.h -VP9_COMMON_SRCS-yes += common/generic/vp9_systemdependent.c +VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.c +VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.h VP9_COMMON_SRCS-yes += common/vp9_idct.c VP9_COMMON_SRCS-yes += common/vp9_alloccommon.h VP9_COMMON_SRCS-yes += common/vp9_blockd.h @@ -76,12 +77,15 @@ VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm +VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm +VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c +VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm endif -ifeq ($(USE_X86INC),yes) +ifeq ($(CONFIG_USE_X86INC),yes) VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 6b181710e..d7713fd3f 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -175,6 +175,23 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS); /*Spatial layers max */ + + RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS); + if (cfg->ts_number_layers > 1) { + unsigned int i; + for (i = 1; i < cfg->ts_number_layers; ++i) { + if (cfg->ts_target_bitrate[i] < cfg->ts_target_bitrate[i-1]) { + ERROR("ts_target_bitrate entries are not increasing"); + } + } + RANGE_CHECK(cfg, ts_rate_decimator[cfg->ts_number_layers-1], 1, 1); + for (i = cfg->ts_number_layers-2; i > 0; --i) { + if (cfg->ts_rate_decimator[i-1] != 2*cfg->ts_rate_decimator[i]) { + ERROR("ts_rate_decimator factors are not powers of 2"); + } + } + } + /* VP8 does not support a lower bound on the keyframe interval in * automatic keyframe placement mode. */ @@ -205,7 +222,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); FIRSTPASS_STATS *stats; - if (!cfg->rc_twopass_stats_in.buf) + if (cfg->rc_twopass_stats_in.buf == NULL) ERROR("rc_twopass_stats_in.buf not set."); if (cfg->rc_twopass_stats_in.sz % packet_sz) @@ -247,7 +264,7 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, vpx_codec_enc_cfg_t cfg, - struct vp9_extracfg vp8_cfg) { + struct vp9_extracfg vp9_cfg) { oxcf->version = cfg.g_profile; oxcf->width = cfg.g_w; oxcf->height = cfg.g_h; @@ -272,30 +289,25 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, } if (cfg.g_pass == VPX_RC_FIRST_PASS) { - oxcf->allow_lag = 0; oxcf->lag_in_frames = 0; } else { - oxcf->allow_lag = (cfg.g_lag_in_frames) > 0; oxcf->lag_in_frames = cfg.g_lag_in_frames; } - // VBR only supported for now. 
- // CBR code has been deprectated for experimental phase. - // CQ mode not yet tested - oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK; + oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK; if (cfg.rc_end_usage == VPX_CQ) - oxcf->end_usage = USAGE_CONSTRAINED_QUALITY; + oxcf->end_usage = USAGE_CONSTRAINED_QUALITY; else if (cfg.rc_end_usage == VPX_Q) - oxcf->end_usage = USAGE_CONSTANT_QUALITY; + oxcf->end_usage = USAGE_CONSTANT_QUALITY; else if (cfg.rc_end_usage == VPX_CBR) oxcf->end_usage = USAGE_STREAM_FROM_SERVER; oxcf->target_bandwidth = cfg.rc_target_bitrate; - oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct; + oxcf->rc_max_intra_bitrate_pct = vp9_cfg.rc_max_intra_bitrate_pct; oxcf->best_allowed_q = cfg.rc_min_quantizer; oxcf->worst_allowed_q = cfg.rc_max_quantizer; - oxcf->cq_level = vp8_cfg.cq_level; + oxcf->cq_level = vp9_cfg.cq_level; oxcf->fixed_q = -1; oxcf->under_shoot_pct = cfg.rc_undershoot_pct; @@ -316,35 +328,52 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, // oxcf->kf_min_dist = cfg.kf_min_dis; oxcf->key_freq = cfg.kf_max_dist; - // oxcf->delete_first_pass_file = cfg.g_delete_firstpassfile; - // strcpy(oxcf->first_pass_file, cfg.g_firstpass_file); - - oxcf->cpu_used = vp8_cfg.cpu_used; - oxcf->encode_breakout = vp8_cfg.static_thresh; - oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref; - oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity; - oxcf->sharpness = vp8_cfg.sharpness; + oxcf->cpu_used = vp9_cfg.cpu_used; + oxcf->encode_breakout = vp9_cfg.static_thresh; + oxcf->play_alternate = vp9_cfg.enable_auto_alt_ref; + oxcf->noise_sensitivity = vp9_cfg.noise_sensitivity; + oxcf->sharpness = vp9_cfg.sharpness; oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in; - oxcf->output_pkt_list = vp8_cfg.pkt_list; + oxcf->output_pkt_list = vp9_cfg.pkt_list; - oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames; - oxcf->arnr_strength = vp8_cfg.arnr_strength; - oxcf->arnr_type = vp8_cfg.arnr_type; + oxcf->arnr_max_frames = vp9_cfg.arnr_max_frames; + oxcf->arnr_strength = vp9_cfg.arnr_strength; + oxcf->arnr_type = vp9_cfg.arnr_type; - oxcf->tuning = vp8_cfg.tuning; + oxcf->tuning = vp9_cfg.tuning; - oxcf->tile_columns = vp8_cfg.tile_columns; - oxcf->tile_rows = vp8_cfg.tile_rows; + oxcf->tile_columns = vp9_cfg.tile_columns; + oxcf->tile_rows = vp9_cfg.tile_rows; - oxcf->lossless = vp8_cfg.lossless; + oxcf->lossless = vp9_cfg.lossless; oxcf->error_resilient_mode = cfg.g_error_resilient; - oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode; + oxcf->frame_parallel_decoding_mode = vp9_cfg.frame_parallel_decoding_mode; - oxcf->aq_mode = vp8_cfg.aq_mode; + oxcf->aq_mode = vp9_cfg.aq_mode; oxcf->ss_number_layers = cfg.ss_number_layers; + + if (oxcf->ss_number_layers > 1) { + memcpy(oxcf->ss_target_bitrate, cfg.ss_target_bitrate, + sizeof(cfg.ss_target_bitrate)); + } else if (oxcf->ss_number_layers == 1) { + oxcf->ss_target_bitrate[0] = oxcf->target_bandwidth; + } + + oxcf->ts_number_layers = cfg.ts_number_layers; + + if (oxcf->ts_number_layers > 1) { + memcpy(oxcf->ts_target_bitrate, cfg.ts_target_bitrate, + sizeof(cfg.ts_target_bitrate)); + memcpy(oxcf->ts_rate_decimator, cfg.ts_rate_decimator, + sizeof(cfg.ts_rate_decimator)); + } else if (oxcf->ts_number_layers == 1) { + oxcf->ts_target_bitrate[0] = (int)oxcf->target_bandwidth; + oxcf->ts_rate_decimator[0] = 1; + } + /* printf("Current VP9 Settings: \n"); printf("target_bandwidth: %d\n", oxcf->target_bandwidth); @@ -352,7 +381,6 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, 
printf("sharpness: %d\n", oxcf->sharpness); printf("cpu_used: %d\n", oxcf->cpu_used); printf("Mode: %d\n", oxcf->mode); - // printf("delete_first_pass_file: %d\n", oxcf->delete_first_pass_file); printf("auto_key: %d\n", oxcf->auto_key); printf("key_freq: %d\n", oxcf->key_freq); printf("end_usage: %d\n", oxcf->end_usage); @@ -367,7 +395,6 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias); printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section); printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section); - printf("allow_lag: %d\n", oxcf->allow_lag); printf("lag_in_frames: %d\n", oxcf->lag_in_frames); printf("play_alternate: %d\n", oxcf->play_alternate); printf("Version: %d\n", oxcf->Version); @@ -396,7 +423,7 @@ static vpx_codec_err_t vp9e_set_config(vpx_codec_alg_priv_t *ctx, res = validate_config(ctx, cfg, &ctx->vp8_cfg); - if (!res) { + if (res == VPX_CODEC_OK) { ctx->cfg = *cfg; set_vp9e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg); vp9_change_config(ctx->cpi, &ctx->oxcf); @@ -416,8 +443,7 @@ static vpx_codec_err_t get_param(vpx_codec_alg_priv_t *ctx, #define MAP(id, var) case id: *(RECAST(id, arg)) = var; break - if (!arg) - return VPX_CODEC_INVALID_PARAM; + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; switch (ctrl_id) { MAP(VP8E_GET_LAST_QUANTIZER, vp9_get_quantizer(ctx->cpi)); @@ -459,7 +485,7 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx, res = validate_config(ctx, &ctx->cfg, &xcfg); - if (!res) { + if (res == VPX_CODEC_OK) { ctx->vp8_cfg = xcfg; set_vp9e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg); vp9_change_config(ctx->cpi, &ctx->oxcf); @@ -478,12 +504,10 @@ static vpx_codec_err_t vp9e_common_init(vpx_codec_ctx_t *ctx) { VP9_PTR optr; - if (!ctx->priv) { + if (ctx->priv == NULL) { priv = calloc(1, sizeof(struct vpx_codec_alg_priv)); - if (!priv) { - return VPX_CODEC_MEM_ERROR; - } + if (priv == NULL) return VPX_CODEC_MEM_ERROR; ctx->priv = &priv->base; ctx->priv->sz = sizeof(*ctx->priv); @@ -520,21 +544,19 @@ static vpx_codec_err_t vp9e_common_init(vpx_codec_ctx_t *ctx) { priv->cx_data = malloc(priv->cx_data_sz); - if (!priv->cx_data) { - return VPX_CODEC_MEM_ERROR; - } + if (priv->cx_data == NULL) return VPX_CODEC_MEM_ERROR; vp9_initialize_enc(); res = validate_config(priv, &priv->cfg, &priv->vp8_cfg); - if (!res) { + if (res == VPX_CODEC_OK) { set_vp9e_config(&ctx->priv->alg_priv->oxcf, ctx->priv->alg_priv->cfg, ctx->priv->alg_priv->vp8_cfg); optr = vp9_create_compressor(&ctx->priv->alg_priv->oxcf); - if (!optr) + if (optr == NULL) res = VPX_CODEC_MEM_ERROR; else ctx->priv->alg_priv->cpi = optr; @@ -621,7 +643,7 @@ static int write_superframe_index(vpx_codec_alg_priv_t *ctx) { *x++ = marker; for (i = 0; i < ctx->pending_frame_count; i++) { - int this_sz = ctx->pending_frame_sizes[i]; + unsigned int this_sz = (unsigned int)ctx->pending_frame_sizes[i]; for (j = 0; j <= mag; j++) { *x++ = this_sz & 0xff; @@ -702,7 +724,7 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx, } /* Initialize the encoder instance on the first frame. 
*/ - if (!res && ctx->cpi) { + if (res == VPX_CODEC_OK && ctx->cpi != NULL) { unsigned int lib_flags; YV12_BUFFER_CONFIG sd; int64_t dst_time_stamp, dst_end_time_stamp; @@ -762,8 +784,8 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx, VP9_COMP *cpi = (VP9_COMP *)ctx->cpi; /* Pack invisible frames with the next visible frame */ - if (!cpi->common.show_frame) { - if (!ctx->pending_cx_data) + if (cpi->common.show_frame == 0) { + if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data; ctx->pending_cx_data_sz += size; ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; @@ -788,7 +810,7 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx, if (lib_flags & FRAMEFLAGS_KEY) pkt.data.frame.flags |= VPX_FRAME_IS_KEY; - if (!cpi->common.show_frame) { + if (cpi->common.show_frame == 0) { pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE; // This timestamp should be as close as possible to the @@ -862,10 +884,9 @@ static const vpx_codec_cx_pkt_t *vp9e_get_cxdata(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t vp9e_set_reference(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { - vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); + vpx_ref_frame_t *frame = va_arg(args, vpx_ref_frame_t *); - if (data) { - vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; + if (frame != NULL) { YV12_BUFFER_CONFIG sd; image2yuvconfig(&frame->img, &sd); @@ -880,10 +901,9 @@ static vpx_codec_err_t vp9e_set_reference(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { - vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); + vpx_ref_frame_t *frame = va_arg(args, vpx_ref_frame_t *); - if (data) { - vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; + if (frame != NULL) { YV12_BUFFER_CONFIG sd; image2yuvconfig(&frame->img, &sd); @@ -898,13 +918,13 @@ static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { - vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); + vp9_ref_frame_t *frame = va_arg(args, vp9_ref_frame_t *); - if (data) { + if (frame != NULL) { YV12_BUFFER_CONFIG* fb; - vp9_get_reference_enc(ctx->cpi, data->idx, &fb); - yuvconfig2image(&data->img, fb, NULL); + vp9_get_reference_enc(ctx->cpi, frame->idx, &fb); + yuvconfig2image(&frame->img, fb, NULL); return VPX_CODEC_OK; } else { return VPX_CODEC_INVALID_PARAM; @@ -915,11 +935,11 @@ static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { #if CONFIG_VP9_POSTPROC - vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); + vp8_postproc_cfg_t *config = va_arg(args, vp8_postproc_cfg_t *); (void)ctr_id; - if (data) { - ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data); + if (config != NULL) { + ctx->preview_ppcfg = *config; return VPX_CODEC_OK; } else { return VPX_CODEC_INVALID_PARAM; @@ -993,20 +1013,14 @@ static vpx_codec_err_t vp9e_set_activemap(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { - vpx_scaling_mode_t *data = va_arg(args, vpx_scaling_mode_t *); + vpx_scaling_mode_t *scalemode = va_arg(args, vpx_scaling_mode_t *); - if (data) { + if (scalemode != NULL) { int res; - vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data; res = vp9_set_internal_size(ctx->cpi, - (VPX_SCALING)scalemode.h_scaling_mode, - (VPX_SCALING)scalemode.v_scaling_mode); - - if (!res) { - return VPX_CODEC_OK; - } else { - return 
VPX_CODEC_INVALID_PARAM; - } + (VPX_SCALING)scalemode->h_scaling_mode, + (VPX_SCALING)scalemode->v_scaling_mode); + return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM; } else { return VPX_CODEC_INVALID_PARAM; } @@ -1016,32 +1030,54 @@ static vpx_codec_err_t vp9e_set_svc(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { int data = va_arg(args, int); vp9_set_svc(ctx->cpi, data); + // CBR mode for SVC with both temporal and spatial layers not yet supported. + if (data == 1 && + ctx->cfg.rc_end_usage == VPX_CBR && + ctx->cfg.ss_number_layers > 1 && + ctx->cfg.ts_number_layers > 1) { + return VPX_CODEC_INVALID_PARAM; + } + return VPX_CODEC_OK; +} + +static vpx_codec_err_t vp9e_set_svc_layer_id(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { + vpx_svc_layer_id_t *data = va_arg(args, vpx_svc_layer_id_t *); + VP9_COMP *cpi = (VP9_COMP *)ctx->cpi; + cpi->svc.spatial_layer_id = data->spatial_layer_id; + cpi->svc.temporal_layer_id = data->temporal_layer_id; + // Checks on valid layer_id input. + if (cpi->svc.temporal_layer_id < 0 || + cpi->svc.temporal_layer_id >= (int)ctx->cfg.ts_number_layers) { + return VPX_CODEC_INVALID_PARAM; + } + if (cpi->svc.spatial_layer_id < 0 || + cpi->svc.spatial_layer_id >= (int)ctx->cfg.ss_number_layers) { + return VPX_CODEC_INVALID_PARAM; + } return VPX_CODEC_OK; } static vpx_codec_err_t vp9e_set_svc_parameters(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { - vpx_svc_parameters_t *data = va_arg(args, vpx_svc_parameters_t *); VP9_COMP *cpi = (VP9_COMP *)ctx->cpi; - vpx_svc_parameters_t params; + vpx_svc_parameters_t *params = va_arg(args, vpx_svc_parameters_t *); - if (data == NULL) { - return VPX_CODEC_INVALID_PARAM; - } + if (params == NULL) return VPX_CODEC_INVALID_PARAM; - params = *(vpx_svc_parameters_t *)data; + cpi->svc.spatial_layer_id = params->spatial_layer; + cpi->svc.temporal_layer_id = params->temporal_layer; - cpi->current_layer = params.layer; - cpi->lst_fb_idx = params.lst_fb_idx; - cpi->gld_fb_idx = params.gld_fb_idx; - cpi->alt_fb_idx = params.alt_fb_idx; + cpi->lst_fb_idx = params->lst_fb_idx; + cpi->gld_fb_idx = params->gld_fb_idx; + cpi->alt_fb_idx = params->alt_fb_idx; - if (vp9_set_size_literal(ctx->cpi, params.width, params.height) != 0) { + if (vp9_set_size_literal(ctx->cpi, params->width, params->height) != 0) return VPX_CODEC_INVALID_PARAM; - } - ctx->cfg.rc_max_quantizer = params.max_quantizer; - ctx->cfg.rc_min_quantizer = params.min_quantizer; + ctx->cfg.rc_max_quantizer = params->max_quantizer; + ctx->cfg.rc_min_quantizer = params->min_quantizer; set_vp9e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg); vp9_change_config(ctx->cpi, &ctx->oxcf); @@ -1080,6 +1116,7 @@ static vpx_codec_ctrl_fn_map_t vp9e_ctf_maps[] = { {VP9_GET_REFERENCE, get_reference}, {VP9E_SET_SVC, vp9e_set_svc}, {VP9E_SET_SVC_PARAMETERS, vp9e_set_svc_parameters}, + {VP9E_SET_SVC_LAYER_ID, vp9e_set_svc_layer_id}, { -1, NULL}, }; @@ -1130,9 +1167,13 @@ static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = { 9999, /* kf_max_dist */ VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */ - + {0}, /* ss_target_bitrate */ + 1, /* ts_number_layers */ + {0}, /* ts_target_bitrate */ + {0}, /* ts_rate_decimator */ + 0, /* ts_periodicity */ + {0}, /* ts_layer_id */ #if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION) - 1, /* g_delete_first_pass_file */ "vp8.fpf" /* first pass filename */ #endif } diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 92c6cd20c..b85e17237 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -15,6 +15,7 @@ #include 
"vpx/vp8dx.h" #include "vpx/internal/vpx_codec_internal.h" #include "./vpx_version.h" +#include "vp9/common/vp9_frame_buffers.h" #include "vp9/decoder/vp9_onyxd.h" #include "vp9/decoder/vp9_onyxd_int.h" #include "vp9/decoder/vp9_read_bit_buffer.h" @@ -59,6 +60,11 @@ struct vpx_codec_alg_priv { int img_setup; int img_avail; int invert_tile_order; + + // External frame buffer info to save for VP9 common. + void *ext_priv; // Private data associated with the external frame buffers. + vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb; + vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb; }; static unsigned long priv_sz(const vpx_codec_dec_cfg_t *si, @@ -148,14 +154,12 @@ static vpx_codec_err_t vp9_peek_si(const uint8_t *data, unsigned int data_sz, { struct vp9_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL }; const int frame_marker = vp9_rb_read_literal(&rb, 2); - const int version = vp9_rb_read_bit(&rb) | (vp9_rb_read_bit(&rb) << 1); + const int version = vp9_rb_read_bit(&rb); + (void) vp9_rb_read_bit(&rb); // unused version bit + if (frame_marker != VP9_FRAME_MARKER) return VPX_CODEC_UNSUP_BITSTREAM; -#if CONFIG_NON420 if (version > 1) return VPX_CODEC_UNSUP_BITSTREAM; -#else - if (version != 0) return VPX_CODEC_UNSUP_BITSTREAM; -#endif if (vp9_rb_read_bit(&rb)) { // show an existing frame return VPX_CODEC_OK; @@ -206,7 +210,7 @@ static vpx_codec_err_t vp9_get_si(vpx_codec_alg_priv_t *ctx, ? sizeof(vp9_stream_info_t) : sizeof(vpx_codec_stream_info_t); memcpy(si, &ctx->si, sz); - si->sz = sz; + si->sz = (unsigned int)sz; return VPX_CODEC_OK; } @@ -291,10 +295,31 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, ctx->postproc_cfg.noise_level = 0; } - if (!optr) + if (!optr) { res = VPX_CODEC_ERROR; - else + } else { + VP9D_COMP *const pbi = (VP9D_COMP*)optr; + VP9_COMMON *const cm = &pbi->common; + + // Set index to not initialized. 
+ cm->new_fb_idx = -1; + + if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { + cm->get_fb_cb = ctx->get_ext_fb_cb; + cm->release_fb_cb = ctx->release_ext_fb_cb; + cm->cb_priv = ctx->ext_priv; + } else { + cm->get_fb_cb = vp9_get_frame_buffer; + cm->release_fb_cb = vp9_release_frame_buffer; + + if (vp9_alloc_internal_frame_buffers(&cm->int_frame_buffers)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to initialize internal frame buffers"); + cm->cb_priv = &cm->int_frame_buffers; + } + ctx->pbi = optr; + } } ctx->decoder_init = 1; @@ -332,7 +357,11 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, if (!res && 0 == vp9_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, &flags)) { + VP9D_COMP *const pbi = (VP9D_COMP*)ctx->pbi; + VP9_COMMON *const cm = &pbi->common; yuvconfig2image(&ctx->img, &sd, user_priv); + + ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; ctx->img_avail = 1; } } @@ -429,7 +458,7 @@ static vpx_codec_err_t vp9_decode(vpx_codec_alg_priv_t *ctx, while (data_start < data_end && *data_start == 0) data_start++; - data_sz = data_end - data_start; + data_sz = (unsigned int)(data_end - data_start); } while (data_start < data_end); return res; } @@ -452,6 +481,24 @@ static vpx_image_t *vp9_get_frame(vpx_codec_alg_priv_t *ctx, return img; } +static vpx_codec_err_t vp9_set_fb_fn( + vpx_codec_alg_priv_t *ctx, + vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { + if (cb_get == NULL || cb_release == NULL) { + return VPX_CODEC_INVALID_PARAM; + } else if (ctx->pbi == NULL) { + // If the decoder has already been initialized, do not accept changes to + // the frame buffer functions. + ctx->get_ext_fb_cb = cb_get; + ctx->release_ext_fb_cb = cb_release; + ctx->ext_priv = cb_priv; + return VPX_CODEC_OK; + } + + return VPX_CODEC_ERROR; +} + static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t *ctx, vpx_codec_mmap_t *mmap, vpx_codec_iter_t *iter) { @@ -685,7 +732,8 @@ static vpx_codec_ctrl_fn_map_t ctf_maps[] = { CODEC_INTERFACE(vpx_codec_vp9_dx) = { "WebM Project VP9 Decoder" VERSION_STRING, VPX_CODEC_INTERNAL_ABI_VERSION, - VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC, + VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC | + VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER, /* vpx_codec_caps_t caps; */ vp9_init, /* vpx_codec_init_fn_t init; */ vp9_destroy, /* vpx_codec_destroy_fn_t destroy; */ @@ -697,6 +745,7 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = { vp9_get_si, /* vpx_codec_get_si_fn_t get_si; */ vp9_decode, /* vpx_codec_decode_fn_t decode; */ vp9_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */ + vp9_set_fb_fn, /* vpx_codec_set_fb_fn_t set_fb_fn; */ }, { // NOLINT /* encoder functions */ diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 63003b9c2..6679f89be 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -19,7 +19,6 @@ VP9_CX_SRCS-yes += vp9_cx_iface.c VP9_CX_SRCS-yes += encoder/vp9_bitstream.c VP9_CX_SRCS-yes += encoder/vp9_dct.c -VP9_CX_SRCS-yes += encoder/vp9_dct.h VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h VP9_CX_SRCS-yes += encoder/vp9_encodemb.c @@ -39,7 +38,6 @@ VP9_CX_SRCS-yes += encoder/vp9_lookahead.c VP9_CX_SRCS-yes += encoder/vp9_lookahead.h VP9_CX_SRCS-yes += encoder/vp9_mcomp.h VP9_CX_SRCS-yes += encoder/vp9_onyx_int.h -VP9_CX_SRCS-yes += encoder/vp9_psnr.h VP9_CX_SRCS-yes += encoder/vp9_quantize.h VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h VP9_CX_SRCS-yes += encoder/vp9_rdopt.h @@ -51,12 +49,11 @@ VP9_CX_SRCS-yes += 
encoder/vp9_mcomp.c VP9_CX_SRCS-yes += encoder/vp9_onyx_if.c VP9_CX_SRCS-yes += encoder/vp9_picklpf.c VP9_CX_SRCS-yes += encoder/vp9_picklpf.h -VP9_CX_SRCS-yes += encoder/vp9_psnr.c VP9_CX_SRCS-yes += encoder/vp9_quantize.c VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c VP9_CX_SRCS-yes += encoder/vp9_rdopt.c VP9_CX_SRCS-yes += encoder/vp9_pickmode.c -VP9_CX_SRCS-yes += encoder/vp9_sad_c.c +VP9_CX_SRCS-yes += encoder/vp9_sad.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.h VP9_CX_SRCS-yes += encoder/vp9_subexp.c @@ -66,7 +63,7 @@ VP9_CX_SRCS-yes += encoder/vp9_resize.h VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c VP9_CX_SRCS-yes += encoder/vp9_tokenize.c VP9_CX_SRCS-yes += encoder/vp9_treewriter.c -VP9_CX_SRCS-yes += encoder/vp9_variance_c.c +VP9_CX_SRCS-yes += encoder/vp9_variance.c VP9_CX_SRCS-yes += encoder/vp9_vaq.c VP9_CX_SRCS-yes += encoder/vp9_vaq.h ifeq ($(CONFIG_VP9_POSTPROC),yes) @@ -87,10 +84,11 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm -ifeq ($(USE_X86INC),yes) +ifeq ($(CONFIG_USE_X86INC),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm |
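[Editor's note on the external frame buffer support added in vp9_dx_iface.c] vp9_set_fb_fn accepts get/release callbacks only while ctx->pbi is still NULL, i.e. before the first decode creates the decoder instance; decode_one then wires them into VP9_COMMON (or falls back to the internal allocator from vp9_frame_buffers.c), and the returned image's fb_priv points at the buffer backing the shown frame. A minimal caller-side sketch follows; it assumes the public vpx_codec_frame_buffer_t layout (data/size/priv) and the vpx_codec_set_frame_buffer_functions() setter from vpx/vpx_frame_buffer.h and vpx/vpx_decoder.h, none of which is defined in this patch, so treat those names as assumptions. The allocator itself is purely illustrative.

#include <stdint.h>
#include <stdlib.h>

#include "vpx/vpx_decoder.h"
#include "vpx/vpx_frame_buffer.h"

// Hand the decoder a zero-initialized buffer of at least min_size bytes.
// Returns 0 on success, a negative value on allocation failure.
static int example_get_frame_buffer(void *priv, size_t min_size,
                                    vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  fb->data = (uint8_t *)calloc(min_size, 1);
  if (fb->data == NULL) return -1;
  fb->size = min_size;
  fb->priv = fb->data;  // remember the allocation so release can free it
  return 0;
}

// Called once the decoder no longer references the buffer.
static int example_release_frame_buffer(void *priv,
                                        vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  free(fb->priv);
  fb->data = NULL;
  fb->priv = NULL;
  return 0;
}

// Registration must happen after vpx_codec_dec_init() but before the first
// vpx_codec_decode() call, matching the ctx->pbi == NULL check in
// vp9_set_fb_fn() above:
//
//   vpx_codec_set_frame_buffer_functions(&codec, example_get_frame_buffer,
//                                        example_release_frame_buffer, NULL);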