diff options
66 files changed, 1656 insertions, 1168 deletions
diff --git a/build/make/configure.sh b/build/make/configure.sh index bb7ab4110..f3610218b 100755 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -1062,7 +1062,7 @@ EOF setup_gnu_toolchain add_cflags -use-msasm -use-asm add_ldflags -i-static - enabled x86_64 && add_cflags -ipo -no-prec-div -static -xSSE2 -axSSE2 + enabled x86_64 && add_cflags -ipo -static -O3 enabled x86_64 && AR=xiar case ${tune_cpu} in atom*) diff --git a/build/make/rtcd.sh b/build/make/rtcd.sh index 6cc36843b..9a8d97e32 100755 --- a/build/make/rtcd.sh +++ b/build/make/rtcd.sh @@ -290,9 +290,11 @@ static void setup_rtcd_internal(void) { $(set_function_pointers c $ALL_ARCHS) #if HAVE_DSPR2 +#if CONFIG_VP8 void dsputil_static_init(); dsputil_static_init(); #endif +#endif } #endif $(common_bottom) diff --git a/build/make/thumb.pm b/build/make/thumb.pm index 545f59f43..e1f34c1ec 100644 --- a/build/make/thumb.pm +++ b/build/make/thumb.pm @@ -47,7 +47,7 @@ sub FixThumbInstructions($$) # this is used, it's used for two subsequent load instructions, # where a hand-written version of it could merge two subsequent # add and sub instructions. - s/^(\s*)((ldr|str|pld)(ne)?)(\s+)(r\d+,)?\s*\[(\w+), -([^\]]+)\]/$1sub$4$5$7, $7, $8\n$1$2$5$6\[$7\]\n$1add$4$5$7, $7, $8/g; + s/^(\s*)((ldr|str|pld)(ne)?)(\s+)(r\d+,\s*)?\[(\w+), -([^\]]+)\]/$1sub$4$5$7, $7, $8\n$1$2$5$6\[$7\]\n$1add$4$5$7, $7, $8/g; # Convert register post indexing to a separate add instruction. 
# This converts "ldrneb r9, [r0], r2" into "ldrneb r9, [r0]", diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc index f5f6d5b3f..5fba70025 100644 --- a/test/intrapred_test.cc +++ b/test/intrapred_test.cc @@ -34,13 +34,17 @@ class IntraPredBase { } protected: - void SetupMacroblock(uint8_t *data, int block_size, int stride, + void SetupMacroblock(MACROBLOCKD *mbptr, + MODE_INFO *miptr, + uint8_t *data, + int block_size, + int stride, int num_planes) { - memset(&mb_, 0, sizeof(mb_)); - memset(&mi_, 0, sizeof(mi_)); - mb_.up_available = 1; - mb_.left_available = 1; - mb_.mode_info_context = &mi_; + mbptr_ = mbptr; + miptr_ = miptr; + mbptr_->up_available = 1; + mbptr_->left_available = 1; + mbptr_->mode_info_context = miptr_; stride_ = stride; block_size_ = block_size; num_planes_ = num_planes; @@ -63,14 +67,14 @@ class IntraPredBase { virtual void Predict(MB_PREDICTION_MODE mode) = 0; void SetLeftUnavailable() { - mb_.left_available = 0; + mbptr_->left_available = 0; for (int p = 0; p < num_planes_; p++) for (int i = -1; i < block_size_; ++i) data_ptr_[p][stride_ * i - 1] = 129; } void SetTopUnavailable() { - mb_.up_available = 0; + mbptr_->up_available = 0; for (int p = 0; p < num_planes_; p++) memset(&data_ptr_[p][-1 - stride_], 127, block_size_ + 2); } @@ -96,13 +100,13 @@ class IntraPredBase { for (int p = 0; p < num_planes_; p++) { // calculate expected DC int expected; - if (mb_.up_available || mb_.left_available) { - int sum = 0, shift = BlockSizeLog2Min1() + mb_.up_available + - mb_.left_available; - if (mb_.up_available) + if (mbptr_->up_available || mbptr_->left_available) { + int sum = 0, shift = BlockSizeLog2Min1() + mbptr_->up_available + + mbptr_->left_available; + if (mbptr_->up_available) for (int x = 0; x < block_size_; x++) sum += data_ptr_[p][x - stride_]; - if (mb_.left_available) + if (mbptr_->left_available) for (int y = 0; y < block_size_; y++) sum += data_ptr_[p][y * stride_ - 1]; expected = (sum + (1 << (shift - 1))) >> shift; @@ 
-209,8 +213,8 @@ class IntraPredBase { } } - MACROBLOCKD mb_; - MODE_INFO mi_; + MACROBLOCKD *mbptr_; + MODE_INFO *miptr_; uint8_t *data_ptr_[2]; // in the case of Y, only [0] is used int stride_; int block_size_; @@ -228,12 +232,18 @@ class IntraPredYTest : public ::testing::TestWithParam<intra_pred_y_fn_t>, protected IntraPredBase { public: static void SetUpTestCase() { + mb_ = reinterpret_cast<MACROBLOCKD*>( + vpx_memalign(32, sizeof(MACROBLOCKD))); + mi_ = reinterpret_cast<MODE_INFO*>( + vpx_memalign(32, sizeof(MODE_INFO))); data_array_ = reinterpret_cast<uint8_t*>( vpx_memalign(kDataAlignment, kDataBufferSize)); } static void TearDownTestCase() { vpx_free(data_array_); + vpx_free(mi_); + vpx_free(mb_); data_array_ = NULL; } @@ -250,12 +260,12 @@ class IntraPredYTest : public ::testing::TestWithParam<intra_pred_y_fn_t>, virtual void SetUp() { pred_fn_ = GetParam(); - SetupMacroblock(data_array_, kBlockSize, kStride, 1); + SetupMacroblock(mb_, mi_, data_array_, kBlockSize, kStride, 1); } virtual void Predict(MB_PREDICTION_MODE mode) { - mb_.mode_info_context->mbmi.mode = mode; - REGISTER_STATE_CHECK(pred_fn_(&mb_, + mbptr_->mode_info_context->mbmi.mode = mode; + REGISTER_STATE_CHECK(pred_fn_(mbptr_, data_ptr_[0] - kStride, data_ptr_[0] - 1, kStride, data_ptr_[0], kStride)); @@ -263,8 +273,12 @@ class IntraPredYTest : public ::testing::TestWithParam<intra_pred_y_fn_t>, intra_pred_y_fn_t pred_fn_; static uint8_t* data_array_; + static MACROBLOCKD * mb_; + static MODE_INFO *mi_; }; +MACROBLOCKD* IntraPredYTest::mb_ = NULL; +MODE_INFO* IntraPredYTest::mi_ = NULL; uint8_t* IntraPredYTest::data_array_ = NULL; TEST_P(IntraPredYTest, IntraPredTests) { @@ -299,12 +313,18 @@ class IntraPredUVTest : public ::testing::TestWithParam<intra_pred_uv_fn_t>, protected IntraPredBase { public: static void SetUpTestCase() { + mb_ = reinterpret_cast<MACROBLOCKD*>( + vpx_memalign(32, sizeof(MACROBLOCKD))); + mi_ = reinterpret_cast<MODE_INFO*>( + vpx_memalign(32, sizeof(MODE_INFO))); 
data_array_ = reinterpret_cast<uint8_t*>( vpx_memalign(kDataAlignment, kDataBufferSize)); } static void TearDownTestCase() { vpx_free(data_array_); + vpx_free(mi_); + vpx_free(mb_); data_array_ = NULL; } @@ -322,12 +342,12 @@ class IntraPredUVTest : public ::testing::TestWithParam<intra_pred_uv_fn_t>, virtual void SetUp() { pred_fn_ = GetParam(); - SetupMacroblock(data_array_, kBlockSize, kStride, 2); + SetupMacroblock(mb_, mi_, data_array_, kBlockSize, kStride, 2); } virtual void Predict(MB_PREDICTION_MODE mode) { - mb_.mode_info_context->mbmi.uv_mode = mode; - pred_fn_(&mb_, data_ptr_[0] - kStride, data_ptr_[1] - kStride, + mbptr_->mode_info_context->mbmi.uv_mode = mode; + pred_fn_(mbptr_, data_ptr_[0] - kStride, data_ptr_[1] - kStride, data_ptr_[0] - 1, data_ptr_[1] - 1, kStride, data_ptr_[0], data_ptr_[1], kStride); } @@ -340,8 +360,12 @@ class IntraPredUVTest : public ::testing::TestWithParam<intra_pred_uv_fn_t>, // We use 9 lines so we have one line above us for top-prediction. // [0] = U, [1] = V static uint8_t* data_array_; + static MACROBLOCKD* mb_; + static MODE_INFO* mi_; }; +MACROBLOCKD* IntraPredUVTest::mb_ = NULL; +MODE_INFO* IntraPredUVTest::mi_ = NULL; uint8_t* IntraPredUVTest::data_array_ = NULL; TEST_P(IntraPredUVTest, IntraPredTests) { diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c index c0416b7e6..b40929381 100644 --- a/vp8/common/x86/vp8_asm_stubs.c +++ b/vp8/common/x86/vp8_asm_stubs.c @@ -611,16 +611,12 @@ void vp8_sixtap_predict4x4_ssse3 for (r = 0; r < 4; r++) { - #if !(CONFIG_FAST_UNALIGNED) dst_ptr[0] = src_ptr[0]; dst_ptr[1] = src_ptr[1]; dst_ptr[2] = src_ptr[2]; dst_ptr[3] = src_ptr[3]; - #else - *(uint32_t *)dst_ptr = *(uint32_t *)src_ptr ; - #endif - dst_ptr += dst_pitch; - src_ptr += src_pixels_per_line; + dst_ptr += dst_pitch; + src_ptr += src_pixels_per_line; } } } diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c index b5a11ae34..091554a5d 100644 --- a/vp8/encoder/dct.c +++ b/vp8/encoder/dct.c @@ 
-20,10 +20,10 @@ void vp8_short_fdct4x4_c(short *input, short *output, int pitch) for (i = 0; i < 4; i++) { - a1 = ((ip[0] + ip[3])<<3); - b1 = ((ip[1] + ip[2])<<3); - c1 = ((ip[1] - ip[2])<<3); - d1 = ((ip[0] - ip[3])<<3); + a1 = ((ip[0] + ip[3]) * 8); + b1 = ((ip[1] + ip[2]) * 8); + c1 = ((ip[1] - ip[2]) * 8); + d1 = ((ip[0] - ip[3]) * 8); op[0] = a1 + b1; op[2] = a1 - b1; @@ -72,10 +72,10 @@ void vp8_short_walsh4x4_c(short *input, short *output, int pitch) for (i = 0; i < 4; i++) { - a1 = ((ip[0] + ip[2])<<2); - d1 = ((ip[1] + ip[3])<<2); - c1 = ((ip[1] - ip[3])<<2); - b1 = ((ip[0] - ip[2])<<2); + a1 = ((ip[0] + ip[2]) * 4); + d1 = ((ip[1] + ip[3]) * 4); + c1 = ((ip[1] - ip[3]) * 4); + b1 = ((ip[0] - ip[2]) * 4); op[0] = a1 + d1 + (a1!=0); op[1] = b1 + c1; diff --git a/vp9/common/arm/neon/vp9_idct16x16_neon.c b/vp9/common/arm/neon/vp9_idct16x16_neon.c index 3e3e400a4..fb7b5cdc4 100644 --- a/vp9/common/arm/neon/vp9_idct16x16_neon.c +++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c @@ -20,26 +20,28 @@ extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src, int16_t skip_adding, uint8_t *dest, int dest_stride); -extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input, +extern void vp9_short_idct16x16_10_add_neon_pass1(int16_t *input, int16_t *output, int output_stride); -extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src, +extern void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *output, int16_t *pass1Output, int16_t skip_adding, uint8_t *dest, int dest_stride); -extern void save_neon_registers(); -extern void restore_neon_registers(); +/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ +extern void vp9_push_neon(int64_t *store); +extern void vp9_pop_neon(int64_t *store); void vp9_short_idct16x16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + int64_t store_reg[8]; int16_t pass1_output[16*16] = {0}; int16_t row_idct_output[16*16] = {0}; // save d8-d15 register values. 
- save_neon_registers(); + vp9_push_neon(store_reg); /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the @@ -102,28 +104,29 @@ void vp9_short_idct16x16_add_neon(int16_t *input, dest_stride); // restore d8-d15 register values. - restore_neon_registers(); + vp9_pop_neon(store_reg); return; } -void vp9_short_idct10_16x16_add_neon(int16_t *input, +void vp9_short_idct16x16_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + int64_t store_reg[8]; int16_t pass1_output[16*16] = {0}; int16_t row_idct_output[16*16] = {0}; // save d8-d15 register values. - save_neon_registers(); + vp9_push_neon(store_reg); /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8); + vp9_short_idct16x16_10_add_neon_pass1(input, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vp9_short_idct10_16x16_add_neon_pass2(input+1, + vp9_short_idct16x16_10_add_neon_pass2(input+1, row_idct_output, pass1_output, 0, @@ -163,7 +166,7 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input, dest_stride); // restore d8-d15 register values. - restore_neon_registers(); + vp9_pop_neon(store_reg); return; } diff --git a/vp9/common/arm/neon/vp9_idct32x32_neon.c b/vp9/common/arm/neon/vp9_idct32x32_neon.c deleted file mode 100644 index ceecd6fbd..000000000 --- a/vp9/common/arm/neon/vp9_idct32x32_neon.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp9/common/vp9_common.h" - -// defined in vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm -extern void idct32_transpose_and_transform(int16_t *transpose_buffer, - int16_t *output, int16_t *input); -extern void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride); - - -// defined in vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm -extern void save_neon_registers(); -extern void restore_neon_registers(); - -void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest, - int dest_stride) { - // TODO(cd): move the creation of these buffers within the ASM file - // internal buffer used to transpose 8 lines into before transforming them - int16_t transpose_buffer[32 * 8]; - // results of the first pass (transpose and transform rows) - int16_t pass1[32 * 32]; - // results of the second pass (transpose and transform columns) - int16_t pass2[32 * 32]; - - // save register we need to preserve - save_neon_registers(); - // process rows - idct32_transpose_and_transform(transpose_buffer, pass1, input); - // process columns - // TODO(cd): do these two steps/passes within the ASM file - idct32_transpose_and_transform(transpose_buffer, pass2, pass1); - // combine and add to dest - // TODO(cd): integrate this within the last storage step of the second pass - idct32_combine_add(dest, pass2, dest_stride); - // restore register we need to preserve - restore_neon_registers(); -} - -// TODO(cd): Eliminate this file altogether when everything is in ASM file diff --git a/vp9/common/arm/neon/vp9_save_reg_neon.asm b/vp9/common/arm/neon/vp9_save_reg_neon.asm new file mode 100644 index 000000000..71c3e7077 --- /dev/null +++ b/vp9/common/arm/neon/vp9_save_reg_neon.asm @@ -0,0 +1,36 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp9_push_neon| + EXPORT |vp9_pop_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vp9_push_neon| PROC + vst1.i64 {d8, d9, d10, d11}, [r0]! + vst1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + +|vp9_pop_neon| PROC + vld1.i64 {d8, d9, d10, d11}, [r0]! + vld1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + + END + diff --git a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm index 7464e800f..df2a0526c 100644 --- a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm +++ b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm @@ -10,10 +10,8 @@ EXPORT |vp9_short_idct16x16_add_neon_pass1| EXPORT |vp9_short_idct16x16_add_neon_pass2| - EXPORT |vp9_short_idct10_16x16_add_neon_pass1| - EXPORT |vp9_short_idct10_16x16_add_neon_pass2| - EXPORT |save_neon_registers| - EXPORT |restore_neon_registers| + EXPORT |vp9_short_idct16x16_10_add_neon_pass1| + EXPORT |vp9_short_idct16x16_10_add_neon_pass2| ARM REQUIRE8 PRESERVE8 @@ -788,7 +786,7 @@ end_idct16x16_pass2 bx lr ENDP ; |vp9_short_idct16x16_add_neon_pass2| -;void |vp9_short_idct10_16x16_add_neon_pass1|(int16_t *input, +;void |vp9_short_idct16x16_10_add_neon_pass1|(int16_t *input, ; int16_t *output, int output_stride) ; ; r0 int16_t input @@ -798,7 +796,7 @@ end_idct16x16_pass2 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. 
-|vp9_short_idct10_16x16_add_neon_pass1| PROC +|vp9_short_idct16x16_10_add_neon_pass1| PROC ; TODO(hkuang): Find a better way to load the elements. ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 @@ -907,9 +905,9 @@ end_idct16x16_pass2 vst1.64 {d31}, [r1], r2 bx lr - ENDP ; |vp9_short_idct10_16x16_add_neon_pass1| + ENDP ; |vp9_short_idct16x16_10_add_neon_pass1| -;void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src, +;void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src, ; int16_t *output, ; int16_t *pass1Output, ; int16_t skip_adding, @@ -926,7 +924,7 @@ end_idct16x16_pass2 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. -|vp9_short_idct10_16x16_add_neon_pass2| PROC +|vp9_short_idct16x16_10_add_neon_pass2| PROC push {r3-r9} ; TODO(hkuang): Find a better way to load the elements. @@ -1177,15 +1175,5 @@ end_idct16x16_pass2 end_idct10_16x16_pass2 pop {r3-r9} bx lr - ENDP ; |vp9_short_idct10_16x16_add_neon_pass2| -;void |save_neon_registers|() -|save_neon_registers| PROC - vpush {d8-d15} - bx lr - ENDP ; |save_registers| -;void |restore_neon_registers|() -|restore_neon_registers| PROC - vpop {d8-d15} - bx lr - ENDP ; |restore_registers| + ENDP ; |vp9_short_idct16x16_10_add_neon_pass2| END diff --git a/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm index 3a0ff608b..b5a284b5a 100644 --- a/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm +++ b/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm @@ -43,8 +43,7 @@ cospi_30_64 EQU 1606 cospi_31_64 EQU 804 - EXPORT |idct32_transpose_and_transform| - EXPORT |idct32_combine_add| + EXPORT |vp9_short_idct32x32_add_neon| ARM REQUIRE8 PRESERVE8 @@ -100,6 +99,142 @@ cospi_31_64 EQU 804 vst1.16 {$reg2}, [r1] MEND ; -------------------------------------------------------------------------- + ; 
Combine-add results with current destination content + ; q6-q9 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_CENTER_RESULTS + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d8}, [r10], r2 + vld1.s16 {d11}, [r9], r11 + vld1.s16 {d9}, [r10] + vld1.s16 {d10}, [r9] + ; ROUND_POWER_OF_TWO + vrshr.s16 q7, q7, #6 + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + vrshr.s16 q6, q6, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q7, q7, d9 + vaddw.u8 q8, q8, d10 + vaddw.u8 q9, q9, d11 + vaddw.u8 q6, q6, d8 + ; clip pixel + vqmovun.s16 d9, q7 + vqmovun.s16 d10, q8 + vqmovun.s16 d11, q9 + vqmovun.s16 d8, q6 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d9}, [r10], r11 + vst1.16 {d10}, [r9], r2 + vst1.16 {d8}, [r10] + vst1.16 {d11}, [r9] + ; update pointers (by dest_stride * 2) + sub r9, r9, r2, lsl #1 + add r10, r10, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q6-q9 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_CENTER_RESULTS_LAST + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d8}, [r10], r2 + vld1.s16 {d11}, [r9], r11 + vld1.s16 {d9}, [r10] + vld1.s16 {d10}, [r9] + ; ROUND_POWER_OF_TWO + vrshr.s16 q7, q7, #6 + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + vrshr.s16 q6, q6, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q7, q7, d9 + vaddw.u8 q8, q8, d10 + vaddw.u8 q9, q9, d11 + vaddw.u8 q6, q6, d8 + ; clip pixel + vqmovun.s16 d9, q7 + vqmovun.s16 d10, q8 + vqmovun.s16 d11, q9 + vqmovun.s16 d8, q6 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d9}, [r10], r11 + vst1.16 {d10}, [r9], r2 + vst1.16 {d8}, [r10]! + vst1.16 {d11}, [r9]! 
+ ; update pointers (by dest_stride * 2) + sub r9, r9, r2, lsl #1 + add r10, r10, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q4-q7 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_EXTREME_RESULTS + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d4}, [r7], r2 + vld1.s16 {d7}, [r6], r11 + vld1.s16 {d5}, [r7] + vld1.s16 {d6}, [r6] + ; ROUND_POWER_OF_TWO + vrshr.s16 q5, q5, #6 + vrshr.s16 q6, q6, #6 + vrshr.s16 q7, q7, #6 + vrshr.s16 q4, q4, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q5, q5, d5 + vaddw.u8 q6, q6, d6 + vaddw.u8 q7, q7, d7 + vaddw.u8 q4, q4, d4 + ; clip pixel + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + vqmovun.s16 d4, q4 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d5}, [r7], r11 + vst1.16 {d6}, [r6], r2 + vst1.16 {d7}, [r6] + vst1.16 {d4}, [r7] + ; update pointers (by dest_stride * 2) + sub r6, r6, r2, lsl #1 + add r7, r7, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q4-q7 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_EXTREME_RESULTS_LAST + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d4}, [r7], r2 + vld1.s16 {d7}, [r6], r11 + vld1.s16 {d5}, [r7] + vld1.s16 {d6}, [r6] + ; ROUND_POWER_OF_TWO + vrshr.s16 q5, q5, #6 + vrshr.s16 q6, q6, #6 + vrshr.s16 q7, q7, #6 + vrshr.s16 q4, q4, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q5, q5, d5 + vaddw.u8 q6, q6, d6 + vaddw.u8 q7, q7, d7 + vaddw.u8 q4, q4, d4 + ; clip pixel + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + vqmovun.s16 d4, q4 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d5}, [r7], r11 + vst1.16 {d6}, [r6], r2 + vst1.16 {d7}, [r6]! + vst1.16 {d4}, [r7]! 
+ ; update pointers (by dest_stride * 2) + sub r6, r6, r2, lsl #1 + add r7, r7, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- ; Touches q8-q12, q15 (q13-q14 are preserved) ; valid output registers are anything but q8-q11 MACRO @@ -110,12 +245,12 @@ cospi_31_64 EQU 804 ; additions/substractions before the multiplies. ; generate the constants ; generate scalar constants - mov r3, #$first_constant & 0xFF00 - add r3, #$first_constant & 0x00FF + mov r8, #$first_constant & 0xFF00 mov r12, #$second_constant & 0xFF00 + add r8, #$first_constant & 0x00FF add r12, #$second_constant & 0x00FF ; generate vector constants - vdup.16 d30, r3 + vdup.16 d30, r8 vdup.16 d31, r12 ; (used) two for inputs (regA-regD), one for constants (q15) ; do some multiplications (ordered for maximum latency hiding) @@ -153,15 +288,22 @@ cospi_31_64 EQU 804 MEND ; -------------------------------------------------------------------------- -;void idct32_transpose_and_transform(int16_t *transpose_buffer, int16_t *output, int16_t *input); +;void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest, int dest_stride); ; -; r0 int16_t *transpose_buffer -; r1 int16_t *output -; r2 int16_t *input) -; TODO(cd): have more logical parameter ordering but this issue will disappear -; when functions are combined. +; r0 int16_t *input, +; r1 uint8_t *dest, +; r2 int dest_stride) +; loop counters +; r4 bands loop counter +; r5 pass loop counter +; r8 transpose loop counter +; combine-add pointers +; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...) +; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...) +; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...) +; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...) -|idct32_transpose_and_transform| PROC +|vp9_short_idct32x32_add_neon| PROC ; This function does one pass of idct32x32 transform. 
; ; This is done by transposing the input and then doing a 1d transform on @@ -171,43 +313,73 @@ cospi_31_64 EQU 804 ; The 1d transform is done by looping over bands of eight columns (the ; idct32_bands loop). For each band, the transform input transposition ; is done on demand, one band of four 8x8 matrices at a time. The four - ; matrices are trsnposed by pairs (the idct32_transpose_pair loop). - push {r4} - mov r4, #0 ; initialize bands loop counter + ; matrices are transposed by pairs (the idct32_transpose_pair loop). + push {r4-r11} + vpush {d8-d15} + ; stack operation + ; internal buffer used to transpose 8 lines into before transforming them + ; int16_t transpose_buffer[32 * 8]; + ; at sp + [4096, 4607] + ; results of the first pass (transpose and transform rows) + ; int16_t pass1[32 * 32]; + ; at sp + [0, 2047] + ; results of the second pass (transpose and transform columns) + ; int16_t pass2[32 * 32]; + ; at sp + [2048, 4095] + sub sp, sp, #512+2048+2048 + + ; r6 = dest + 31 * dest_stride + ; r7 = dest + 0 * dest_stride + ; r9 = dest + 15 * dest_stride + ; r10 = dest + 16 * dest_stride + rsb r6, r2, r2, lsl #5 + rsb r9, r2, r2, lsl #4 + add r10, r1, r2, lsl #4 + mov r7, r1 + add r6, r6, r1 + add r9, r9, r1 + ; r11 = -dest_stride + neg r11, r2 + ; r3 = input + mov r3, r0 + ; parameters for first pass + ; r0 = transpose_buffer[32 * 8] + add r0, sp, #4096 + ; r1 = pass1[32 * 32] + mov r1, sp + + mov r5, #0 ; initialize pass loop counter +idct32_pass_loop + mov r4, #4 ; initialize bands loop counter idct32_bands_loop - ; TODO(cd) get rid of these push/pop by properly adjusting register - ; content at end of loop - push {r0} - push {r1} - push {r2} - mov r3, #0 ; initialize transpose loop counter + mov r8, #2 ; initialize transpose loop counter idct32_transpose_pair_loop ; Load two horizontally consecutive 8x8 16bit data matrices. The first one ; into q0-q7 and the second one into q8-q15. 
There is a stride of 64, ; adjusted to 32 because of the two post-increments. - vld1.s16 {q8}, [r2]! - vld1.s16 {q0}, [r2]! - add r2, #32 - vld1.s16 {q9}, [r2]! - vld1.s16 {q1}, [r2]! - add r2, #32 - vld1.s16 {q10}, [r2]! - vld1.s16 {q2}, [r2]! - add r2, #32 - vld1.s16 {q11}, [r2]! - vld1.s16 {q3}, [r2]! - add r2, #32 - vld1.s16 {q12}, [r2]! - vld1.s16 {q4}, [r2]! - add r2, #32 - vld1.s16 {q13}, [r2]! - vld1.s16 {q5}, [r2]! - add r2, #32 - vld1.s16 {q14}, [r2]! - vld1.s16 {q6}, [r2]! - add r2, #32 - vld1.s16 {q15}, [r2]! - vld1.s16 {q7}, [r2]! + vld1.s16 {q8}, [r3]! + vld1.s16 {q0}, [r3]! + add r3, #32 + vld1.s16 {q9}, [r3]! + vld1.s16 {q1}, [r3]! + add r3, #32 + vld1.s16 {q10}, [r3]! + vld1.s16 {q2}, [r3]! + add r3, #32 + vld1.s16 {q11}, [r3]! + vld1.s16 {q3}, [r3]! + add r3, #32 + vld1.s16 {q12}, [r3]! + vld1.s16 {q4}, [r3]! + add r3, #32 + vld1.s16 {q13}, [r3]! + vld1.s16 {q5}, [r3]! + add r3, #32 + vld1.s16 {q14}, [r3]! + vld1.s16 {q6}, [r3]! + add r3, #32 + vld1.s16 {q15}, [r3]! + vld1.s16 {q7}, [r3]! ; Transpose the two 8x8 16bit data matrices. vswp d17, d24 @@ -255,11 +427,13 @@ idct32_transpose_pair_loop vst1.16 {q7}, [r0]! 
; increment pointers by adjusted stride (not necessary for r0/out) - sub r2, r2, #8*32*2-32-16*2 + ; go back by 7*32 for the seven lines moved fully by read and add + ; go back by 32 for the eigth line only read + ; advance by 16*2 to go the next pair + sub r3, r3, #7*32*2 + 32 - 16*2 ; transpose pair loop processing - add r3, r3, #1 - cmp r3, #1 - ble idct32_transpose_pair_loop + subs r8, r8, #1 + bne idct32_transpose_pair_loop ; restore r0/input to its original value sub r0, r0, #32*8*2 @@ -815,21 +989,26 @@ idct32_transpose_pair_loop vadd.s16 q9, q5, q0 vsub.s16 q6, q5, q0 vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 17, 17, 16, q7, q6 - STORE_IN_OUTPUT 16, 15, 14, q9, q8 + + cmp r5, #0 + bgt idct32_bands_end_2nd_pass + +idct32_bands_end_1st_pass + STORE_IN_OUTPUT 17, 16, 17, q6, q7 + STORE_IN_OUTPUT 17, 14, 15, q8, q9 ; -------------------------------------------------------------------------- ; part of final stage ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; ;output[30 * 32] = step1b[1][i] - step1b[30][i]; ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 14, 30, 31, q0, q1 + LOAD_FROM_OUTPUT 15, 30, 31, q0, q1 vadd.s16 q4, q2, q1 vadd.s16 q5, q3, q0 vsub.s16 q6, q3, q0 vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 31, 31, 30, q7, q6 - STORE_IN_OUTPUT 30, 0, 1, q4, q5 + STORE_IN_OUTPUT 31, 30, 31, q6, q7 + STORE_IN_OUTPUT 31, 0, 1, q4, q5 ; -------------------------------------------------------------------------- ; part of stage 7 ;step1[2] = step1b[2][i] + step1b[13][i]; @@ -848,25 +1027,25 @@ idct32_transpose_pair_loop ;output[18 * 32] = step1b[13][i] - step1b[18][i]; ;output[19 * 32] = step1b[12][i] - step1b[19][i]; LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q6, q4, q1 - vadd.s16 q7, q5, q0 - vsub.s16 q8, q5, q0 - vsub.s16 q9, q4, q1 - STORE_IN_OUTPUT 19, 19, 18, q9, q8 - STORE_IN_OUTPUT 18, 13, 12, q7, q6 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + 
STORE_IN_OUTPUT 19, 18, 19, q6, q7 + STORE_IN_OUTPUT 19, 12, 13, q8, q9 ; -------------------------------------------------------------------------- ; part of final stage ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; ;output[28 * 32] = step1b[3][i] - step1b[28][i]; ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 12, 28, 29, q0, q1 + LOAD_FROM_OUTPUT 13, 28, 29, q0, q1 vadd.s16 q4, q2, q1 vadd.s16 q5, q3, q0 vsub.s16 q6, q3, q0 vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 29, 29, 28, q7, q6 - STORE_IN_OUTPUT 28, 2, 3, q4, q5 + STORE_IN_OUTPUT 29, 28, 29, q6, q7 + STORE_IN_OUTPUT 29, 2, 3, q4, q5 ; -------------------------------------------------------------------------- ; part of stage 7 ;step1[4] = step1b[4][i] + step1b[11][i]; @@ -885,25 +1064,25 @@ idct32_transpose_pair_loop ;output[20 * 32] = step1b[11][i] - step1b[20][i]; ;output[21 * 32] = step1b[10][i] - step1b[21][i]; LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q6, q4, q1 - vadd.s16 q7, q5, q0 - vsub.s16 q8, q5, q0 - vsub.s16 q9, q4, q1 - STORE_IN_OUTPUT 21, 21, 20, q9, q8 - STORE_IN_OUTPUT 20, 11, 10, q7, q6 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 21, 20, 21, q6, q7 + STORE_IN_OUTPUT 21, 10, 11, q8, q9 ; -------------------------------------------------------------------------- ; part of final stage ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; ;output[26 * 32] = step1b[5][i] - step1b[26][i]; ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 10, 26, 27, q0, q1 + LOAD_FROM_OUTPUT 11, 26, 27, q0, q1 vadd.s16 q4, q2, q1 vadd.s16 q5, q3, q0 vsub.s16 q6, q3, q0 vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 27, 27, 26, q7, q6 - STORE_IN_OUTPUT 26, 4, 5, q4, q5 + STORE_IN_OUTPUT 27, 26, 27, q6, q7 + STORE_IN_OUTPUT 27, 4, 5, q4, q5 ; -------------------------------------------------------------------------- ; part of 
stage 7 ;step1[6] = step1b[6][i] + step1b[9][i]; @@ -922,92 +1101,199 @@ idct32_transpose_pair_loop ;output[22 * 32] = step1b[9][i] - step1b[22][i]; ;output[23 * 32] = step1b[8][i] - step1b[23][i]; LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q6, q4, q1 - vadd.s16 q7, q5, q0 - vsub.s16 q8, q5, q0 - vsub.s16 q9, q4, q1 - STORE_IN_OUTPUT 23, 23, 22, q9, q8 - STORE_IN_OUTPUT 22, 9, 8, q7, q6 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 23, 22, 23, q6, q7 + STORE_IN_OUTPUT 23, 8, 9, q8, q9 ; -------------------------------------------------------------------------- ; part of final stage ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; ;output[24 * 32] = step1b[7][i] - step1b[24][i]; ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 8, 24, 25, q0, q1 + LOAD_FROM_OUTPUT 9, 24, 25, q0, q1 vadd.s16 q4, q2, q1 vadd.s16 q5, q3, q0 vsub.s16 q6, q3, q0 vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 25, 25, 24, q7, q6 - STORE_IN_OUTPUT 24, 6, 7, q4, q5 - ; -------------------------------------------------------------------------- + STORE_IN_OUTPUT 25, 24, 25, q6, q7 + STORE_IN_OUTPUT 25, 6, 7, q4, q5 - ; TODO(cd) get rid of these push/pop by properly adjusting register - ; content at end of loop - pop {r2} - pop {r1} - pop {r0} - add r1, r1, #8*2 - add r2, r2, #8*32*2 + ; restore r0 by removing the last offset from the last + ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 + sub r0, r0, #24*8*2 + ; restore r1 by removing the last offset from the last + ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2 + ; advance by 8 columns => 8*2 + sub r1, r1, #7*32*2 - 8*2 + ; advance by 8 lines (8*32*2) + ; go back by the two pairs from the loop (32*2) + add r3, r3, #8*32*2 - 32*2 ; bands loop processing - add r4, r4, #1 - cmp r4, #3 - ble idct32_bands_loop + subs r4, r4, #1 + bne idct32_bands_loop - pop {r4} - bx lr - ENDP ; |idct32_transpose_and_transform| + ; 
parameters for second pass + ; the input of pass2 is the result of pass1. we have to remove the offset + ; of 32 columns induced by the above idct32_bands_loop + sub r3, r1, #32*2 + ; r1 = pass2[32 * 32] + add r1, sp, #2048 -;void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride); -; -; r0 uint8_t *dest -; r1 int16_t *out -; r2 int dest_stride) - -|idct32_combine_add| PROC - - mov r12, r0 ; dest pointer used for stores - sub r2, r2, #32 ; adjust the stride (remove the post-increments) - mov r3, #0 ; initialize loop counter - -idct32_combine_add_loop - ; load out[j * 32 + 0-31] - vld1.s16 {q12}, [r1]! - vld1.s16 {q13}, [r1]! - vld1.s16 {q14}, [r1]! - vld1.s16 {q15}, [r1]! - ; load dest[j * dest_stride + 0-31] - vld1.s16 {q6}, [r0]! - vld1.s16 {q7}, [r0]! - ; ROUND_POWER_OF_TWO - vrshr.s16 q12, q12, #6 - vrshr.s16 q13, q13, #6 - vrshr.s16 q14, q14, #6 - vrshr.s16 q15, q15, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q12, q12, d12 - vaddw.u8 q13, q13, d13 - vaddw.u8 q14, q14, d14 - vaddw.u8 q15, q15, d15 - ; clip pixel - vqmovun.s16 d12, q12 - vqmovun.s16 d13, q13 - vqmovun.s16 d14, q14 - vqmovun.s16 d15, q15 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {q6}, [r12]! - vst1.16 {q7}, [r12]! 
- ; increment pointers by adjusted stride (not necessary for r1/out) - add r0, r0, r2 - add r12, r12, r2 - ; loop processing - add r3, r3, #1 - cmp r3, #31 - ble idct32_combine_add_loop + ; pass loop processing + add r5, r5, #1 + B idct32_pass_loop - bx lr - ENDP ; |idct32_transpose| +idct32_bands_end_2nd_pass + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; + ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; + ;output[30 * 32] = step1b[1][i] - step1b[30][i]; + ;output[31 * 32] = step1b[0][i] - step1b[31][i]; + LOAD_FROM_OUTPUT 17, 30, 31, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[2] = step1b[2][i] + step1b[13][i]; + ;step1[3] = step1b[3][i] + step1b[12][i]; + ;step1[12] = step1b[3][i] - step1b[12][i]; + ;step1[13] = step1b[2][i] - step1b[13][i]; + LOAD_FROM_OUTPUT 31, 12, 13, q0, q1 + vadd.s16 q2, q10, q1 + vadd.s16 q3, q11, q0 + vsub.s16 q4, q11, q0 + vsub.s16 q5, q10, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[12 * 32] = step1b[12][i] + step1b[19][i]; + ;output[13 * 32] = step1b[13][i] + step1b[18][i]; + ;output[18 * 32] = step1b[13][i] - step1b[18][i]; + ;output[19 * 32] = step1b[12][i] - step1b[19][i]; + LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; + ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; + ;output[28 * 32] = step1b[3][i] - step1b[28][i]; + ;output[29 * 32] = step1b[2][i] - step1b[29][i]; + 
LOAD_FROM_OUTPUT 19, 28, 29, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[4] = step1b[4][i] + step1b[11][i]; + ;step1[5] = step1b[5][i] + step1b[10][i]; + ;step1[10] = step1b[5][i] - step1b[10][i]; + ;step1[11] = step1b[4][i] - step1b[11][i]; + LOAD_FROM_OUTPUT 29, 10, 11, q0, q1 + vadd.s16 q2, q12, q1 + vadd.s16 q3, q13, q0 + vsub.s16 q4, q13, q0 + vsub.s16 q5, q12, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[10 * 32] = step1b[10][i] + step1b[21][i]; + ;output[11 * 32] = step1b[11][i] + step1b[20][i]; + ;output[20 * 32] = step1b[11][i] - step1b[20][i]; + ;output[21 * 32] = step1b[10][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; + ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; + ;output[26 * 32] = step1b[5][i] - step1b[26][i]; + ;output[27 * 32] = step1b[4][i] - step1b[27][i]; + LOAD_FROM_OUTPUT 21, 26, 27, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[6] = step1b[6][i] + step1b[9][i]; + ;step1[7] = step1b[7][i] + step1b[8][i]; + ;step1[8] = step1b[7][i] - step1b[8][i]; + ;step1[9] = step1b[6][i] - step1b[9][i]; + LOAD_FROM_OUTPUT 27, 8, 9, q0, q1 + vadd.s16 q2, q14, q1 + vadd.s16 q3, q15, q0 + vsub.s16 q4, q15, q0 + vsub.s16 q5, q14, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + 
;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; + ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; + ;output[22 * 32] = step1b[9][i] - step1b[22][i]; + ;output[23 * 32] = step1b[8][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS_LAST + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; + ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; + ;output[24 * 32] = step1b[7][i] - step1b[24][i]; + ;output[25 * 32] = step1b[6][i] - step1b[25][i]; + LOAD_FROM_OUTPUT 23, 24, 25, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS_LAST + ; -------------------------------------------------------------------------- + ; restore pointers to their initial indices for next band pass by + ; removing/adding dest_stride * 8. The actual increment by eight + ; is taken care of within the _LAST macros. 
+ add r6, r6, r2, lsl #3 + add r9, r9, r2, lsl #3 + sub r7, r7, r2, lsl #3 + sub r10, r10, r2, lsl #3 + + ; restore r0 by removing the last offset from the last + ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 + sub r0, r0, #24*8*2 + ; restore r1 by removing the last offset from the last + ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2 + ; advance by 8 columns => 8*2 + sub r1, r1, #25*32*2 - 8*2 + ; advance by 8 lines (8*32*2) + ; go back by the two pairs from the loop (32*2) + add r3, r3, #8*32*2 - 32*2 + ; bands loop processing + subs r4, r4, #1 + bne idct32_bands_loop + + ; stack operation + add sp, sp, #512+2048+2048 + vpop {d8-d15} + pop {r4-r11} + bx lr + ENDP ; |vp9_short_idct32x32_add_neon| END diff --git a/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm index a744f59db..c02251a3d 100644 --- a/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm +++ b/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm @@ -9,7 +9,7 @@ ; EXPORT |vp9_short_idct8x8_add_neon| - EXPORT |vp9_short_idct10_8x8_add_neon| + EXPORT |vp9_short_idct8x8_10_add_neon| ARM REQUIRE8 PRESERVE8 @@ -310,13 +310,13 @@ bx lr ENDP ; |vp9_short_idct8x8_add_neon| -;void vp9_short_idct10_8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vp9_short_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct10_8x8_add_neon| PROC +|vp9_short_idct8x8_10_add_neon| PROC push {r4-r9} vpush {d8-d15} vld1.s16 {q8,q9}, [r0]! 
@@ -514,6 +514,6 @@ vpop {d8-d15} pop {r4-r9} bx lr - ENDP ; |vp9_short_idct10_8x8_add_neon| + ENDP ; |vp9_short_idct8x8_10_add_neon| END diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index 864e27e98..5e526a83c 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -170,13 +170,8 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { void vp9_create_common(VP9_COMMON *cm) { vp9_machine_specific_config(cm); - vp9_init_mbmode_probs(cm); - cm->tx_mode = ONLY_4X4; cm->comp_pred_mode = HYBRID_PREDICTION; - - // Initialize reference frame sign bias structure to defaults - vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias)); } void vp9_remove_common(VP9_COMMON *cm) { diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index c8d677fb9..9ab2cc31b 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -137,7 +137,7 @@ typedef struct { TX_SIZE tx_size; int_mv mv[2]; // for each reference frame used int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; - int_mv best_mv, best_second_mv; + int_mv best_mv[2]; uint8_t mode_context[MAX_REF_FRAMES]; @@ -247,7 +247,7 @@ typedef struct macroblockd { } MACROBLOCKD; -static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) { +static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) { switch (subsize) { case BLOCK_64X64: case BLOCK_64X32: diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c index abedf6b27..1705402c2 100644 --- a/vp9/common/vp9_convolve.c +++ b/vp9/common/vp9_convolve.c @@ -282,7 +282,7 @@ void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, int r; for (r = h; r > 0; --r) { - memcpy(dst, src, w); + vpx_memcpy(dst, src, w); src += src_stride; dst += dst_stride; } diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c index 32d9e0cf7..f171c317f 100644 --- a/vp9/common/vp9_entropy.c +++ b/vp9/common/vp9_entropy.c @@ -569,31 +569,6 @@ 
void vp9_init_neighbors() { vp9_default_scan_32x32_neighbors); } -const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan) { - if (scan == vp9_default_scan_4x4) { - return vp9_default_scan_4x4_neighbors; - } else if (scan == vp9_row_scan_4x4) { - return vp9_row_scan_4x4_neighbors; - } else if (scan == vp9_col_scan_4x4) { - return vp9_col_scan_4x4_neighbors; - } else if (scan == vp9_default_scan_8x8) { - return vp9_default_scan_8x8_neighbors; - } else if (scan == vp9_row_scan_8x8) { - return vp9_row_scan_8x8_neighbors; - } else if (scan == vp9_col_scan_8x8) { - return vp9_col_scan_8x8_neighbors; - } else if (scan == vp9_default_scan_16x16) { - return vp9_default_scan_16x16_neighbors; - } else if (scan == vp9_row_scan_16x16) { - return vp9_row_scan_16x16_neighbors; - } else if (scan == vp9_col_scan_16x16) { - return vp9_col_scan_16x16_neighbors; - } else { - assert(scan == vp9_default_scan_32x32); - return vp9_default_scan_32x32_neighbors; - } -} - void vp9_coef_tree_initialize() { vp9_init_neighbors(); init_bit_trees(); diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index c1f2d782b..4ed94815b 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -190,9 +190,6 @@ static INLINE int get_coef_context(const int16_t *neighbors, token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1; } -const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan); - - // 128 lists of probabilities are stored for the following ONE node probs: // 1, 3, 5, 7, ..., 253, 255 // In between probabilities are interpolated linearly @@ -367,22 +364,24 @@ static int get_entropy_context(TX_SIZE tx_size, static void get_scan_and_band(const MACROBLOCKD *xd, TX_SIZE tx_size, PLANE_TYPE type, int block_idx, const int16_t **scan, + const int16_t **scan_nb, const uint8_t **band_translate) { switch (tx_size) { case TX_4X4: - *scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx)); + get_scan_nb_4x4(get_tx_type_4x4(type, xd, block_idx), scan, scan_nb); 
*band_translate = vp9_coefband_trans_4x4; break; case TX_8X8: - *scan = get_scan_8x8(get_tx_type_8x8(type, xd)); + get_scan_nb_8x8(get_tx_type_8x8(type, xd), scan, scan_nb); *band_translate = vp9_coefband_trans_8x8plus; break; case TX_16X16: - *scan = get_scan_16x16(get_tx_type_16x16(type, xd)); + get_scan_nb_16x16(get_tx_type_16x16(type, xd), scan, scan_nb); *band_translate = vp9_coefband_trans_8x8plus; break; case TX_32X32: *scan = vp9_default_scan_32x32; + *scan_nb = vp9_default_scan_32x32_neighbors; *band_translate = vp9_coefband_trans_8x8plus; break; default: diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h index 4cf4c0392..31537c7f7 100644 --- a/vp9/common/vp9_entropymode.h +++ b/vp9/common/vp9_entropymode.h @@ -14,7 +14,6 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_treecoder.h" -#define SUBMVREF_COUNT 5 #define TX_SIZE_CONTEXTS 2 #define MODE_UPDATE_PROB 252 #define SWITCHABLE_FILTERS 3 // number of switchable filters diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c index 49a731fdb..73f6b4c19 100644 --- a/vp9/common/vp9_findnearmv.c +++ b/vp9/common/vp9_findnearmv.c @@ -54,7 +54,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, dst_list[1].as_int = 0; if (block_idx == 0) { - memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv)); + vpx_memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv)); } else if (block_idx == 1 || block_idx == 2) { int dst = 0, n; union b_mode_info *bmi = mi->bmi; diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h index ad0d882b9..50dfdc7fb 100644 --- a/vp9/common/vp9_findnearmv.h +++ b/vp9/common/vp9_findnearmv.h @@ -55,13 +55,11 @@ static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, if (!mi) return DC_PRED; - if (mi->mbmi.ref_frame[0] != INTRA_FRAME) { + if (is_inter_block(&mi->mbmi)) return DC_PRED; - } else if (mi->mbmi.sb_type < BLOCK_8X8) { - return ((mi->bmi + 1 + b)->as_mode); - } else 
{ - return mi->mbmi.mode; - } + else + return mi->mbmi.sb_type < BLOCK_8X8 ? (mi->bmi + 1 + b)->as_mode + : mi->mbmi.mode; } assert(b == 1 || b == 3); return (mi->bmi + b - 1)->as_mode; @@ -77,13 +75,11 @@ static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, if (!mi) return DC_PRED; - if (mi->mbmi.ref_frame[0] != INTRA_FRAME) { + if (is_inter_block(&mi->mbmi)) return DC_PRED; - } else if (mi->mbmi.sb_type < BLOCK_8X8) { - return ((mi->bmi + 2 + b)->as_mode); - } else { - return mi->mbmi.mode; - } + else + return mi->mbmi.sb_type < BLOCK_8X8 ? (mi->bmi + 2 + b)->as_mode + : mi->mbmi.mode; } return (mi->bmi + b - 2)->as_mode; diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index a2245259e..bc30d2a95 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -28,10 +28,10 @@ void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int16_t *op = output; for (i = 0; i < 4; i++) { - a1 = ip[0] >> WHT_UPSCALE_FACTOR; - c1 = ip[1] >> WHT_UPSCALE_FACTOR; - d1 = ip[2] >> WHT_UPSCALE_FACTOR; - b1 = ip[3] >> WHT_UPSCALE_FACTOR; + a1 = ip[0] >> UNIT_QUANT_SHIFT; + c1 = ip[1] >> UNIT_QUANT_SHIFT; + d1 = ip[2] >> UNIT_QUANT_SHIFT; + b1 = ip[3] >> UNIT_QUANT_SHIFT; a1 += c1; d1 -= b1; e1 = (a1 - d1) >> 1; @@ -77,7 +77,7 @@ void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) { int16_t *ip = in; int16_t *op = tmp; - a1 = ip[0] >> WHT_UPSCALE_FACTOR; + a1 = ip[0] >> UNIT_QUANT_SHIFT; e1 = a1 >> 1; a1 -= e1; op[0] = a1; @@ -420,7 +420,7 @@ void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride, + dest[j * dest_stride + i]); } } -void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, +void vp9_short_idct8x8_10_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int16_t out[8 * 8] = { 0 }; int16_t *outptr = out; @@ -838,7 +838,7 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride, + dest[j * dest_stride + i]); } } -void 
vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, +void vp9_short_idct16x16_10_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int16_t out[16 * 16] = { 0 }; int16_t *outptr = out; diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h index 5f2f0a569..59892cd03 100644 --- a/vp9/common/vp9_idct.h +++ b/vp9/common/vp9_idct.h @@ -22,7 +22,8 @@ #define DCT_CONST_BITS 14 #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) -#define WHT_UPSCALE_FACTOR 2 +#define UNIT_QUANT_SHIFT 2 +#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT) #define pair_set_epi16(a, b) \ _mm_set_epi16(b, a, b, a, b, a, b, a) diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index 0f2e4e999..18407dd73 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -132,7 +132,7 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, const int x = 4 * (block & ((1 << bwl) - 1)); const int y = 4 * (block >> bwl); const MODE_INFO *mi = xd->this_mi; - const int use_second_ref = mi->mbmi.ref_frame[1] > 0; + const int is_compound = has_second_ref(&mi->mbmi); int ref; assert(x < bw); @@ -140,7 +140,7 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw); assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh); - for (ref = 0; ref < 1 + use_second_ref; ++ref) { + for (ref = 0; ref < 1 + is_compound; ++ref) { struct scale_factors *const scale = &xd->scale_factor[ref]; struct buf_2d *const pre_buf = &pd->pre[ref]; struct buf_2d *const dst_buf = &pd->dst; diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 042afbbef..42923b3c8 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -53,7 +53,7 @@ prototype void vp9_d45_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const ui specialize vp9_d45_predictor_4x4 $ssse3_x86inc prototype void vp9_d63_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const 
uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_4x4 +specialize vp9_d63_predictor_4x4 $ssse3_x86inc prototype void vp9_h_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_h_predictor_4x4 $ssse3_x86inc @@ -92,7 +92,7 @@ prototype void vp9_d45_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const ui specialize vp9_d45_predictor_8x8 $ssse3_x86inc prototype void vp9_d63_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_8x8 +specialize vp9_d63_predictor_8x8 $ssse3_x86inc prototype void vp9_h_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_h_predictor_8x8 $ssse3_x86inc @@ -131,7 +131,7 @@ prototype void vp9_d45_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const specialize vp9_d45_predictor_16x16 $ssse3_x86inc prototype void vp9_d63_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_16x16 +specialize vp9_d63_predictor_16x16 $ssse3_x86inc prototype void vp9_h_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_h_predictor_16x16 $ssse3_x86inc @@ -170,7 +170,7 @@ prototype void vp9_d45_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const specialize vp9_d45_predictor_32x32 $ssse3_x86inc prototype void vp9_d63_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_32x32 +specialize vp9_d63_predictor_32x32 $ssse3_x86inc prototype void vp9_h_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_h_predictor_32x32 $ssse3_x86inc @@ -306,8 +306,8 @@ specialize vp9_short_idct8x8_1_add sse2 neon prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct8x8_add sse2 neon
-prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct10_8x8_add sse2 neon +prototype void vp9_short_idct8x8_10_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct8x8_10_add sse2 neon prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct16x16_1_add sse2 neon @@ -315,8 +315,8 @@ specialize vp9_short_idct16x16_1_add sse2 neon prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct16x16_add sse2 neon -prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct10_16x16_add sse2 neon +prototype void vp9_short_idct16x16_10_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct16x16_10_add sse2 neon prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct32x32_add sse2 neon diff --git a/vp9/common/vp9_subpelvar.h b/vp9/common/vp9_subpelvar.h deleted file mode 100644 index fe75481f6..000000000 --- a/vp9/common/vp9_subpelvar.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VP9_COMMON_VP9_SUBPELVAR_H_ -#define VP9_COMMON_VP9_SUBPELVAR_H_ - -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_convolve.h" - -static void variance(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - int w, - int h, - unsigned int *sse, - int *sum) { - int i, j; - int diff; - - *sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - diff = src_ptr[j] - ref_ptr[j]; - *sum += diff; - *sse += diff * diff; - } - - src_ptr += source_stride; - ref_ptr += recon_stride; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_first_pass - * - * INPUTS : uint8_t *src_ptr : Pointer to source block. - * uint32_t src_pixels_per_line : Stride of input block. - * uint32_t pixel_step : Offset between filter input samples (see notes). - * uint32_t output_height : Input block height. - * uint32_t output_width : Input block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : int32_t *output_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in - * either horizontal or vertical direction to produce the - * filtered output block. Used to implement first-pass - * of 2-D separable filter. - * - * SPECIAL NOTES : Produces int32_t output to retain precision for next pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. - * pixel_step defines whether the filter is applied - * horizontally (pixel_step=1) or vertically (pixel_step=stride). - * It defines the offset required to move from one input - * to the next. 
- * - ****************************************************************************/ -static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr, - uint16_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + - (int)src_ptr[pixel_step] * vp9_filter[1], - FILTER_BITS); - - src_ptr++; - } - - // Next row... - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_second_pass - * - * INPUTS : int32_t *src_ptr : Pointer to source block. - * uint32_t src_pixels_per_line : Stride of input block. - * uint32_t pixel_step : Offset between filter input samples (see notes). - * uint32_t output_height : Input block height. - * uint32_t output_width : Input block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : uint16_t *output_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in - * either horizontal or vertical direction to produce the - * filtered output block. Used to implement second-pass - * of 2-D separable filter. - * - * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. - * pixel_step defines whether the filter is applied - * horizontally (pixel_step=1) or vertically (pixel_step=stride). - * It defines the offset required to move from one input - * to the next. 
- * - ****************************************************************************/ -static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + - (int)src_ptr[pixel_step] * vp9_filter[1], - FILTER_BITS); - src_ptr++; - } - - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -#endif // VP9_COMMON_VP9_SUBPELVAR_H_ diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 8f740f412..d44c7e2a0 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -985,7 +985,7 @@ void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride, RECON_AND_STORE(dest, in[7]); } -void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_short_idct8x8_10_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<4); @@ -2456,7 +2456,7 @@ void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride, write_buffer_8x16(dest, in1, stride); } -void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest, +void vp9_short_idct16x16_10_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<5); diff --git a/vp9/common/x86/vp9_intrapred_ssse3.asm b/vp9/common/x86/vp9_intrapred_ssse3.asm index 67c8ab03a..c51d01151 100644 --- a/vp9/common/x86/vp9_intrapred_ssse3.asm +++ b/vp9/common/x86/vp9_intrapred_ssse3.asm @@ -17,8 
+17,8 @@ pw_2: times 8 dw 2 pb_7m1: times 8 db 7, -1 pb_15: times 16 db 15 -sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7 -sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7 +sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 @@ -305,3 +305,153 @@ cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset RESTORE_GOT RET + +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 + pavgb %4, %1, %3 + pxor %3, %1 + pand %3, [GLOBAL(pb_1)] + psubb %4, %3 + pavgb %4, %2 +%endmacro + +INIT_XMM ssse3 +cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset + GET_GOT goffsetq + + movq m3, [aboveq] + pshufb m1, m3, [GLOBAL(sh_b23456777)] + pshufb m2, m3, [GLOBAL(sh_b12345677)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4 + pavgb m3, m2 + + ; store 4 lines + movd [dstq ], m3 + movd [dstq+strideq], m4 + lea dstq, [dstq+strideq*2] + psrldq m3, 1 + psrldq m4, 1 + movd [dstq ], m3 + movd [dstq+strideq], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset + GET_GOT goffsetq + + movq m3, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] + pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] + pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] + pshufb m3, [GLOBAL(sh_b0123456777777777)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4 + pavgb m3, m2 + + ; store 4 lines + movq [dstq ], m3 + movq [dstq+strideq], m4 + psrldq m3, 1 + psrldq m4, 1 
+ movq [dstq+strideq*2], m3 + movq [dstq+stride3q ], m4 + lea dstq, [dstq+strideq*4] + psrldq m3, 1 + psrldq m4, 1 + + ; store 4 lines + movq [dstq ], m3 + movq [dstq+strideq], m4 + psrldq m3, 1 + psrldq m4, 1 + movq [dstq+strideq*2], m3 + movq [dstq+stride3q ], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, line + lea stride3q, [strideq*3] + mova m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] + pshufb m3, m0, m1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4 + pavgb m0, m3 + + mov lined, 4 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m4 + pshufb m0, m1 + pshufb m4, m1 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m4 + pshufb m0, m1 + pshufb m4, m1 + lea dstq, [dstq+strideq*4] + dec lined + jnz .loop + RESTORE_GOT + REP_RET + +INIT_XMM ssse3 +cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + mova m7, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, line + mova m1, [GLOBAL(sh_b123456789abcdeff)] + lea stride3q, [strideq*3] + pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)] + pshufb m3, m7, m1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4 + palignr m6, m7, m0, 1 + palignr m5, m7, m0, 2 + pavgb m7, m3 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2 + pavgb m0, m6 + + mov lined, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m7 + mova [dstq+strideq ], m2 + mova [dstq+strideq +16], m4 + palignr m3, m7, m0, 1 + palignr m5, m4, m2, 1 + pshufb m7, m1 + pshufb m4, m1 + + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m5 + mova [dstq+stride3q +16], m4 + palignr m0, m7, m3, 1 + palignr m2, m4, m5, 1 + pshufb m7, m1 + pshufb m4, m1 + lea dstq, [dstq+strideq*4] + dec lined + jnz .loop + RESTORE_GOT + REP_RET diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 
84a29b17a..ba9fad25e 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -75,28 +75,9 @@ static TX_SIZE read_tx_size(VP9D_COMP *pbi, TX_MODE tx_mode, return TX_4X4; } -static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize, - int mi_row, int mi_col, int segment_id) { - const int mi_offset = mi_row * cm->mi_cols + mi_col; - const int bw = 1 << mi_width_log2(bsize); - const int bh = 1 << mi_height_log2(bsize); - const int xmis = MIN(cm->mi_cols - mi_col, bw); - const int ymis = MIN(cm->mi_rows - mi_row, bh); - int x, y; - - assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); - - for (y = 0; y < ymis; y++) - for (x = 0; x < xmis; x++) - cm->last_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id; -} - static int read_intra_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, vp9_reader *r) { - MACROBLOCKD *const xd = &pbi->mb; struct segmentation *const seg = &pbi->common.seg; - const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type; - int segment_id; if (!seg->enabled) return 0; // Default for disabled segmentation @@ -104,9 +85,7 @@ static int read_intra_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, if (!seg->update_map) return 0; - segment_id = read_segment_id(r, seg); - set_segment_id(&pbi->common, bsize, mi_row, mi_col, segment_id); - return segment_id; + return read_segment_id(r, seg); } static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, @@ -115,7 +94,7 @@ static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, MACROBLOCKD *const xd = &pbi->mb; struct segmentation *const seg = &cm->seg; const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type; - int pred_segment_id, segment_id; + int pred_segment_id; if (!seg->enabled) return 0; // Default for disabled segmentation @@ -129,13 +108,10 @@ static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd); const int pred_flag = vp9_read(r, pred_prob);
vp9_set_pred_flag_seg_id(xd, pred_flag); - segment_id = pred_flag ? pred_segment_id - : read_segment_id(r, seg); + return pred_flag ? pred_segment_id : read_segment_id(r, seg); } else { - segment_id = read_segment_id(r, seg); + return read_segment_id(r, seg); } - set_segment_id(cm, bsize, mi_row, mi_col, segment_id); - return segment_id; } static uint8_t read_skip_coeff(VP9D_COMP *pbi, int segment_id, vp9_reader *r) { @@ -200,7 +176,6 @@ static void read_intra_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *m, static int read_mv_component(vp9_reader *r, const nmv_component *mvcomp, int usehp) { - int mag, d, fr, hp; const int sign = vp9_read(r, mvcomp->sign); const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes); @@ -493,11 +468,12 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2 const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2 int idx, idy; + int b_mode; for (idy = 0; idy < 2; idy += num_4x4_h) { for (idx = 0; idx < 2; idx += num_4x4_w) { int_mv blockmv, secondmv; const int j = idy * 2 + idx; - const int b_mode = read_inter_mode(cm, r, inter_mode_ctx); + b_mode = read_inter_mode(cm, r, inter_mode_ctx); if (b_mode == NEARESTMV || b_mode == NEARMV) { vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest, &nearby, j, 0, @@ -544,10 +520,10 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, mi->bmi[j + 2] = mi->bmi[j]; if (num_4x4_w == 2) mi->bmi[j + 1] = mi->bmi[j]; - mi->mbmi.mode = b_mode; } } + mi->mbmi.mode = b_mode; mv0->as_int = mi->bmi[3].as_mv[0].as_int; mv1->as_int = mi->bmi[3].as_mv[1].as_int; } else { diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index 34ed0c759..77fec5061 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -436,7 +436,6 @@ static void setup_segmentation(struct segmentation *seg, static void setup_loopfilter(struct loopfilter *lf, struct 
vp9_read_bit_buffer *rb) { - lf->filter_level = vp9_rb_read_literal(rb, 6); lf->sharpness_level = vp9_rb_read_literal(rb, 3); @@ -935,6 +934,15 @@ void vp9_init_dequantizer(VP9_COMMON *cm) { } } +static void update_segmentation_map(VP9_COMMON *cm) { + int i, j; + + for (i = 0; i < cm->mi_rows; ++i) + for (j = 0; j < cm->mi_cols; ++j) + cm->last_frame_seg_map[i * cm->mi_cols + j] = + cm->mi_grid_visible[i * cm->mode_info_stride + j]->mbmi.segment_id; +} + int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { int i; VP9_COMMON *const cm = &pbi->common; @@ -1014,5 +1022,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { if (cm->refresh_frame_context) cm->frame_contexts[cm->frame_context_idx] = cm->fc; + update_segmentation_map(cm); + return 0; } diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 3792b9c78..8fcf83ee3 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -106,8 +106,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, const uint8_t *band_translate; uint8_t token_cache[1024]; int pt = get_entropy_context(tx_size, A, L); - get_scan_and_band(xd, tx_size, type, block_idx, &scan, &band_translate); - nb = vp9_get_coef_neighbors_handle(scan); + get_scan_and_band(xd, tx_size, type, block_idx, &scan, &nb, &band_translate); while (1) { int val; @@ -122,7 +121,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) break; -SKIP_START: + SKIP_START: if (c >= seg_eob) break; if (c) diff --git a/vp9/decoder/vp9_dsubexp.c b/vp9/decoder/vp9_dsubexp.c index 8cc64f73e..9a970d42b 100644 --- a/vp9/decoder/vp9_dsubexp.c +++ b/vp9/decoder/vp9_dsubexp.c @@ -67,7 +67,6 @@ static int inv_remap_prob(int v, int m) { 206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 
251, 252, - }; // v = merge_index(v, MAX_PROBS - 1, MODULUS_PARAM); v = inv_map_table[v]; diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c index 395e636b8..cac5f1a76 100644 --- a/vp9/decoder/vp9_idct_blk.c +++ b/vp9/decoder/vp9_idct_blk.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" #include "vp9/common/vp9_blockd.h" #include "vp9/decoder/vp9_idct_blk.h" @@ -96,7 +96,7 @@ void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) { vp9_short_idct8x8_1_add(input, dest, stride); input[0] = 0; } else if (eob <= 10) { - vp9_short_idct10_8x8_add(input, dest, stride); + vp9_short_idct8x8_10_add(input, dest, stride); vpx_memset(input, 0, 128); } else { vp9_short_idct8x8_add(input, dest, stride); @@ -126,7 +126,7 @@ void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) { vp9_short_idct16x16_1_add(input, dest, stride); input[0] = 0; } else if (eob <= 10) { - vp9_short_idct10_16x16_add(input, dest, stride); + vp9_short_idct16x16_10_add(input, dest, stride); vpx_memset(input, 0, 512); } else { vp9_short_idct16x16_add(input, dest, stride); diff --git a/vp9/decoder/vp9_idct_blk.h b/vp9/decoder/vp9_idct_blk.h index 1810bd02f..00f1bc6a6 100644 --- a/vp9/decoder/vp9_idct_blk.h +++ b/vp9/decoder/vp9_idct_blk.h @@ -14,17 +14,16 @@ #include "vp9/common/vp9_blockd.h" +void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, + int stride, int eob); -void vp9_idct_add_lossless_c(int16_t *input, unsigned char *dest, int stride, - int eob); - -void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest, +void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride, int eob); -void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest, +void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride, int eob); -void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, unsigned 
char *dest, +void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride, int eob); #endif // VP9_DECODER_VP9_IDCT_BLK_H_ diff --git a/vp9/decoder/vp9_onyxd.h b/vp9/decoder/vp9_onyxd.h index cd5b7508f..4f662e9ac 100644 --- a/vp9/decoder/vp9_onyxd.h +++ b/vp9/decoder/vp9_onyxd.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_ONYXD_H_ -#define VP9_COMMON_VP9_ONYXD_H_ +#ifndef VP9_DECODER_VP9_ONYXD_H_ +#define VP9_DECODER_VP9_ONYXD_H_ #ifdef __cplusplus extern "C" { @@ -66,4 +66,4 @@ void vp9_remove_decompressor(VP9D_PTR comp); } #endif -#endif // VP9_COMMON_VP9_ONYXD_H_ +#endif // VP9_DECODER_VP9_ONYXD_H_ diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c index 17d5def33..1c804d974 100644 --- a/vp9/decoder/vp9_onyxd_if.c +++ b/vp9/decoder/vp9_onyxd_if.c @@ -65,13 +65,12 @@ static void recon_write_yuv_frame(const char *name, #endif #if WRITE_RECON_BUFFER == 2 void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) { - // write the frame FILE *yframe; int i; char filename[255]; - sprintf(filename, "dx\\y%04d.raw", this_frame); + snprintf(filename, sizeof(filename)-1, "dx\\y%04d.raw", this_frame); yframe = fopen(filename, "wb"); for (i = 0; i < frame->y_height; i++) @@ -79,7 +78,7 @@ void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) { frame->y_width, 1, yframe); fclose(yframe); - sprintf(filename, "dx\\u%04d.raw", this_frame); + snprintf(filename, sizeof(filename)-1, "dx\\u%04d.raw", this_frame); yframe = fopen(filename, "wb"); for (i = 0; i < frame->uv_height; i++) @@ -87,7 +86,7 @@ void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) { frame->uv_width, 1, yframe); fclose(yframe); - sprintf(filename, "dx\\v%04d.raw", this_frame); + snprintf(filename, sizeof(filename)-1, "dx\\v%04d.raw", this_frame); yframe = fopen(filename, "wb"); for (i = 0; i < frame->uv_height; i++) @@ -214,13 +213,13 @@ vpx_codec_err_t 
vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag, * vpxenc --test-decode functionality working, and will be replaced in a * later commit that adds VP9-specific controls for this functionality. */ - if (ref_frame_flag == VP9_LAST_FLAG) + if (ref_frame_flag == VP9_LAST_FLAG) { ref_fb_ptr = &pbi->common.active_ref_idx[0]; - else if (ref_frame_flag == VP9_GOLD_FLAG) + } else if (ref_frame_flag == VP9_GOLD_FLAG) { ref_fb_ptr = &pbi->common.active_ref_idx[1]; - else if (ref_frame_flag == VP9_ALT_FLAG) + } else if (ref_frame_flag == VP9_ALT_FLAG) { ref_fb_ptr = &pbi->common.active_ref_idx[2]; - else { + } else { vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, "Invalid reference frame"); return pbi->common.error.error_code; diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h index a051971a1..8fee5e975 100644 --- a/vp9/decoder/vp9_onyxd_int.h +++ b/vp9/decoder/vp9_onyxd_int.h @@ -41,4 +41,4 @@ typedef struct VP9Decompressor { VP9Worker lf_worker; } VP9D_COMP; -#endif // VP9_DECODER_VP9_TREEREADER_H_ +#endif // VP9_DECODER_VP9_ONYXD_INT_H_ diff --git a/vp9/decoder/vp9_read_bit_buffer.h b/vp9/decoder/vp9_read_bit_buffer.h index c7fa3aa27..41a686837 100644 --- a/vp9/decoder/vp9_read_bit_buffer.h +++ b/vp9/decoder/vp9_read_bit_buffer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_READ_BIT_BUFFER_ -#define VP9_READ_BIT_BUFFER_ +#ifndef VP9_DECODER_VP9_READ_BIT_BUFFER_H_ +#define VP9_DECODER_VP9_READ_BIT_BUFFER_H_ #include <limits.h> @@ -57,4 +57,4 @@ static int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb, return vp9_rb_read_bit(rb) ? 
-value : value; } -#endif // VP9_READ_BIT_BUFFER_ +#endif // VP9_DECODER_VP9_READ_BIT_BUFFER_H_ diff --git a/vp9/decoder/vp9_thread.h b/vp9/decoder/vp9_thread.h index a8f7e046a..0b5eca073 100644 --- a/vp9/decoder/vp9_thread.h +++ b/vp9/decoder/vp9_thread.h @@ -17,7 +17,7 @@ #ifndef VP9_DECODER_VP9_THREAD_H_ #define VP9_DECODER_VP9_THREAD_H_ -#include "vpx_config.h" +#include "./vpx_config.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { @@ -90,4 +90,4 @@ void vp9_worker_end(VP9Worker* const worker); } // extern "C" #endif -#endif /* VP9_DECODER_VP9_THREAD_H_ */ +#endif // VP9_DECODER_VP9_THREAD_H_ diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 622f75fe6..20dd8e175 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -484,17 +484,13 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { } if (bsize < BLOCK_8X8) { - int j; - MB_PREDICTION_MODE blockmode; - int_mv blockmv; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy; for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { - j = idy * 2 + idx; - blockmode = x->partition_info->bmi[j].mode; - blockmv = m->bmi[j].as_mv[0]; + const int j = idy * 2 + idx; + const MB_PREDICTION_MODE blockmode = x->partition_info->bmi[j].mode; write_sb_mv_ref(bc, blockmode, mv_ref_p); ++cm->counts.inter_mode[mi->mode_context[rf]] [inter_mode_offset(blockmode)]; @@ -503,14 +499,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { #ifdef ENTROPY_STATS active_section = 11; #endif - vp9_encode_mv(cpi, bc, &blockmv.as_mv, &mi->best_mv.as_mv, - nmvc, allow_hp); - - if (mi->ref_frame[1] > INTRA_FRAME) - vp9_encode_mv(cpi, bc, - &m->bmi[j].as_mv[1].as_mv, - &mi->best_second_mv.as_mv, - nmvc, allow_hp); + vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[0].as_mv, + 
&mi->best_mv[0].as_mv, nmvc, allow_hp); + + if (has_second_ref(mi)) + vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[1].as_mv, + &mi->best_mv[1].as_mv, nmvc, allow_hp); } } } @@ -518,12 +512,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { #ifdef ENTROPY_STATS active_section = 5; #endif - vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv, &mi->best_mv.as_mv, - nmvc, allow_hp); + vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv, + &mi->best_mv[0].as_mv, nmvc, allow_hp); - if (mi->ref_frame[1] > INTRA_FRAME) - vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv, &mi->best_second_mv.as_mv, - nmvc, allow_hp); + if (has_second_ref(mi)) + vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv, + &mi->best_mv[1].as_mv, nmvc, allow_hp); } } } diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 013047e35..5a0d746c8 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -34,6 +34,7 @@ typedef struct { typedef struct { MODE_INFO mic; PARTITION_INFO partition_info; + unsigned char zcoeff_blk[256]; int skip; int_mv best_ref_mv; int_mv second_best_ref_mv; @@ -136,6 +137,7 @@ struct macroblock { int mv_row_min; int mv_row_max; + unsigned char zcoeff_blk[TX_SIZES][256]; int skip; int encode_breakout; diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index ca863931e..b9c300033 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -637,10 +637,10 @@ void vp9_short_walsh4x4_c(short *input, short *output, int pitch) { c1 = e1 - c1; a1 -= c1; d1 += b1; - op[0] = a1 << WHT_UPSCALE_FACTOR; - op[1] = c1 << WHT_UPSCALE_FACTOR; - op[2] = d1 << WHT_UPSCALE_FACTOR; - op[3] = b1 << WHT_UPSCALE_FACTOR; + op[0] = a1 * UNIT_QUANT_FACTOR; + op[1] = c1 * UNIT_QUANT_FACTOR; + op[2] = d1 * UNIT_QUANT_FACTOR; + op[3] = b1 * UNIT_QUANT_FACTOR; ip += 4; op += 4; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index ee938bda9..f6045e80b 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -390,6 +390,9 @@ static 
void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, } x->skip = ctx->skip; + vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk, + sizeof(ctx->zcoeff_blk)); + if (!output_enabled) return; @@ -428,19 +431,19 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, cpi->mode_chosen_counts[mb_mode_index]++; if (is_inter_block(mbmi) && (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) { - int_mv best_mv, best_second_mv; + int_mv best_mv[2]; const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0]; const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1]; - best_mv.as_int = ctx->best_ref_mv.as_int; - best_second_mv.as_int = ctx->second_best_ref_mv.as_int; + best_mv[0].as_int = ctx->best_ref_mv.as_int; + best_mv[1].as_int = ctx->second_best_ref_mv.as_int; if (mbmi->mode == NEWMV) { - best_mv.as_int = mbmi->ref_mvs[rf1][0].as_int; + best_mv[0].as_int = mbmi->ref_mvs[rf1][0].as_int; if (rf2 > 0) - best_second_mv.as_int = mbmi->ref_mvs[rf2][0].as_int; + best_mv[1].as_int = mbmi->ref_mvs[rf2][0].as_int; } - mbmi->best_mv.as_int = best_mv.as_int; - mbmi->best_second_mv.as_int = best_second_mv.as_int; - vp9_update_nmv_count(cpi, x, &best_mv, &best_second_mv); + mbmi->best_mv[0].as_int = best_mv[0].as_int; + mbmi->best_mv[1].as_int = best_mv[1].as_int; + vp9_update_mv_count(cpi, x, best_mv); } if (cm->mcomp_filter_type == SWITCHABLE && is_inter_mode(mbmi->mode)) { @@ -2209,7 +2212,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { cpi->inter_zz_count = 0; vp9_zero(cm->counts.switchable_interp); - vp9_zero(cpi->txfm_stepdown_count); + vp9_zero(cpi->tx_stepdown_count); xd->mi_8x8 = cm->mi_grid_visible; // required for vp9_frame_init_quantizer @@ -2348,18 +2351,19 @@ static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, int mis, TX_SIZE max_tx_size, int bw, int bh, int mi_row, int mi_col, BLOCK_SIZE bsize) { VP9_COMMON * const cm = &cpi->common; - MB_MODE_INFO * const mbmi = &mi_8x8[0]->mbmi; - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) + if 
(mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) { return; - - if (mbmi->tx_size > max_tx_size) { - const int ymbs = MIN(bh, cm->mi_rows - mi_row); - const int xmbs = MIN(bw, cm->mi_cols - mi_col); - - assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) || - get_skip_flag(mi_8x8, mis, ymbs, xmbs)); - set_txfm_flag(mi_8x8, mis, ymbs, xmbs, max_tx_size); + } else { + MB_MODE_INFO * const mbmi = &mi_8x8[0]->mbmi; + if (mbmi->tx_size > max_tx_size) { + const int ymbs = MIN(bh, cm->mi_rows - mi_row); + const int xmbs = MIN(bw, cm->mi_cols - mi_col); + + assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) || + get_skip_flag(mi_8x8, mis, ymbs, xmbs)); + set_txfm_flag(mi_8x8, mis, ymbs, xmbs, max_tx_size); + } } } @@ -2454,9 +2458,9 @@ static void select_tx_mode(VP9_COMP *cpi) { unsigned int total = 0; int i; for (i = 0; i < TX_SIZES; ++i) - total += cpi->txfm_stepdown_count[i]; + total += cpi->tx_stepdown_count[i]; if (total) { - double fraction = (double)cpi->txfm_stepdown_count[0] / total; + double fraction = (double)cpi->tx_stepdown_count[0] / total; cpi->common.tx_mode = fraction > 0.90 ? 
ALLOW_32X32 : TX_MODE_SELECT; // printf("fraction = %f\n", fraction); } // else keep unchanged @@ -2732,7 +2736,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[0])]; YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx]; YV12_BUFFER_CONFIG *second_ref_fb = NULL; - if (mbmi->ref_frame[1] > 0) { + if (has_second_ref(mbmi)) { idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[1])]; second_ref_fb = &cm->yv12_fb[idx]; } @@ -2744,7 +2748,6 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col, &xd->scale_factor[1]); - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); } diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 2c12477a7..76a5d33e7 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -53,7 +53,7 @@ static void inverse_transform_b_8x8_add(int eob, if (eob <= 1) vp9_short_idct8x8_1_add(dqcoeff, dest, stride); else if (eob <= 10) - vp9_short_idct10_8x8_add(dqcoeff, dest, stride); + vp9_short_idct8x8_10_add(dqcoeff, dest, stride); else vp9_short_idct8x8_add(dqcoeff, dest, stride); } @@ -64,7 +64,7 @@ static void inverse_transform_b_16x16_add(int eob, if (eob <= 1) vp9_short_idct16x16_1_add(dqcoeff, dest, stride); else if (eob <= 10) - vp9_short_idct10_16x16_add(dqcoeff, dest, stride); + vp9_short_idct16x16_10_add(dqcoeff, dest, stride); else vp9_short_idct16x16_add(dqcoeff, dest, stride); } @@ -172,7 +172,7 @@ static void optimize_b(MACROBLOCK *mb, assert((!type && !plane) || (type && plane)); dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block); qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block); - get_scan_and_band(xd, tx_size, type, ib, &scan, &band_translate); + get_scan_and_band(xd, tx_size, type, ib, &scan, &nb, &band_translate); assert(eob <= default_eob); /* Now set up a Viterbi trellis to evaluate alternative 
roundings. */ @@ -191,7 +191,6 @@ static void optimize_b(MACROBLOCK *mb, for (i = 0; i < eob; i++) token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[ qcoeff_ptr[scan[i]]].token]; - nb = vp9_get_coef_neighbors_handle(scan); for (i = eob; i-- > i0;) { int base_bits, d2, dx; @@ -365,36 +364,10 @@ static void optimize_init_b(int plane, BLOCK_SIZE bsize, const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; const MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size; - int i; - switch (tx_size) { - case TX_4X4: - vpx_memcpy(args->ctx->ta[plane], pd->above_context, - sizeof(ENTROPY_CONTEXT) * num_4x4_w); - vpx_memcpy(args->ctx->tl[plane], pd->left_context, - sizeof(ENTROPY_CONTEXT) * num_4x4_h); - break; - case TX_8X8: - for (i = 0; i < num_4x4_w; i += 2) - args->ctx->ta[plane][i] = !!*(uint16_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_h; i += 2) - args->ctx->tl[plane][i] = !!*(uint16_t *)&pd->left_context[i]; - break; - case TX_16X16: - for (i = 0; i < num_4x4_w; i += 4) - args->ctx->ta[plane][i] = !!*(uint32_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_h; i += 4) - args->ctx->tl[plane][i] = !!*(uint32_t *)&pd->left_context[i]; - break; - case TX_32X32: - for (i = 0; i < num_4x4_w; i += 8) - args->ctx->ta[plane][i] = !!*(uint64_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_h; i += 8) - args->ctx->tl[plane][i] = !!*(uint64_t *)&pd->left_context[i]; - break; - default: - assert(0); - } + vp9_get_entropy_contexts(tx_size, args->ctx->ta[plane], args->ctx->tl[plane], + pd->above_context, pd->left_context, + num_4x4_w, num_4x4_h); } void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, @@ -482,6 +455,14 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block, pd->dst.buf, pd->dst.stride); + + // 
TODO(jingning): per transformed block zero forcing only enabled for + // luma component. will integrate chroma components as well. + if (x->zcoeff_blk[tx_size][block] && plane == 0) { + pd->eobs[block] = 0; + return; + } + vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); if (x->optimize) diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c index ed3a2bb64..db08ee856 100644 --- a/vp9/encoder/vp9_encodemv.c +++ b/vp9/encoder/vp9_encodemv.c @@ -314,44 +314,34 @@ void vp9_build_nmv_cost_table(int *mvjoint, build_nmv_component_cost_table(mvcost[1], &mvctx->comps[1], usehp); } -void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x, - int_mv *best_ref_mv, int_mv *second_best_ref_mv) { +static void inc_mvs(int_mv mv[2], int_mv ref[2], int is_compound, + nmv_context_counts *counts) { + int i; + for (i = 0; i < 1 + is_compound; ++i) { + const MV diff = { mv[i].as_mv.row - ref[i].as_mv.row, + mv[i].as_mv.col - ref[i].as_mv.col }; + vp9_inc_mv(&diff, counts); + } +} + +void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]) { MODE_INFO *mi = x->e_mbd.mi_8x8[0]; MB_MODE_INFO *const mbmi = &mi->mbmi; - MV diff; - const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; - const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; - int idx, idy; + const int is_compound = has_second_ref(mbmi); if (mbmi->sb_type < BLOCK_8X8) { - PARTITION_INFO *pi = x->partition_info; - for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { - for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type]; + const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi->sb_type]; + int idx, idy; + + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { const int i = idy * 2 + idx; - if (pi->bmi[i].mode == NEWMV) { - diff.row = mi->bmi[i].as_mv[0].as_mv.row - best_ref_mv->as_mv.row; - diff.col = mi->bmi[i].as_mv[0].as_mv.col - 
best_ref_mv->as_mv.col; - vp9_inc_mv(&diff, &cpi->NMVcount); - - if (mi->mbmi.ref_frame[1] > INTRA_FRAME) { - diff.row = mi->bmi[i].as_mv[1].as_mv.row - - second_best_ref_mv->as_mv.row; - diff.col = mi->bmi[i].as_mv[1].as_mv.col - - second_best_ref_mv->as_mv.col; - vp9_inc_mv(&diff, &cpi->NMVcount); - } - } + if (x->partition_info->bmi[i].mode == NEWMV) + inc_mvs(mi->bmi[i].as_mv, best_ref_mv, is_compound, &cpi->NMVcount); } } } else if (mbmi->mode == NEWMV) { - diff.row = mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row; - diff.col = mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col; - vp9_inc_mv(&diff, &cpi->NMVcount); - - if (mbmi->ref_frame[1] > INTRA_FRAME) { - diff.row = mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row; - diff.col = mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col; - vp9_inc_mv(&diff, &cpi->NMVcount); - } + inc_mvs(mbmi->mv, best_ref_mv, is_compound, &cpi->NMVcount); } } diff --git a/vp9/encoder/vp9_encodemv.h b/vp9/encoder/vp9_encodemv.h index 2789ce114..633177885 100644 --- a/vp9/encoder/vp9_encodemv.h +++ b/vp9/encoder/vp9_encodemv.h @@ -25,7 +25,7 @@ void vp9_build_nmv_cost_table(int *mvjoint, int usehp, int mvc_flag_v, int mvc_flag_h); -void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x, - int_mv *best_ref_mv, int_mv *second_best_ref_mv); + +void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]); #endif // VP9_ENCODER_VP9_ENCODEMV_H_ diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 6e44e604c..eaa3bd183 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -534,10 +534,11 @@ void vp9_first_pass(VP9_COMP *cpi) { recon_yoffset = (mb_row * recon_y_stride * 16); recon_uvoffset = (mb_row * recon_uv_stride * 8); - // Set up limit values for motion vectors to prevent them extending outside the UMV borders - x->mv_row_min = -((mb_row * 16) + (VP9BORDERINPIXELS - 8)); + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders 
+ x->mv_row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16); x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) - + (VP9BORDERINPIXELS - 8); + + BORDER_MV_PIXELS_B16; // for each macroblock col in image for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { @@ -583,9 +584,9 @@ void vp9_first_pass(VP9_COMP *cpi) { intra_error += (int64_t)this_error; // Set up limit values for motion vectors to prevent them extending outside the UMV borders - x->mv_col_min = -((mb_col * 16) + (VP9BORDERINPIXELS - 8)); + x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) - + (VP9BORDERINPIXELS - 8); + + BORDER_MV_PIXELS_B16; // Other than for the first frame do a motion search if (cm->current_video_frame > 0) { diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 5a671f201..0a6576eb5 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -10,14 +10,17 @@ #include <limits.h> -#include <vpx_mem/vpx_mem.h> -#include <vp9/encoder/vp9_encodeintra.h> -#include <vp9/encoder/vp9_rdopt.h> -#include <vp9/common/vp9_blockd.h> -#include <vp9/common/vp9_reconinter.h> -#include <vp9/common/vp9_reconintra.h> -#include <vp9/common/vp9_systemdependent.h> -#include <vp9/encoder/vp9_segmentation.h> +#include "vpx_mem/vpx_mem.h" +#include "vp9/encoder/vp9_encodeintra.h" +#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_segmentation.h" +#include "vp9/encoder/vp9_mcomp.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_systemdependent.h" + + static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, int_mv *ref_mv, @@ -46,9 +49,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, ref_full.as_mv.row = ref_mv->as_mv.row >> 3; /*cpi->sf.search_method == HEX*/ - best_err = vp9_hex_search(x, &ref_full, step_param, x->errorperbit, + best_err = vp9_hex_search(x, &ref_full.as_mv, step_param, x->errorperbit, 
0, &v_fn_ptr, - 0, ref_mv, dst_mv); + 0, &ref_mv->as_mv, &dst_mv->as_mv); // Try sub-pixel MC // if (bestsme > error_thresh && bestsme < INT_MAX) @@ -57,7 +60,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, unsigned int sse; best_err = cpi->find_fractional_mv_step( x, - dst_mv, ref_mv, + &dst_mv->as_mv, &ref_mv->as_mv, x->errorperbit, &v_fn_ptr, 0, cpi->sf.subpel_iters_per_step, NULL, NULL, & distortion, &sse); @@ -246,9 +249,8 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, // Set up limit values for motion vectors to prevent them extending outside the UMV borders arf_top_mv.as_int = 0; gld_top_mv.as_int = 0; - x->mv_row_min = -(VP9BORDERINPIXELS - 8 - VP9_INTERP_EXTEND); - x->mv_row_max = (cm->mb_rows - 1) * 8 + VP9BORDERINPIXELS - - 8 - VP9_INTERP_EXTEND; + x->mv_row_min = -BORDER_MV_PIXELS_B16; + x->mv_row_max = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16; xd->up_available = 0; xd->plane[0].dst.stride = buf->y_stride; xd->plane[0].pre[0].stride = buf->y_stride; @@ -267,9 +269,8 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, // Set up limit values for motion vectors to prevent them extending outside the UMV borders arf_left_mv.as_int = arf_top_mv.as_int; gld_left_mv.as_int = gld_top_mv.as_int; - x->mv_col_min = -(VP9BORDERINPIXELS - 8 - VP9_INTERP_EXTEND); - x->mv_col_max = (cm->mb_cols - 1) * 8 + VP9BORDERINPIXELS - - 8 - VP9_INTERP_EXTEND; + x->mv_col_min = -BORDER_MV_PIXELS_B16; + x->mv_col_max = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16; xd->left_available = 0; for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 7dd786904..44eaa657c 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -59,38 +59,39 @@ int vp9_init_search_range(VP9_COMP *cpi, int size) { return sr; } -int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], - int weight) { - MV v; - v.row = mv->as_mv.row - ref->as_mv.row; - v.col = mv->as_mv.col - 
ref->as_mv.col; - return ROUND_POWER_OF_TWO((mvjcost[vp9_get_mv_joint(&v)] + - mvcost[0][v.row] + - mvcost[1][v.col]) * weight, 7); +static INLINE int mv_cost(const MV *mv, + const int *joint_cost, int *comp_cost[2]) { + return joint_cost[vp9_get_mv_joint(mv)] + + comp_cost[0][mv->row] + comp_cost[1][mv->col]; } -static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], +int vp9_mv_bit_cost(const MV *mv, const MV *ref, + const int *mvjcost, int *mvcost[2], int weight) { + const MV diff = { mv->row - ref->row, + mv->col - ref->col }; + return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * weight, 7); +} + +static int mv_err_cost(const MV *mv, const MV *ref, + const int *mvjcost, int *mvcost[2], int error_per_bit) { if (mvcost) { - MV v; - v.row = mv->as_mv.row - ref->as_mv.row; - v.col = mv->as_mv.col - ref->as_mv.col; - return ROUND_POWER_OF_TWO((mvjcost[vp9_get_mv_joint(&v)] + - mvcost[0][v.row] + - mvcost[1][v.col]) * error_per_bit, 13); + const MV diff = { mv->row - ref->row, + mv->col - ref->col }; + return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * + error_per_bit, 13); } return 0; } -static int mvsad_err_cost(int_mv *mv, int_mv *ref, int *mvjsadcost, - int *mvsadcost[2], int error_per_bit) { +static int mvsad_err_cost(const MV *mv, const MV *ref, + const int *mvjsadcost, int *mvsadcost[2], + int error_per_bit) { if (mvsadcost) { - MV v; - v.row = mv->as_mv.row - ref->as_mv.row; - v.col = mv->as_mv.col - ref->as_mv.col; - return ROUND_POWER_OF_TWO((mvjsadcost[vp9_get_mv_joint(&v)] + - mvsadcost[0][v.row] + - mvsadcost[1][v.col]) * error_per_bit, 8); + const MV diff = { mv->row - ref->row, + mv->col - ref->col }; + return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjsadcost, mvsadcost) * + error_per_bit, 8); } return 0; } @@ -273,7 +274,7 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) { } int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, int 
error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, @@ -294,25 +295,25 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x, int thismse; const int y_stride = xd->plane[0].pre[0].stride; - const int offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; + const int offset = bestmv->row * y_stride + bestmv->col; uint8_t *y = xd->plane[0].pre[0].buf + offset; - int rr = ref_mv->as_mv.row; - int rc = ref_mv->as_mv.col; - int br = bestmv->as_mv.row * 8; - int bc = bestmv->as_mv.col * 8; + int rr = ref_mv->row; + int rc = ref_mv->col; + int br = bestmv->row * 8; + int bc = bestmv->col * 8; int hstep = 4; - const int minc = MAX(x->mv_col_min * 8, ref_mv->as_mv.col - MV_MAX); - const int maxc = MIN(x->mv_col_max * 8, ref_mv->as_mv.col + MV_MAX); - const int minr = MAX(x->mv_row_min * 8, ref_mv->as_mv.row - MV_MAX); - const int maxr = MIN(x->mv_row_max * 8, ref_mv->as_mv.row + MV_MAX); + const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); + const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); + const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); + const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); int tr = br; int tc = bc; // central mv - bestmv->as_mv.row <<= 3; - bestmv->as_mv.col <<= 3; + bestmv->row <<= 3; + bestmv->col <<= 3; // calculate central point error besterr = vfp->vf(y, y_stride, z, src_stride, sse1); @@ -347,7 +348,7 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x, } } - if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && + if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) && forced_stop == 0) { hstep >>= 1; while (eighthiters--) { @@ -360,18 +361,18 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x, } } - bestmv->as_mv.row = br; - bestmv->as_mv.col = bc; + bestmv->row = br; + bestmv->col = bc; - if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) + if 
((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) return INT_MAX; return besterr; } int vp9_find_best_sub_pixel_tree(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, @@ -391,25 +392,25 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x, unsigned int eighthiters = iters_per_step; const int y_stride = xd->plane[0].pre[0].stride; - const int offset = bestmv->as_mv.row * y_stride + bestmv->as_mv.col; + const int offset = bestmv->row * y_stride + bestmv->col; uint8_t *y = xd->plane[0].pre[0].buf + offset; - int rr = ref_mv->as_mv.row; - int rc = ref_mv->as_mv.col; - int br = bestmv->as_mv.row * 8; - int bc = bestmv->as_mv.col * 8; + int rr = ref_mv->row; + int rc = ref_mv->col; + int br = bestmv->row * 8; + int bc = bestmv->col * 8; int hstep = 4; - const int minc = MAX(x->mv_col_min * 8, ref_mv->as_mv.col - MV_MAX); - const int maxc = MIN(x->mv_col_max * 8, ref_mv->as_mv.col + MV_MAX); - const int minr = MAX(x->mv_row_min * 8, ref_mv->as_mv.row - MV_MAX); - const int maxr = MIN(x->mv_row_max * 8, ref_mv->as_mv.row + MV_MAX); + const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); + const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); + const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); + const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); int tr = br; int tc = bc; // central mv - bestmv->as_mv.row *= 8; - bestmv->as_mv.col *= 8; + bestmv->row *= 8; + bestmv->col *= 8; // calculate central point error besterr = vfp->vf(y, y_stride, z, src_stride, sse1); @@ -435,7 +436,7 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x, tc = bc; } - if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && + if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) && forced_stop == 0) { hstep >>= 1; FIRST_LEVEL_CHECKS; @@ -446,11 +447,11 @@ int 
vp9_find_best_sub_pixel_tree(MACROBLOCK *x, tc = bc; } - bestmv->as_mv.row = br; - bestmv->as_mv.col = bc; + bestmv->row = br; + bestmv->col = bc; - if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) + if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) return INT_MAX; return besterr; @@ -463,7 +464,7 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x, z, src_stride, &sse, second_pred) int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, @@ -487,25 +488,25 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x, DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); const int y_stride = xd->plane[0].pre[0].stride; - const int offset = bestmv->as_mv.row * y_stride + bestmv->as_mv.col; + const int offset = bestmv->row * y_stride + bestmv->col; uint8_t *const y = xd->plane[0].pre[0].buf + offset; - int rr = ref_mv->as_mv.row; - int rc = ref_mv->as_mv.col; - int br = bestmv->as_mv.row * 8; - int bc = bestmv->as_mv.col * 8; + int rr = ref_mv->row; + int rc = ref_mv->col; + int br = bestmv->row * 8; + int bc = bestmv->col * 8; int hstep = 4; - const int minc = MAX(x->mv_col_min * 8, ref_mv->as_mv.col - MV_MAX); - const int maxc = MIN(x->mv_col_max * 8, ref_mv->as_mv.col + MV_MAX); - const int minr = MAX(x->mv_row_min * 8, ref_mv->as_mv.row - MV_MAX); - const int maxr = MIN(x->mv_row_max * 8, ref_mv->as_mv.row + MV_MAX); + const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); + const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); + const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); + const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); int tr = br; int tc = bc; // central mv - bestmv->as_mv.row *= 8; - 
bestmv->as_mv.col *= 8; + bestmv->row *= 8; + bestmv->col *= 8; // calculate central point error // TODO(yunqingwang): central pointer error was already calculated in full- @@ -543,7 +544,7 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x, } } - if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && + if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) && forced_stop == 0) { hstep >>= 1; while (eighthiters--) { @@ -555,18 +556,18 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x, tc = bc; } } - bestmv->as_mv.row = br; - bestmv->as_mv.col = bc; + bestmv->row = br; + bestmv->col = bc; - if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) + if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) return INT_MAX; return besterr; } int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, @@ -589,25 +590,25 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); const int y_stride = xd->plane[0].pre[0].stride; - const int offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; + const int offset = bestmv->row * y_stride + bestmv->col; uint8_t *y = xd->plane[0].pre[0].buf + offset; - int rr = ref_mv->as_mv.row; - int rc = ref_mv->as_mv.col; - int br = bestmv->as_mv.row * 8; - int bc = bestmv->as_mv.col * 8; + int rr = ref_mv->row; + int rc = ref_mv->col; + int br = bestmv->row * 8; + int bc = bestmv->col * 8; int hstep = 4; - const int minc = MAX(x->mv_col_min * 8, ref_mv->as_mv.col - MV_MAX); - const int maxc = MIN(x->mv_col_max * 8, ref_mv->as_mv.col + MV_MAX); - const int minr = MAX(x->mv_row_min * 8, ref_mv->as_mv.row - MV_MAX); - const int maxr = MIN(x->mv_row_max * 8, 
ref_mv->as_mv.row + MV_MAX); + const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); + const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); + const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); + const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); int tr = br; int tc = bc; // central mv - bestmv->as_mv.row *= 8; - bestmv->as_mv.col *= 8; + bestmv->row *= 8; + bestmv->col *= 8; // calculate central point error // TODO(yunqingwang): central pointer error was already calculated in full- @@ -641,7 +642,7 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, tc = bc; } - if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && + if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) && forced_stop == 0) { hstep >>= 1; FIRST_LEVEL_CHECKS; @@ -651,11 +652,11 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, tr = br; tc = bc; } - bestmv->as_mv.row = br; - bestmv->as_mv.col = bc; + bestmv->row = br; + bestmv->col = bc; - if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) + if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) return INT_MAX; return besterr; @@ -679,10 +680,10 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, #define CHECK_POINT \ {\ - if (this_mv.as_mv.col < x->mv_col_min) continue;\ - if (this_mv.as_mv.col > x->mv_col_max) continue;\ - if (this_mv.as_mv.row < x->mv_row_min) continue;\ - if (this_mv.as_mv.row > x->mv_row_max) continue;\ + if (this_mv.col < x->mv_col_min) continue;\ + if (this_mv.col > x->mv_col_max) continue;\ + if (this_mv.row < x->mv_row_min) continue;\ + if (this_mv.row > x->mv_row_max) continue;\ } #define CHECK_BETTER \ @@ -690,7 +691,7 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, if (thissad < bestsad)\ {\ if (use_mvcost) \ - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, \ + thissad 
+= mvsad_err_cost(&this_mv, &fcenter_mv.as_mv, \ mvjsadcost, mvsadcost, \ sad_per_bit);\ if (thissad < bestsad)\ @@ -715,14 +716,14 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, // candidates as indicated in the num_candidates and candidates arrays // passed into this function static int vp9_pattern_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int sad_per_bit, int do_init_search, int do_refine, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, - int_mv *center_mv, int_mv *best_mv, + const MV *center_mv, MV *best_mv, const int num_candidates[MAX_PATTERN_SCALES], const MV candidates[MAX_PATTERN_SCALES] [MAX_PATTERN_CANDIDATES]) { @@ -735,7 +736,7 @@ static int vp9_pattern_search(MACROBLOCK *x, int what_stride = x->plane[0].src.stride; int in_what_stride = xd->plane[0].pre[0].stride; int br, bc; - int_mv this_mv; + MV this_mv; int bestsad = INT_MAX; int thissad; uint8_t *base_offset; @@ -748,24 +749,22 @@ static int vp9_pattern_search(MACROBLOCK *x, int *mvjsadcost = x->nmvjointsadcost; int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; + fcenter_mv.as_mv.row = center_mv->row >> 3; + fcenter_mv.as_mv.col = center_mv->col >> 3; // adjust ref_mv to make sure it is within MV range - clamp_mv(&ref_mv->as_mv, - x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); - br = ref_mv->as_mv.row; - bc = ref_mv->as_mv.col; + clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); + br = ref_mv->row; + bc = ref_mv->col; // Work out the start point for the search base_offset = (uint8_t *)(xd->plane[0].pre[0].buf); this_offset = base_offset + (br * in_what_stride) + bc; - this_mv.as_mv.row = br; - this_mv.as_mv.col = bc; - bestsad = vfp->sdf(what, what_stride, this_offset, - in_what_stride, 0x7fffffff) - + mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + this_mv.row = br; + 
this_mv.col = bc; + bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, 0x7fffffff) + + mvsad_err_cost(&this_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); // Search all possible scales upto the search param around the center point // pick the scale of the point that is best as the starting scale of @@ -778,21 +777,21 @@ static int vp9_pattern_search(MACROBLOCK *x, CHECK_BOUNDS((1 << t)) if (all_in) { for (i = 0; i < num_candidates[t]; i++) { - this_mv.as_mv.row = br + candidates[t][i].row; - this_mv.as_mv.col = bc + candidates[t][i].col; - this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + - this_mv.as_mv.col; + this_mv.row = br + candidates[t][i].row; + this_mv.col = bc + candidates[t][i].col; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER } } else { for (i = 0; i < num_candidates[t]; i++) { - this_mv.as_mv.row = br + candidates[t][i].row; - this_mv.as_mv.col = bc + candidates[t][i].col; + this_mv.row = br + candidates[t][i].row; + this_mv.col = bc + candidates[t][i].col; CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + - this_mv.as_mv.col; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER @@ -822,21 +821,21 @@ static int vp9_pattern_search(MACROBLOCK *x, CHECK_BOUNDS((1 << s)) if (all_in) { for (i = 0; i < num_candidates[s]; i++) { - this_mv.as_mv.row = br + candidates[s][i].row; - this_mv.as_mv.col = bc + candidates[s][i].col; - this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + - this_mv.as_mv.col; + this_mv.row = br + candidates[s][i].row; + this_mv.col = bc + candidates[s][i].col; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, 
bestsad); CHECK_BETTER } } else { for (i = 0; i < num_candidates[s]; i++) { - this_mv.as_mv.row = br + candidates[s][i].row; - this_mv.as_mv.col = bc + candidates[s][i].col; + this_mv.row = br + candidates[s][i].row; + this_mv.col = bc + candidates[s][i].col; CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + - this_mv.as_mv.col; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER @@ -860,25 +859,21 @@ static int vp9_pattern_search(MACROBLOCK *x, get_next_chkpts(next_chkpts_indices, k, num_candidates[s]); if (all_in) { for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { - this_mv.as_mv.row = br + - candidates[s][next_chkpts_indices[i]].row; - this_mv.as_mv.col = bc + - candidates[s][next_chkpts_indices[i]].col; - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + - this_mv.as_mv.col; + this_mv.row = br + candidates[s][next_chkpts_indices[i]].row; + this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col; + this_offset = base_offset + (this_mv.row * (in_what_stride)) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER } } else { for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { - this_mv.as_mv.row = br + - candidates[s][next_chkpts_indices[i]].row; - this_mv.as_mv.col = bc + - candidates[s][next_chkpts_indices[i]].col; + this_mv.row = br + candidates[s][next_chkpts_indices[i]].row; + this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col; CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + - this_mv.as_mv.col; + this_offset = base_offset + (this_mv.row * (in_what_stride)) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER @@ -905,21 +900,21 @@ static int vp9_pattern_search(MACROBLOCK *x, CHECK_BOUNDS(1) if (all_in) { for (i = 0; i < 4; i++) { - this_mv.as_mv.row 
= br + neighbors[i].row; - this_mv.as_mv.col = bc + neighbors[i].col; - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + - this_mv.as_mv.col; + this_mv.row = br + neighbors[i].row; + this_mv.col = bc + neighbors[i].col; + this_offset = base_offset + (this_mv.row * (in_what_stride)) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER } } else { for (i = 0; i < 4; i++) { - this_mv.as_mv.row = br + neighbors[i].row; - this_mv.as_mv.col = bc + neighbors[i].col; + this_mv.row = br + neighbors[i].row; + this_mv.col = bc + neighbors[i].col; CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + - this_mv.as_mv.col; + this_offset = base_offset + (this_mv.row * (in_what_stride)) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER @@ -935,31 +930,32 @@ static int vp9_pattern_search(MACROBLOCK *x, } } - best_mv->as_mv.row = br; - best_mv->as_mv.col = bc; + best_mv->row = br; + best_mv->col = bc; - this_offset = base_offset + (best_mv->as_mv.row * (in_what_stride)) + - best_mv->as_mv.col; - this_mv.as_mv.row = best_mv->as_mv.row * 8; - this_mv.as_mv.col = best_mv->as_mv.col * 8; + this_offset = base_offset + (best_mv->row * in_what_stride) + + best_mv->col; + this_mv.row = best_mv->row * 8; + this_mv.col = best_mv->col * 8; if (bestsad == INT_MAX) return INT_MAX; - return - vfp->vf(what, what_stride, this_offset, in_what_stride, - (unsigned int *)(&bestsad)) + - use_mvcost ? mv_err_cost(&this_mv, center_mv, x->nmvjointcost, x->mvcost, - x->errorperbit) : 0; + + return vfp->vf(what, what_stride, this_offset, in_what_stride, + (unsigned int *)&bestsad) + + use_mvcost ? 
mv_err_cost(&this_mv, center_mv, + x->nmvjointcost, x->mvcost, x->errorperbit) + : 0; } int vp9_hex_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int sad_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, - int_mv *center_mv, int_mv *best_mv) { + const MV *center_mv, MV *best_mv) { // First scale has 8-closest points, the rest have 6 points in hex shape // at increasing scales static const int hex_num_candidates[MAX_PATTERN_SCALES] = { @@ -988,14 +984,14 @@ int vp9_hex_search(MACROBLOCK *x, } int vp9_bigdia_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int sad_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, - int_mv *center_mv, - int_mv *best_mv) { + const MV *center_mv, + MV *best_mv) { // First scale has 4-closest points, the rest have 8 points in diamond // shape at increasing scales static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = { @@ -1022,22 +1018,21 @@ int vp9_bigdia_search(MACROBLOCK *x, {{-512, -512}, {0, -1024}, {512, -512}, {1024, 0}, {512, 512}, {0, 1024}, {-512, 512}, {-1024, 0}}, }; - return - vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, - do_init_search, 0, vfp, use_mvcost, - center_mv, best_mv, - bigdia_num_candidates, bigdia_candidates); + return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, + do_init_search, 0, vfp, use_mvcost, + center_mv, best_mv, + bigdia_num_candidates, bigdia_candidates); } int vp9_square_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int sad_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, - int_mv *center_mv, - int_mv *best_mv) { + const MV *center_mv, + MV *best_mv) { // All scales have 8 closest points in square shape static const int square_num_candidates[MAX_PATTERN_SCALES] = { 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, @@ -1064,11 +1059,10 @@ int vp9_square_search(MACROBLOCK *x, {{-1024, -1024}, {0, -1024}, {1024, -1024}, {1024, 
0}, {1024, 1024}, {0, 1024}, {-1024, 1024}, {-1024, 0}}, }; - return - vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, - do_init_search, 0, vfp, use_mvcost, - center_mv, best_mv, - square_num_candidates, square_candidates); + return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, + do_init_search, 0, vfp, use_mvcost, + center_mv, best_mv, + square_num_candidates, square_candidates); }; #undef CHECK_BOUNDS @@ -1124,10 +1118,9 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, best_address = in_what; // Check the starting position - bestsad = fn_ptr->sdf(what, what_stride, in_what, - in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); // search_param determines the length of the initial step and hence the number of iterations // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc. 
@@ -1153,7 +1146,7 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { @@ -1185,7 +1178,7 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1210,8 +1203,9 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, return INT_MAX; return fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + mv_err_cost(&this_mv, center_mv, mvjcost, - mvcost, x->errorperbit); + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, + mvjcost, mvcost, x->errorperbit); } int vp9_diamond_search_sadx4(MACROBLOCK *x, @@ -1265,10 +1259,9 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, best_address = in_what; // Check the starting position - bestsad = fn_ptr->sdf(what, what_stride, - in_what, in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); // search_param determines the length of the initial step and hence the number of iterations // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc. 
@@ -1303,7 +1296,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, if (sad_array[t] < bestsad) { this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row; this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col; - sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv, + sad_array[t] += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (sad_array[t] < bestsad) { @@ -1327,7 +1320,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { @@ -1358,7 +1351,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1383,8 +1376,9 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, return INT_MAX; return fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + mv_err_cost(&this_mv, - center_mv, mvjcost, mvcost, x->errorperbit); + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, + mvjcost, mvcost, x->errorperbit); } /* do_refine: If last step (1-away) of n-step search doesn't pick the center @@ -1495,8 +1489,8 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv, // Baseline value at the centre bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); // Apply further limits to prevent us looking using vectors that stretch // beyond the UMV 
border @@ -1513,8 +1507,8 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv, thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad); this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1531,10 +1525,10 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv, this_mv.as_mv.col = best_mv->as_mv.col * 8; if (bestsad < INT_MAX) - return - fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, + mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -1585,8 +1579,8 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv, // Baseline value at the centre bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); // Apply further limits to prevent us looking using vectors that stretch // beyond the UMV border @@ -1610,8 +1604,8 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv, if (thissad < bestsad) { this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1631,7 +1625,7 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv, if (thissad < bestsad) { this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, 
&fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { @@ -1652,10 +1646,10 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv, this_mv.as_mv.col = best_mv->as_mv.col * 8; if (bestsad < INT_MAX) - return - fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, + mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -1708,8 +1702,8 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, // Baseline value at the centre bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); // Apply further limits to prevent us looking using vectors that stretch // beyond the UMV border @@ -1733,8 +1727,8 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, if (thissad < bestsad) { this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1759,7 +1753,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, if (thissad < bestsad) { this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { @@ -1780,8 +1774,8 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, if (thissad < bestsad) { this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, 
&fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1800,10 +1794,10 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, this_mv.as_mv.col = best_mv->as_mv.col * 8; if (bestsad < INT_MAX) - return - fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, + mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -1834,8 +1828,10 @@ int vp9_refining_search_sad_c(MACROBLOCK *x, fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) + - mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); + bestsad = fn_ptr->sdf(what, what_stride, best_address, + in_what_stride, 0x7fffffff) + + mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); for (i = 0; i < search_range; i++) { int best_site = -1; @@ -1852,8 +1848,8 @@ int vp9_refining_search_sad_c(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, - mvsadcost, error_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1876,10 +1872,10 @@ int vp9_refining_search_sad_c(MACROBLOCK *x, this_mv.as_mv.col = ref_mv->as_mv.col * 8; if (bestsad < INT_MAX) - return - fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, + (unsigned int 
*)(&thissad)) + + mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, + mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -1911,8 +1907,10 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) + - mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); + bestsad = fn_ptr->sdf(what, what_stride, best_address, + in_what_stride, 0x7fffffff) + + mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); for (i = 0; i < search_range; i++) { int best_site = -1; @@ -1935,8 +1933,8 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, if (sad_array[j] < bestsad) { this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row; this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col; - sad_array[j] += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, - mvsadcost, error_per_bit); + sad_array[j] += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); if (sad_array[j] < bestsad) { bestsad = sad_array[j]; @@ -1957,8 +1955,8 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, - mvsadcost, error_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1982,10 +1980,10 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, this_mv.as_mv.col = ref_mv->as_mv.col * 8; if (bestsad < INT_MAX) - return - fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, + (unsigned int *)(&thissad)) + + 
mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, + mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -2025,7 +2023,8 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, /* Get compound pred by averaging two pred blocks. */ bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride, second_pred, 0x7fffffff) + - mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); + mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); for (i = 0; i < search_range; i++) { int best_site = -1; @@ -2048,9 +2047,8 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, - mvsadcost, error_per_bit); - + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); if (thissad < bestsad) { bestsad = thissad; best_site = j; @@ -2075,10 +2073,10 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, if (bestsad < INT_MAX) { // FIXME(rbultje, yunqing): add full-pixel averaging variance functions // so we don't have to use the subpixel with xoff=0,yoff=0 here. 
- return fn_ptr->svaf(best_address, in_what_stride, 0, 0, - what, what_stride, (unsigned int *)(&thissad), - second_pred) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->svaf(best_address, in_what_stride, 0, 0, what, what_stride, + (unsigned int *)(&thissad), second_pred) + + mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, + mvjcost, mvcost, x->errorperbit); } else { return INT_MAX; } diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 3598fa09a..77c157c5b 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -22,10 +22,14 @@ #define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1) // Maximum size of the first step in full pel units #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) +// Allowed motion vector pixel distance outside image border +// for Block_16x16 +#define BORDER_MV_PIXELS_B16 (16 + VP9_INTERP_EXTEND) + void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv); -int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, - int *mvcost[2], int weight); +int vp9_mv_bit_cost(const MV *mv, const MV *ref, + const int *mvjcost, int *mvcost[2], int weight); void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride); void vp9_init3smotion_compensation(MACROBLOCK *x, int stride); @@ -40,37 +44,36 @@ int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, int_mv *ref_mv, int_mv *dst_mv); int vp9_hex_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int error_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vf, int use_mvcost, - int_mv *center_mv, - int_mv *best_mv); + const MV *center_mv, + MV *best_mv); int vp9_bigdia_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int error_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vf, int use_mvcost, - int_mv *center_mv, - int_mv *best_mv); + const MV *center_mv, + MV *best_mv); int vp9_square_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int 
error_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vf, int use_mvcost, - int_mv *center_mv, - int_mv *best_mv); + const MV *center_mv, + MV *best_mv); typedef int (fractional_mv_step_fp) ( MACROBLOCK *x, - int_mv *bestmv, - int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, // 0 - full, 1 - qtr only, 2 - half only @@ -84,7 +87,7 @@ extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree; typedef int (fractional_mv_step_comp_fp) ( MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, // 0 - full, 1 - qtr only, 2 - half only diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index 9b20dafde..a106014f8 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -61,16 +61,11 @@ #define INTRA_ZBIN_BOOST 0 typedef struct { - nmv_context nmvc; int nmvjointcost[MV_JOINTS]; int nmvcosts[2][MV_VALS]; int nmvcosts_hp[2][MV_VALS]; vp9_prob segment_pred_probs[PREDICTION_PROBS]; - vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; - vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS]; - vp9_prob single_ref_prob[REF_CONTEXTS][2]; - vp9_prob comp_ref_prob[REF_CONTEXTS]; unsigned char *last_frame_seg_map_copy; @@ -79,20 +74,8 @@ typedef struct { // 0 = ZERO_MV, MV signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; - vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES]; - - vp9_prob y_mode_prob[4][INTRA_MODES - 1]; - vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; - vp9_prob partition_prob[2][NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1]; - - vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1] - [SWITCHABLE_FILTERS - 1]; - int inter_mode_counts[INTER_MODE_CONTEXTS][INTER_MODES - 1][2]; - vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; - - struct tx_probs tx_probs; - vp9_prob mbskip_probs[MBSKIP_CONTEXTS]; + FRAME_CONTEXT fc; } CODING_CONTEXT; typedef 
struct { @@ -649,7 +632,7 @@ typedef struct VP9_COMP { unsigned int switchable_interp_count[SWITCHABLE_FILTERS + 1] [SWITCHABLE_FILTERS]; - unsigned int txfm_stepdown_count[TX_SIZES]; + unsigned int tx_stepdown_count[TX_SIZES]; int initial_width; int initial_height; @@ -712,9 +695,8 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x); void vp9_set_speed_features(VP9_COMP *cpi); -extern int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest); +int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); -extern void vp9_alloc_compressor_data(VP9_COMP *cpi); +void vp9_alloc_compressor_data(VP9_COMP *cpi); #endif // VP9_ENCODER_VP9_ONYX_INT_H_ diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 6c8b2a04b..05e893ee9 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -337,10 +337,10 @@ void vp9_frame_init_quantizer(VP9_COMP *cpi) { vp9_mb_init_quantizer(cpi, &cpi->mb); } -void vp9_set_quantizer(struct VP9_COMP *cpi, int Q) { +void vp9_set_quantizer(struct VP9_COMP *cpi, int q) { VP9_COMMON *cm = &cpi->common; - cm->base_qindex = Q; + cm->base_qindex = q; // if any of the delta_q values are changing update flag will // have to be set. 
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index 3229eaad2..3191c49ae 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -30,14 +30,14 @@ void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, int y_blocks); struct VP9_COMP; -extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q); +void vp9_set_quantizer(struct VP9_COMP *cpi, int q); -extern void vp9_frame_init_quantizer(struct VP9_COMP *cpi); +void vp9_frame_init_quantizer(struct VP9_COMP *cpi); -extern void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x); +void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x); -extern void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x); +void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x); -extern void vp9_init_quantizer(struct VP9_COMP *cpi); +void vp9_init_quantizer(struct VP9_COMP *cpi); #endif // VP9_ENCODER_VP9_QUANTIZE_H_ diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 2d12ba94f..bbcad172d 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -76,35 +76,19 @@ void vp9_save_coding_context(VP9_COMP *cpi) { // restored with a call to vp9_restore_coding_context. These functions are // intended for use in a re-code loop in vp9_compress_frame where the // quantizer value is adjusted between loop iterations. 
- - cc->nmvc = cm->fc.nmvc; vp9_copy(cc->nmvjointcost, cpi->mb.nmvjointcost); vp9_copy(cc->nmvcosts, cpi->mb.nmvcosts); vp9_copy(cc->nmvcosts_hp, cpi->mb.nmvcosts_hp); - vp9_copy(cc->inter_mode_probs, cm->fc.inter_mode_probs); - - vp9_copy(cc->y_mode_prob, cm->fc.y_mode_prob); - vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob); - vp9_copy(cc->partition_prob, cm->fc.partition_prob); - vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs); - vp9_copy(cc->intra_inter_prob, cm->fc.intra_inter_prob); - vp9_copy(cc->comp_inter_prob, cm->fc.comp_inter_prob); - vp9_copy(cc->single_ref_prob, cm->fc.single_ref_prob); - vp9_copy(cc->comp_ref_prob, cm->fc.comp_ref_prob); - vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy, cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols)); vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas); vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas); - vp9_copy(cc->coef_probs, cm->fc.coef_probs); - vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob); - cc->tx_probs = cm->fc.tx_probs; - vp9_copy(cc->mbskip_probs, cm->fc.mbskip_probs); + cc->fc = cm->fc; } void vp9_restore_coding_context(VP9_COMP *cpi) { @@ -113,25 +97,12 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { // Restore key state variables to the snapshot state stored in the // previous call to vp9_save_coding_context. 
- - cm->fc.nmvc = cc->nmvc; vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost); vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts); vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp); - vp9_copy(cm->fc.inter_mode_probs, cc->inter_mode_probs); - - vp9_copy(cm->fc.y_mode_prob, cc->y_mode_prob); - vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob); - vp9_copy(cm->fc.partition_prob, cc->partition_prob); - vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs); - vp9_copy(cm->fc.intra_inter_prob, cc->intra_inter_prob); - vp9_copy(cm->fc.comp_inter_prob, cc->comp_inter_prob); - vp9_copy(cm->fc.single_ref_prob, cc->single_ref_prob); - vp9_copy(cm->fc.comp_ref_prob, cc->comp_ref_prob); - vpx_memcpy(cm->last_frame_seg_map, cpi->coding_context.last_frame_seg_map_copy, (cm->mi_rows * cm->mi_cols)); @@ -139,10 +110,7 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas); vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas); - vp9_copy(cm->fc.coef_probs, cc->coef_probs); - vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob); - cm->fc.tx_probs = cc->tx_probs; - vp9_copy(cm->fc.mbskip_probs, cc->mbskip_probs); + cm->fc = cc->fc; } void vp9_setup_key_frame(VP9_COMP *cpi) { diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 473317605..ddda7130c 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -32,8 +32,8 @@ int vp9_pick_frame_size(VP9_COMP *cpi); double vp9_convert_qindex_to_q(int qindex); int vp9_gfboost_qadjust(int qindex); -extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex, - double correction_factor); +int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex, + double correction_factor); void vp9_setup_inter_frame(VP9_COMP *cpi); #endif // VP9_ENCODER_VP9_RATECTRL_H_ diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 3ef3eeeeb..83cd61226 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -186,6 +186,7 @@ void 
vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { // cpi->common.refresh_alt_ref_frame) qindex = clamp(qindex, 0, MAXQ); + cpi->RDDIV = 100; cpi->RDMULT = compute_rd_mult(qindex); if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { if (cpi->twopass.next_iiratio > 31) @@ -204,42 +205,18 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { if (q < 8) q = 8; - if (cpi->RDMULT > 1000) { - cpi->RDDIV = 1; - cpi->RDMULT /= 100; + for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { + for (i = 0; i < MAX_MODES; i++) { + // Threshold here seem unecessarily harsh but fine given actual + // range of values used for cpi->sf.thresh_mult[] + int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]); - for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { - for (i = 0; i < MAX_MODES; ++i) { - // Threshold here seem unecessarily harsh but fine given actual - // range of values used for cpi->sf.thresh_mult[] - int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]); - - // *4 relates to the scaling of rd_thresh_block_size_factor[] - if ((int64_t)cpi->sf.thresh_mult[i] < thresh_max) { - cpi->rd_threshes[bsize][i] = - cpi->sf.thresh_mult[i] * q * - rd_thresh_block_size_factor[bsize] / (4 * 100); - } else { - cpi->rd_threshes[bsize][i] = INT_MAX; - } - } - } - } else { - cpi->RDDIV = 100; - - for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { - for (i = 0; i < MAX_MODES; i++) { - // Threshold here seem unecessarily harsh but fine given actual - // range of values used for cpi->sf.thresh_mult[] - int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]); - - if (cpi->sf.thresh_mult[i] < thresh_max) { - cpi->rd_threshes[bsize][i] = + if (cpi->sf.thresh_mult[i] < thresh_max) { + cpi->rd_threshes[bsize][i] = cpi->sf.thresh_mult[i] * q * rd_thresh_block_size_factor[bsize] / 4; - } else { - cpi->rd_threshes[bsize][i] = INT_MAX; - } + } else { + cpi->rd_threshes[bsize][i] = INT_MAX; } } } @@ -554,9 +531,13 @@ struct rdcost_block_args { TX_SIZE 
tx_size; int bw; int bh; - int rate; - int64_t dist; - int64_t sse; + int rate[256]; + int64_t dist[256]; + int64_t sse[256]; + int this_rate; + int64_t this_dist; + int64_t this_sse; + int64_t this_rd; int64_t best_rd; int skip; const int16_t *scan, *nb; @@ -573,17 +554,17 @@ static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) { int shift = args->tx_size == TX_32X32 ? 0 : 2; int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, + args->dist[block] = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse) >> shift; - args->sse += this_sse >> shift; + args->sse[block] = this_sse >> shift; if (x->skip_encode && xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // TODO(jingning): tune the model to better capture the distortion. int64_t p = (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >> shift; - args->dist += p; - args->sse += p; + args->dist[block] = p; + args->sse[block] = p; } } @@ -594,10 +575,10 @@ static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize, int x_idx, y_idx; txfrm_block_to_raster_xy(plane_bsize, args->tx_size, block, &x_idx, &y_idx); - args->rate += cost_coeffs(args->x, plane, block, - args->t_above + x_idx, - args->t_left + y_idx, args->tx_size, - args->scan, args->nb); + args->rate[block] = cost_coeffs(args->x, plane, block, + args->t_above + x_idx, + args->t_left + y_idx, args->tx_size, + args->scan, args->nb); } static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, @@ -610,16 +591,6 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, if (args->skip) return; - rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist); - rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse); - rd = MIN(rd1, rd2); - if (rd > args->best_rd) { - args->skip = 1; - args->rate = INT_MAX; - args->dist = INT64_MAX; - args->sse = INT64_MAX; - return; - } if 
(!is_inter_block(&xd->this_mi->mbmi)) vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &encode_args); @@ -628,6 +599,56 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, dist_block(plane, block, tx_size, args); rate_block(plane, block, plane_bsize, tx_size, args); + rd1 = RDCOST(x->rdmult, x->rddiv, args->rate[block], args->dist[block]); + rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse[block]); + + // TODO(jingning): temporarily enabled only for luma component + rd = MIN(rd1, rd2); + if (plane == 0) + x->zcoeff_blk[tx_size][block] = rd1 > rd2; + + args->this_rate += args->rate[block]; + args->this_dist += args->dist[block]; + args->this_sse += args->sse[block]; + args->this_rd += rd; + + if (args->this_rd > args->best_rd) { + args->skip = 1; + return; + } +} + +void vp9_get_entropy_contexts(TX_SIZE tx_size, + ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16], + const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left, + int num_4x4_w, int num_4x4_h) { + int i; + switch (tx_size) { + case TX_4X4: + vpx_memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); + vpx_memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); + break; + case TX_8X8: + for (i = 0; i < num_4x4_w; i += 2) + t_above[i] = !!*(const uint16_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 2) + t_left[i] = !!*(const uint16_t *)&left[i]; + break; + case TX_16X16: + for (i = 0; i < num_4x4_w; i += 4) + t_above[i] = !!*(const uint32_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 4) + t_left[i] = !!*(const uint32_t *)&left[i]; + break; + case TX_32X32: + for (i = 0; i < num_4x4_w; i += 8) + t_above[i] = !!*(const uint64_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 8) + t_left[i] = !!*(const uint64_t *)&left[i]; + break; + default: + assert(!"Invalid transform size."); + } } static void txfm_rd_in_plane(MACROBLOCK *x, @@ -638,45 +659,33 @@ static void txfm_rd_in_plane(MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; struct macroblockd_plane 
*const pd = &xd->plane[plane]; const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); - const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bs]; - const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bs]; - int i; + const int num_4x4_w = num_4x4_blocks_wide_lookup[bs]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bs]; + struct rdcost_block_args args = { x, { 0 }, { 0 }, tx_size, - num_4x4_blocks_wide, num_4x4_blocks_high, - 0, 0, 0, ref_best_rd, 0 }; + num_4x4_w, num_4x4_h, + { 0 }, { 0 }, { 0 }, + 0, 0, 0, 0, ref_best_rd, 0 }; if (plane == 0) xd->this_mi->mbmi.tx_size = tx_size; + vp9_get_entropy_contexts(tx_size, args.t_above, args.t_left, + pd->above_context, pd->left_context, + num_4x4_w, num_4x4_h); switch (tx_size) { case TX_4X4: - vpx_memcpy(&args.t_above, pd->above_context, - sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide); - vpx_memcpy(&args.t_left, pd->left_context, - sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high); get_scan_nb_4x4(get_tx_type_4x4(pd->plane_type, xd, 0), &args.scan, &args.nb); break; case TX_8X8: - for (i = 0; i < num_4x4_blocks_wide; i += 2) - args.t_above[i] = !!*(uint16_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_blocks_high; i += 2) - args.t_left[i] = !!*(uint16_t *)&pd->left_context[i]; get_scan_nb_8x8(get_tx_type_8x8(pd->plane_type, xd), &args.scan, &args.nb); break; case TX_16X16: - for (i = 0; i < num_4x4_blocks_wide; i += 4) - args.t_above[i] = !!*(uint32_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_blocks_high; i += 4) - args.t_left[i] = !!*(uint32_t *)&pd->left_context[i]; get_scan_nb_16x16(get_tx_type_16x16(pd->plane_type, xd), &args.scan, &args.nb); break; case TX_32X32: - for (i = 0; i < num_4x4_blocks_wide; i += 8) - args.t_above[i] = !!*(uint64_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_blocks_high; i += 8) - args.t_left[i] = !!*(uint64_t *)&pd->left_context[i]; args.scan = vp9_default_scan_32x32; args.nb = vp9_default_scan_32x32_neighbors; break; @@ -685,10 +694,17 @@ static 
void txfm_rd_in_plane(MACROBLOCK *x, } foreach_transformed_block_in_plane(xd, bsize, plane, block_yrd_txfm, &args); - *distortion = args.dist; - *rate = args.rate; - *sse = args.sse; - *skippable = vp9_is_skippable_in_plane(xd, bsize, plane) && (!args.skip); + if (args.skip) { + *rate = INT_MAX; + *distortion = INT64_MAX; + *sse = INT64_MAX; + *skippable = 0; + } else { + *distortion = args.this_dist; + *rate = args.this_rate; + *sse = args.this_sse; + *skippable = vp9_is_skippable_in_plane(xd, bsize, plane); + } } static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, @@ -696,15 +712,15 @@ static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, int *skip, int64_t *sse, int64_t ref_best_rd, BLOCK_SIZE bs) { - const TX_SIZE max_txfm_size = max_txsize_lookup[bs]; + const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; - if (max_txfm_size == TX_32X32 && + if (max_tx_size == TX_32X32 && (cm->tx_mode == ALLOW_32X32 || cm->tx_mode == TX_MODE_SELECT)) { mbmi->tx_size = TX_32X32; - } else if (max_txfm_size >= TX_16X16 && + } else if (max_tx_size >= TX_16X16 && (cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32 || cm->tx_mode == TX_MODE_SELECT)) { @@ -717,7 +733,7 @@ static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size); - cpi->txfm_stepdown_count[0]++; + cpi->tx_stepdown_count[0]++; } static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, @@ -811,15 +827,15 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] && rd[TX_32X32][1] < rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[0]++; + cpi->tx_stepdown_count[0]++; } else if (max_tx_size >= TX_16X16 && rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1]) { - 
cpi->txfm_stepdown_count[max_tx_size - TX_16X16]++; + cpi->tx_stepdown_count[max_tx_size - TX_16X16]++; } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[max_tx_size - TX_8X8]++; + cpi->tx_stepdown_count[max_tx_size - TX_8X8]++; } else { - cpi->txfm_stepdown_count[max_tx_size - TX_4X4]++; + cpi->tx_stepdown_count[max_tx_size - TX_4X4]++; } } @@ -829,7 +845,7 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, int *s, int *skip, int64_t *sse, int64_t ref_best_rd, BLOCK_SIZE bs) { - const TX_SIZE max_txfm_size = max_txsize_lookup[bs]; + const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; @@ -845,9 +861,9 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, // for (n = TX_4X4; n <= max_txfm_size; n++) // r[n][0] = (r[n][0] * scale_r[n]); - for (n = TX_4X4; n <= max_txfm_size; n++) { + for (n = TX_4X4; n <= max_tx_size; n++) { r[n][1] = r[n][0]; - for (m = 0; m <= n - (n == max_txfm_size); m++) { + for (m = 0; m <= n - (n == max_tx_size); m++) { if (m == n) r[n][1] += vp9_cost_zero(tx_probs[m]); else @@ -859,7 +875,7 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, s0 = vp9_cost_bit(skip_prob, 0); s1 = vp9_cost_bit(skip_prob, 1); - for (n = TX_4X4; n <= max_txfm_size; n++) { + for (n = TX_4X4; n <= max_tx_size; n++) { if (s[n]) { rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]); } else { @@ -867,19 +883,19 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]); } } - for (n = TX_4X4; n <= max_txfm_size; n++) { + for (n = TX_4X4; n <= max_tx_size; n++) { rd[n][0] = (int64_t)(scale_rd[n] * rd[n][0]); rd[n][1] = (int64_t)(scale_rd[n] * rd[n][1]); } - if (max_txfm_size == TX_32X32 && + if (max_tx_size == TX_32X32 && (cm->tx_mode == ALLOW_32X32 || (cm->tx_mode == TX_MODE_SELECT 
&& rd[TX_32X32][1] <= rd[TX_16X16][1] && rd[TX_32X32][1] <= rd[TX_8X8][1] && rd[TX_32X32][1] <= rd[TX_4X4][1]))) { mbmi->tx_size = TX_32X32; - } else if (max_txfm_size >= TX_16X16 && + } else if (max_tx_size >= TX_16X16 && (cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32 || (cm->tx_mode == TX_MODE_SELECT && @@ -901,19 +917,19 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size); - if (max_txfm_size == TX_32X32 && + if (max_tx_size == TX_32X32 && rd[TX_32X32][1] <= rd[TX_16X16][1] && rd[TX_32X32][1] <= rd[TX_8X8][1] && rd[TX_32X32][1] <= rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[0]++; - } else if (max_txfm_size >= TX_16X16 && + cpi->tx_stepdown_count[0]++; + } else if (max_tx_size >= TX_16X16 && rd[TX_16X16][1] <= rd[TX_8X8][1] && rd[TX_16X16][1] <= rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++; + cpi->tx_stepdown_count[max_tx_size - TX_16X16]++; } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++; + cpi->tx_stepdown_count[max_tx_size - TX_8X8]++; } else { - cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++; + cpi->tx_stepdown_count[max_tx_size - TX_4X4]++; } } @@ -1058,6 +1074,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { int64_t ssz; const int16_t *scan; + const int16_t *nb; uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride; uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride; @@ -1083,10 +1100,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, x->quantize_b_4x4(x, block, tx_type, 16); } - scan = get_scan_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block)); + get_scan_nb_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block), + &scan, &nb); ratey += cost_coeffs(x, 0, block, - tempa + idx, templ + idy, TX_4X4, scan, - 
vp9_get_coef_neighbors_handle(scan)); + tempa + idx, templ + idy, TX_4X4, scan, nb); distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &ssz) >> 2; if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) @@ -1458,11 +1475,12 @@ static int labels2mode(MACROBLOCK *x, int i, switch (m = this_mode) { case NEWMV: this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int; - thismvcost = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost, - 102); + thismvcost = vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv, + mvjcost, mvcost, 102); if (has_second_rf) { this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int; - thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv, + thismvcost += vp9_mv_bit_cost(&this_second_mv->as_mv, + &second_best_ref_mv->as_mv, mvjcost, mvcost, 102); } break; @@ -1796,20 +1814,23 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // adjust src pointer for this block mi_buf_shift(x, i); if (cpi->sf.search_method == HEX) { - bestsme = vp9_hex_search(x, &mvp_full, + bestsme = vp9_hex_search(x, &mvp_full.as_mv, step_param, sadpb, 1, v_fn_ptr, 1, - bsi->ref_mv, &mode_mv[NEWMV]); + &bsi->ref_mv->as_mv, + &mode_mv[NEWMV].as_mv); } else if (cpi->sf.search_method == SQUARE) { - bestsme = vp9_square_search(x, &mvp_full, + bestsme = vp9_square_search(x, &mvp_full.as_mv, step_param, sadpb, 1, v_fn_ptr, 1, - bsi->ref_mv, &mode_mv[NEWMV]); + &bsi->ref_mv->as_mv, + &mode_mv[NEWMV].as_mv); } else if (cpi->sf.search_method == BIGDIA) { - bestsme = vp9_bigdia_search(x, &mvp_full, + bestsme = vp9_bigdia_search(x, &mvp_full.as_mv, step_param, sadpb, 1, v_fn_ptr, 1, - bsi->ref_mv, &mode_mv[NEWMV]); + &bsi->ref_mv->as_mv, + &mode_mv[NEWMV].as_mv); } else { bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, 0, v_fn_ptr, @@ -1840,8 +1861,10 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int distortion; unsigned int sse; - 
cpi->find_fractional_mv_step(x, &mode_mv[NEWMV], - bsi->ref_mv, x->errorperbit, v_fn_ptr, + cpi->find_fractional_mv_step(x, + &mode_mv[NEWMV].as_mv, + &bsi->ref_mv->as_mv, + x->errorperbit, v_fn_ptr, 0, cpi->sf.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &distortion, &sse); @@ -2220,11 +2243,12 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, ctx->comp_pred_diff = (int)comp_pred_diff[COMP_PREDICTION_ONLY]; ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION]; - // FIXME(rbultje) does this memcpy the whole array? I believe sizeof() - // doesn't actually work this way - memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff)); - memcpy(ctx->best_filter_diff, best_filter_diff, - sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1)); + vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[xd->this_mi->mbmi.tx_size], + sizeof(ctx->zcoeff_blk)); + + vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff)); + vpx_memcpy(ctx->best_filter_diff, best_filter_diff, + sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1)); } static void setup_pred_block(const MACROBLOCKD *xd, @@ -2403,23 +2427,23 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; if (cpi->sf.search_method == HEX) { - bestsme = vp9_hex_search(x, &mvp_full, + bestsme = vp9_hex_search(x, &mvp_full.as_mv, step_param, sadpb, 1, &cpi->fn_ptr[block_size], 1, - &ref_mv, tmp_mv); + &ref_mv.as_mv, &tmp_mv->as_mv); } else if (cpi->sf.search_method == SQUARE) { - bestsme = vp9_square_search(x, &mvp_full, + bestsme = vp9_square_search(x, &mvp_full.as_mv, step_param, sadpb, 1, &cpi->fn_ptr[block_size], 1, - &ref_mv, tmp_mv); + &ref_mv.as_mv, &tmp_mv->as_mv); } else if (cpi->sf.search_method == BIGDIA) { - bestsme = vp9_bigdia_search(x, &mvp_full, + bestsme = vp9_bigdia_search(x, &mvp_full.as_mv, step_param, sadpb, 1, &cpi->fn_ptr[block_size], 1, - &ref_mv, tmp_mv); + &ref_mv.as_mv, &tmp_mv->as_mv); } 
else { bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, 1, @@ -2435,16 +2459,15 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int dis; /* TODO: use dis in distortion calculation later. */ unsigned int sse; - cpi->find_fractional_mv_step(x, tmp_mv, &ref_mv, + cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv, x->errorperbit, &cpi->fn_ptr[block_size], 0, cpi->sf.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &dis, &sse); } - *rate_mv = vp9_mv_bit_cost(tmp_mv, &ref_mv, - x->nmvjointcost, x->mvcost, - 96); + *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv, + x->nmvjointcost, x->mvcost, 96); if (cpi->sf.adaptive_motion_search && cpi->common.show_frame) x->pred_mv[ref].as_int = tmp_mv->as_int; @@ -2570,8 +2593,8 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, unsigned int sse; bestsme = cpi->find_fractional_mv_step_comp( - x, &tmp_mv, - &ref_mv[id], + x, &tmp_mv.as_mv, + &ref_mv[id].as_mv, x->errorperbit, &cpi->fn_ptr[block_size], 0, cpi->sf.subpel_iters_per_step, @@ -2603,11 +2626,11 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[1] = backup_second_yv12[i]; } - *rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]], - &mbmi->ref_mvs[refs[0]][0], + *rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv, + &mbmi->ref_mvs[refs[0]][0].as_mv, x->nmvjointcost, x->mvcost, 96); - *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]], - &mbmi->ref_mvs[refs[1]][0], + *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv, + &mbmi->ref_mvs[refs[1]][0].as_mv, x->nmvjointcost, x->mvcost, 96); vpx_free(second_pred); @@ -2630,7 +2653,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; - const int is_comp_pred = (mbmi->ref_frame[1] > 0); + const int is_comp_pred = has_second_ref(mbmi); const int 
num_refs = is_comp_pred ? 2 : 1; const int this_mode = mbmi->mode; int_mv *frame_mv = mode_mv[this_mode]; @@ -2659,11 +2682,11 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, single_newmv, &rate_mv); } else { - rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]], - &mbmi->ref_mvs[refs[0]][0], + rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv, + &mbmi->ref_mvs[refs[0]][0].as_mv, x->nmvjointcost, x->mvcost, 96); - rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]], - &mbmi->ref_mvs[refs[1]][0], + rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv, + &mbmi->ref_mvs[refs[1]][0].as_mv, x->nmvjointcost, x->mvcost, 96); } if (frame_mv[refs[0]].as_int == INVALID_MV || @@ -3071,8 +3094,12 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0); *returndist = dist_y + dist_uv; if (cpi->sf.tx_size_search_method == USE_FULL_RD) - for (i = 0; i < TX_MODES; i++) - ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode]; + for (i = 0; i < TX_MODES; i++) { + if (tx_cache[i] < INT64_MAX && tx_cache[cm->tx_mode] < INT64_MAX) + ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode]; + else + ctx->tx_rd_diff[i] = 0; + } } ctx->mic = *xd->this_mi; @@ -3139,8 +3166,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, const int bws = num_8x8_blocks_wide_lookup[bsize] / 2; const int bhs = num_8x8_blocks_high_lookup[bsize] / 2; int best_skip2 = 0; + unsigned char best_zcoeff_blk[256] = { 0 }; x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH; + vpx_memset(x->zcoeff_blk, 0, sizeof(x->zcoeff_blk)); + vpx_memset(ctx->zcoeff_blk, 0, sizeof(ctx->zcoeff_blk)); for (i = 0; i < 4; i++) { int j; @@ -3812,6 +3842,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_mbmode = *mbmi; best_skip2 = this_skip2; best_partition = *x->partition_info; + vpx_memcpy(best_zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], + 
sizeof(best_zcoeff_blk)); if (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV) for (i = 0; i < 4; i++) @@ -3993,13 +4025,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (best_mbmode.ref_frame[0] != INTRA_FRAME && best_mbmode.sb_type < BLOCK_8X8) { for (i = 0; i < 4; i++) - xd->this_mi->bmi[i].as_mv[0].as_int = - best_bmodes[i].as_mv[0].as_int; + xd->this_mi->bmi[i].as_mv[0].as_int = best_bmodes[i].as_mv[0].as_int; - if (mbmi->ref_frame[1] > 0) + if (has_second_ref(mbmi)) for (i = 0; i < 4; i++) - xd->this_mi->bmi[i].as_mv[1].as_int = - best_bmodes[i].as_mv[1].as_int; + xd->this_mi->bmi[i].as_mv[1].as_int = best_bmodes[i].as_mv[1].as_int; *x->partition_info = best_partition; @@ -4007,6 +4037,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, mbmi->mv[1].as_int = xd->this_mi->bmi[3].as_mv[1].as_int; } + vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], best_zcoeff_blk, + sizeof(best_zcoeff_blk)); + for (i = 0; i < NB_PREDICTION_TYPES; ++i) { if (best_pred_rd[i] == INT64_MAX) best_pred_diff[i] = INT_MIN; diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index eba7df907..9796c0d7c 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -33,4 +33,9 @@ void vp9_init_me_luts(); void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv); +void vp9_get_entropy_contexts(TX_SIZE tx_size, + ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16], + const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left, + int num_4x4_w, int num_4x4_h); + #endif // VP9_ENCODER_VP9_RDOPT_H_ diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c index 10655e8a7..874b71ab1 100644 --- a/vp9/encoder/vp9_segmentation.c +++ b/vp9/encoder/vp9_segmentation.c @@ -130,6 +130,8 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8, return; segment_id = mi_8x8[0]->mbmi.segment_id; + xd->mi_8x8 = mi_8x8; + xd->this_mi = mi_8x8[0]; set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw); diff --git 
a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 63826eea5..1768b5bed 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -154,10 +154,10 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, // TODO Check that the 16x16 vf & sdf are selected here // Ignore mv costing by sending NULL pointer instead of cost arrays ref_mv = &x->e_mbd.mi_8x8[0]->bmi[0].as_mv[0]; - bestsme = vp9_hex_search(x, &best_ref_mv1_full, + bestsme = vp9_hex_search(x, &best_ref_mv1_full.as_mv, step_param, sadpb, 1, &cpi->fn_ptr[BLOCK_16X16], - 0, &best_ref_mv1, ref_mv); + 0, &best_ref_mv1.as_mv, &ref_mv->as_mv); #if ALT_REF_SUBPEL_ENABLED // Try sub-pixel MC? @@ -166,8 +166,8 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, int distortion; unsigned int sse; // Ignore mv costing by sending NULL pointer instead of cost array - bestsme = cpi->find_fractional_mv_step(x, ref_mv, - &best_ref_mv1, + bestsme = cpi->find_fractional_mv_step(x, &ref_mv->as_mv, + &best_ref_mv1.as_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], 0, cpi->sf.subpel_iters_per_step, diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index a59f6db88..7c14c18aa 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -137,8 +137,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, pt = get_entropy_context(tx_size, pd->above_context + aoff, pd->left_context + loff); - get_scan_and_band(xd, tx_size, type, block, &scan, &band_translate); - nb = vp9_get_coef_neighbors_handle(scan); + get_scan_and_band(xd, tx_size, type, block, &scan, &nb, &band_translate); c = 0; do { const int band = get_coef_band(band_translate, c); diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c index 155ba8a3e..991ef4d29 100644 --- a/vp9/encoder/vp9_variance_c.c +++ b/vp9/encoder/vp9_variance_c.c @@ -8,13 +8,150 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include "./vp9_rtcd.h" -#include "vp9/encoder/vp9_variance.h" -#include "vp9/common/vp9_filter.h" -#include "vp9/common/vp9_subpelvar.h" -#include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_convolve.h" +#include "vp9/common/vp9_filter.h" +#include "vp9/encoder/vp9_variance.h" + +static void variance(const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, + int recon_stride, + int w, + int h, + unsigned int *sse, + int *sum) { + int i, j; + int diff; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + diff = src_ptr[j] - ref_ptr[j]; + *sum += diff; + *sse += diff * diff; + } + + src_ptr += source_stride; + ref_ptr += recon_stride; + } +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_first_pass + * + * INPUTS : uint8_t *src_ptr : Pointer to source block. + * uint32_t src_pixels_per_line : Stride of input block. + * uint32_t pixel_step : Offset between filter input + * samples (see notes). + * uint32_t output_height : Input block height. + * uint32_t output_width : Input block width. + * int32_t *vp9_filter : Array of 2 bi-linear filter + * taps. + * + * OUTPUTS : int32_t *output_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in + * either horizontal or vertical direction to produce the + * filtered output block. Used to implement first-pass + * of 2-D separable filter. + * + * SPECIAL NOTES : Produces int32_t output to retain precision for next pass. + * Two filter taps should sum to VP9_FILTER_WEIGHT. + * pixel_step defines whether the filter is applied + * horizontally (pixel_step=1) or vertically (pixel_step= + * stride). + * It defines the offset required to move from one input + * to the next. 
+ * + ****************************************************************************/ +static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + unsigned int i, j; + + for (i = 0; i < output_height; i++) { + for (j = 0; j < output_width; j++) { + output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + + (int)src_ptr[pixel_step] * vp9_filter[1], + FILTER_BITS); + + src_ptr++; + } + + // Next row... + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_second_pass + * + * INPUTS : int32_t *src_ptr : Pointer to source block. + * uint32_t src_pixels_per_line : Stride of input block. + * uint32_t pixel_step : Offset between filter input + * samples (see notes). + * uint32_t output_height : Input block height. + * uint32_t output_width : Input block width. + * int32_t *vp9_filter : Array of 2 bi-linear filter + * taps. + * + * OUTPUTS : uint16_t *output_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in + * either horizontal or vertical direction to produce the + * filtered output block. Used to implement second-pass + * of 2-D separable filter. + * + * SPECIAL NOTES : Requires 32-bit input as produced by + * filter_block2d_bil_first_pass. + * Two filter taps should sum to VP9_FILTER_WEIGHT. + * pixel_step defines whether the filter is applied + * horizontally (pixel_step=1) or vertically (pixel_step= + * stride). + * It defines the offset required to move from one input + * to the next. 
+ * + ****************************************************************************/ +static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + unsigned int i, j; + + for (i = 0; i < output_height; i++) { + for (j = 0; j < output_width; j++) { + output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + + (int)src_ptr[pixel_step] * vp9_filter[1], + FILTER_BITS); + src_ptr++; + } + + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) { unsigned int i, sum = 0; diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 687fb487c..7d040f7db 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -48,7 +48,6 @@ VP9_COMMON_SRCS-yes += common/vp9_reconintra.h VP9_COMMON_SRCS-yes += common/vp9_rtcd.c VP9_COMMON_SRCS-yes += common/vp9_rtcd_defs.sh VP9_COMMON_SRCS-yes += common/vp9_sadmxn.h -VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h VP9_COMMON_SRCS-yes += common/vp9_scale.h VP9_COMMON_SRCS-yes += common/vp9_scale.c VP9_COMMON_SRCS-yes += common/vp9_seg_common.h @@ -92,7 +91,6 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct32x32_neon.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM) @@ -109,5 +107,6 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht8x8_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += 
common/arm/neon/vp9_copy_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_avg_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_save_reg_neon$(ASM) $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.sh)) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 08a1a8458..157752a86 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -89,7 +89,7 @@ struct vpx_codec_alg_priv { unsigned int fixed_kf_cntr; }; -static const VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) { +static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) { switch (frame) { case VP8_LAST_FRAME: return VP9_LAST_FLAG; diff --git a/vpx_scale/mips/dspr2/yv12extend_dspr2.c b/vpx_scale/mips/dspr2/yv12extend_dspr2.c new file mode 100644 index 000000000..2c5cd1a87 --- /dev/null +++ b/vpx_scale/mips/dspr2/yv12extend_dspr2.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vpx_scale/yv12config.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/vpx_scale.h" + +#if HAVE_DSPR2 +static void extend_plane(uint8_t *const src, int src_stride, + int width, int height, + int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i, j; + uint8_t *left_src, *right_src; + uint8_t *left_dst_start, *right_dst_start; + uint8_t *left_dst, *right_dst; + uint8_t *top_src, *bot_src; + uint8_t *top_dst, *bot_dst; + uint32_t left_pix; + uint32_t right_pix; + uint32_t linesize; + + /* copy the left and right most columns out */ + left_src = src; + right_src = src + width - 1; + left_dst_start = src - extend_left; + right_dst_start = src + width; + + for (i = height; i--; ) { + left_dst = left_dst_start; + right_dst = right_dst_start; + + __asm__ __volatile__ ( + "lb %[left_pix], 0(%[left_src]) \n\t" + "lb %[right_pix], 0(%[right_src]) \n\t" + "replv.qb %[left_pix], %[left_pix] \n\t" + "replv.qb %[right_pix], %[right_pix] \n\t" + + : [left_pix] "=&r" (left_pix), [right_pix] "=&r" (right_pix) + : [left_src] "r" (left_src), [right_src] "r" (right_src) + ); + + for (j = extend_left/4; j--; ) { + __asm__ __volatile__ ( + "sw %[left_pix], 0(%[left_dst]) \n\t" + "sw %[right_pix], 0(%[right_dst]) \n\t" + + : + : [left_dst] "r" (left_dst), [left_pix] "r" (left_pix), + [right_dst] "r" (right_dst), [right_pix] "r" (right_pix) + ); + + left_dst += 4; + right_dst += 4; + } + + for (j = extend_left%4; j--; ) { + __asm__ __volatile__ ( + "sb %[left_pix], 0(%[left_dst]) \n\t" + "sb %[right_pix], 0(%[right_dst]) \n\t" + + : + : [left_dst] "r" (left_dst), [left_pix] "r" (left_pix), + [right_dst] "r" (right_dst), [right_pix] "r" (right_pix) + ); + + left_dst += 1; + right_dst += 1; + } + + left_src += src_stride; + right_src += src_stride; + left_dst_start += src_stride; + right_dst_start += src_stride; + } + + /* Now copy the top and bottom lines into each line of the respective + * 
borders + */ + top_src = src - extend_left; + bot_src = src + src_stride * (height - 1) - extend_left; + top_dst = src + src_stride * (-extend_top) - extend_left; + bot_dst = src + src_stride * (height) - extend_left; + linesize = extend_left + extend_right + width; + + for (i = 0; i < extend_top; i++) { + vpx_memcpy(top_dst, top_src, linesize); + top_dst += src_stride; + } + + for (i = 0; i < extend_bottom; i++) { + vpx_memcpy(bot_dst, bot_src, linesize); + bot_dst += src_stride; + } +} + +static void extend_frame(YV12_BUFFER_CONFIG *const ybf, + int subsampling_x, int subsampling_y, + int ext_size) { + const int c_w = (ybf->y_crop_width + subsampling_x) >> subsampling_x; + const int c_h = (ybf->y_crop_height + subsampling_y) >> subsampling_y; + const int c_et = ext_size >> subsampling_y; + const int c_el = ext_size >> subsampling_x; + const int c_eb = (ext_size + ybf->y_height - ybf->y_crop_height + + subsampling_y) >> subsampling_y; + const int c_er = (ext_size + ybf->y_width - ybf->y_crop_width + + subsampling_x) >> subsampling_x; + + assert(ybf->y_height - ybf->y_crop_height < 16); + assert(ybf->y_width - ybf->y_crop_width < 16); + assert(ybf->y_height - ybf->y_crop_height >= 0); + assert(ybf->y_width - ybf->y_crop_width >= 0); + + extend_plane(ybf->y_buffer, ybf->y_stride, + ybf->y_crop_width, ybf->y_crop_height, + ext_size, ext_size, + ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width); + + extend_plane(ybf->u_buffer, ybf->uv_stride, + c_w, c_h, c_et, c_el, c_eb, c_er); + + extend_plane(ybf->v_buffer, ybf->uv_stride, + c_w, c_h, c_et, c_el, c_eb, c_er); +} + +void vp9_extend_frame_borders_dspr2(YV12_BUFFER_CONFIG *ybf, + int subsampling_x, int subsampling_y) { + extend_frame(ybf, subsampling_x, subsampling_y, ybf->border); +} + +void vp9_extend_frame_inner_borders_dspr2(YV12_BUFFER_CONFIG *ybf, + int subsampling_x, + int subsampling_y) { + const int inner_bw = (ybf->border > VP9INNERBORDERINPIXELS) ? 
+ VP9INNERBORDERINPIXELS : ybf->border; + extend_frame(ybf, subsampling_x, subsampling_y, inner_bw); +} +#endif diff --git a/vpx_scale/vpx_scale.mk b/vpx_scale/vpx_scale.mk index 76c11e792..50d3e9d8e 100644 --- a/vpx_scale/vpx_scale.mk +++ b/vpx_scale/vpx_scale.mk @@ -16,6 +16,9 @@ SCALE_SRCS-$(HAVE_NEON) += arm/neon/vp8_vpxyv12_copysrcframe_func_neon$(ASM) SCALE_SRCS-$(HAVE_NEON) += arm/neon/vp8_vpxyv12_extendframeborders_neon$(ASM) SCALE_SRCS-$(HAVE_NEON) += arm/neon/yv12extend_arm.c +#mips(dspr2) +SCALE_SRCS-$(HAVE_DSPR2) += mips/dspr2/yv12extend_dspr2.c + SCALE_SRCS-no += $(SCALE_SRCS_REMOVE-yes) $(eval $(call asm_offsets_template,\ diff --git a/vpx_scale/vpx_scale_rtcd.sh b/vpx_scale/vpx_scale_rtcd.sh index ea7b0e2e8..a5faf1148 100644 --- a/vpx_scale/vpx_scale_rtcd.sh +++ b/vpx_scale/vpx_scale_rtcd.sh @@ -27,8 +27,8 @@ specialize vpx_yv12_copy_y neon if [ "$CONFIG_VP9" = "yes" ]; then prototype void vp9_extend_frame_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y" - specialize vp9_extend_frame_borders + specialize vp9_extend_frame_borders dspr2 prototype void vp9_extend_frame_inner_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y" - specialize vp9_extend_frame_inner_borders_c + specialize vp9_extend_frame_inner_borders dspr2 fi @@ -725,14 +725,12 @@ write_webm_file_header(EbmlGlobal *glob, { unsigned int pixelWidth = cfg->g_w; unsigned int pixelHeight = cfg->g_h; - float frameRate = (float)fps->num / (float)fps->den; EbmlLoc videoStart; Ebml_StartSubElement(glob, &videoStart, Video); Ebml_SerializeUnsigned(glob, PixelWidth, pixelWidth); Ebml_SerializeUnsigned(glob, PixelHeight, pixelHeight); Ebml_SerializeUnsigned(glob, StereoMode, stereo_fmt); - Ebml_SerializeFloat(glob, FrameRate, frameRate); Ebml_EndSubElement(glob, &videoStart); } Ebml_EndSubElement(glob, &start); /* Track Entry */ |