Diffstat (limited to 'vp9')
78 files changed, 7996 insertions, 1918 deletions
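A scalar sketch of the two inline-assembly idioms that recur throughout the new DSPr2 files below (the helper names are illustrative, not from the patch; the constants assume DCT_CONST_BITS == 14 and DCT_CONST_ROUNDING == 8192 as defined in vp9/common/vp9_idct.h):

#include <stdint.h>

/* mtlo/mthi seed the accumulator with const_2_power_13 (8192), madd/msub
 * accumulate coefficient products, and "extp %[r], $acN, 31" (with the
 * DSPControl position set to 45 by "wrdsp %[pos], 1") extracts bits
 * [45:14] of the 64-bit accumulator -- together this computes
 * dct_const_round_shift(a * c1 - b * c2). */
static int32_t madd_msub_extp(int32_t a, int32_t c1, int32_t b, int32_t c2) {
  int64_t acc = 8192;           /* mtlo %[const_2_power_13]; mthi $zero */
  acc += (int64_t)a * c1;       /* madd */
  acc -= (int64_t)b * c2;       /* msub */
  return (int32_t)(acc >> 14);  /* extp ..., 31 at DSP position 45 */
}

/* In the column passes, the lbu/addi/sra/lbux sequence through cm
 * (vp9_ff_cropTbl) is a table-driven clip_pixel(): cm points into the
 * middle of a clamp table, so reconstruction sums below 0 or above 255
 * still index valid entries. */
static uint8_t recon_clip(uint8_t pixel, int32_t residual, const uint8_t *cm) {
  return cm[pixel + ((residual + 32) >> 6)];  /* addi 32; sra 6; lbux */
}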
diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h index dc88f1603..644264f65 100644 --- a/vp9/common/mips/dspr2/vp9_common_dspr2.h +++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h @@ -81,6 +81,9 @@ static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) { ); } +void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); + void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, diff --git a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c new file mode 100644 index 000000000..1b2f5506a --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c @@ -0,0 +1,1315 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { + int i; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int step1_10, step1_11, step1_12, step1_13; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + + for (i = no_rows; i--; ) { + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 16)); + + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), + [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" 
(step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], %[result4], %[result3] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd $ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), + [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, 
%[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add %[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) \n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_28_64] \n\t" + "msub $ac1, %[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add %[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd 
$ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp %[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), + [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) + : [const_2_power_13] "r" (const_2_power_13), + [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), + [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), + [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), + [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "add %[load5], %[step1_0], %[step1_7] \n\t" + "add %[load5], %[load5], %[step2_12] \n\t" + "add %[load5], %[load5], %[step2_15] \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step2_13] \n\t" + "add %[load6], %[load6], %[step2_14] \n\t" + "sh %[load5], 0(%[output]) \n\t" + "sh %[load6], 32(%[output]) \n\t" + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "add %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + "add %[load6], %[load6], %[step2_11] \n\t" + "sh %[load5], 192(%[output]) \n\t" + "sh %[load6], 224(%[output]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "sub %[load5], %[load5], %[step2_11] \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], %[load6], %[step2_9] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "sh %[load5], 256(%[output]) \n\t" + "sh %[load6], 288(%[output]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub %[load5], %[load5], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_14] \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_15] \n\t" + "sh %[load5], 448(%[output]) \n\t" + "sh %[load6], 480(%[output]) \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6) + : [output] "r" (output), + [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), + [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), + [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), + [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), + [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), + [step2_14] "r" (step2_14), [step2_15] "r" (step2_15) + ); + + __asm__ __volatile__ ( + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sh %[load5], 64(%[output]) \n\t" + "sh %[load6], 96(%[output]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sh %[load5], 128(%[output]) \n\t" + "sh %[load6], 160(%[output]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" 
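+ /* load5 = step1_2 - step1_5 - step1_10 is stored at byte offset 320, i.e. out[10 * 16 + i]: results land 16 int16_t apart while output advances by one per row, so the row pass writes its output transposed for the column pass. */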
+ "sub %[load5], %[load5], %[step1_10] \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sh %[load5], 320(%[output]) \n\t" + "sh %[load6], 352(%[output]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sh %[load5], 384(%[output]) \n\t" + "sh %[load6], 416(%[output]) \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6) + : [output] "r" (output), + [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), + [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), + [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), + [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) + ); + + input += 16; + output += 1; + } +} + +static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int i; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int step1_8, step1_9, step1_10, step1_11; + int step1_12, step1_13, step1_14, step1_15; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + uint8_t *dest_pix; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 16; ++i) { + dest_pix = (dest + i); + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), + [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 
2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], %[result4], %[result3] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd $ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), + [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add 
%[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) \n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_28_64] \n\t" + "msub $ac1, %[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add %[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd $ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], 
%[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp %[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), + [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) + : [const_2_power_13] "r" (const_2_power_13), + [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), + [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), + [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), + [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), + [cospi_16_64] "r" (cospi_16_64) + ); + + step1_8 = step2_8 + step2_11; + step1_9 = step2_9 + step2_10; + step1_14 = step2_13 + step2_14; + step1_15 = step2_12 + step2_15; + + __asm__ __volatile__ ( + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_0], %[step1_7] \n\t" + "add %[load5], %[load5], %[step1_15] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step1_14] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[load5], %[step1_9] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], 
%[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step1_8] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step1_8] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], %[load6], %[step1_9] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" + "sub %[load5], %[load5], %[step1_10] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub %[load5], %[load5], %[step1_14] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step1_15] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" 
(load7), + [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), + [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), + [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), + [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), + [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), + [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), + [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), + [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) + ); + + input += 16; + } +} + +void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // First transform rows + idct16_1d_rows_dspr2(input, out, 16); + + // Then transform columns and add to dest + idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); +} + +static void iadst16_1d(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; + + int x0 = input[15]; + int x1 = input[0]; + int x2 = input[13]; + int x3 = input[2]; + int x4 = input[11]; + int x5 = input[4]; + int x6 = input[9]; + int x7 = input[6]; + int x8 = input[7]; + int x9 = input[8]; + int x10 = input[5]; + int x11 = input[10]; + int x12 = input[3]; + int x13 = input[12]; + int x14 = input[1]; + int x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 + | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] + = output[5] = output[6] = output[7] = output[8] + = output[9] = output[10] = output[11] = output[12] + = output[13] = output[14] = output[15] = 0; + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = dct_const_round_shift(s0 + s8); + x1 = dct_const_round_shift(s1 + s9); + x2 = dct_const_round_shift(s2 + s10); + x3 = dct_const_round_shift(s3 + s11); + x4 = dct_const_round_shift(s4 + s12); + x5 = dct_const_round_shift(s5 + s13); + x6 = dct_const_round_shift(s6 + s14); + x7 = dct_const_round_shift(s7 + s15); + x8 = dct_const_round_shift(s0 - s8); + x9 = dct_const_round_shift(s1 - s9); + x10 = dct_const_round_shift(s2 - s10); + x11 = dct_const_round_shift(s3 - s11); + x12 = dct_const_round_shift(s4 - s12); + x13 = dct_const_round_shift(s5 - s13); + x14 = dct_const_round_shift(s6 - s14); + x15 = dct_const_round_shift(s7 - s15); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = s0 + s4; + x1 = s1 + s5; + x2 = s2 + s6; + x3 = s3 + s7; + x4 = s0 - s4; + x5 = s1 - s5; + x6 = s2 - s6; + x7 = s3 - s7; + x8 = dct_const_round_shift(s8 + s12); + x9 = dct_const_round_shift(s9 + s13); + x10 = dct_const_round_shift(s10 + s14); + x11 = dct_const_round_shift(s11 + s15); + x12 = dct_const_round_shift(s8 - s12); + x13 = dct_const_round_shift(s9 - s13); + x14 = dct_const_round_shift(s10 - s14); + x15 = dct_const_round_shift(s11 - s15); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = dct_const_round_shift(s4 - s6); + x7 = dct_const_round_shift(s5 - s7); + x8 = s8 + s10; + x9 = s9 + s11; + x10 = s8 - s10; + x11 = s9 - s11; + x12 = dct_const_round_shift(s12 + s14); + x13 = dct_const_round_shift(s13 + s15); + x14 = dct_const_round_shift(s12 - s14); + x15 = dct_const_round_shift(s13 - s15); + + // stage 4 + s2 = (- cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (- x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (- x10 + x11); + s14 = (- cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = dct_const_round_shift(s2); + x3 = dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + x10 = dct_const_round_shift(s10); + x11 = dct_const_round_shift(s11); + x14 = dct_const_round_shift(s14); + x15 = dct_const_round_shift(s15); + + output[0] = x0; + output[1] = -x8; + output[2] = x12; + output[3] = -x4; + output[4] = x6; + output[5] = x14; + output[6] = x10; + output[7] = x2; + output[8] = x3; + output[9] = x11; + output[10] = x15; + output[11] = x7; + output[12] = x5; + output[13] = -x13; + output[14] = x9; + output[15] = -x1; +} + +void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, + int pitch, int tx_type) { + int i, j; + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + int16_t *outptr = out; + int16_t temp_out[16]; + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + switch (tx_type) { + case DCT_DCT: // DCT in both horizontal and vertical + idct16_1d_rows_dspr2(input, outptr, 16); + idct16_1d_cols_add_blk_dspr2(out, dest, pitch); + break; + case ADST_DCT: // ADST in vertical, DCT in horizontal + idct16_1d_rows_dspr2(input, outptr, 16); + + outptr = out; + + for (i = 0; i < 16; ++i) { + iadst16_1d(outptr, temp_out); + + for (j = 0; j < 16; ++j) + dest[j * pitch + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * pitch + i]); + outptr += 16; + } + break; + case DCT_ADST: // DCT in vertical, ADST in horizontal + { + int16_t temp_in[16 * 16]; + + for (i = 0; i < 16; ++i) { + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 16)); + + iadst16_1d(input, outptr); + input += 16; + outptr += 16; + } + + for (i = 0; i < 16; ++i) + for (j = 0; j < 16; ++j) + temp_in[j * 16 + i] = out[i * 16 + j]; + + idct16_1d_cols_add_blk_dspr2(temp_in, dest, pitch); + } + break; + case ADST_ADST: // ADST in both directions + { + int16_t temp_in[16]; + + for (i = 0; i < 16; ++i) { + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 16)); + + iadst16_1d(input, outptr); + input += 16; + outptr += 16; + } + + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + iadst16_1d(temp_in, temp_out); + for (j = 0; j < 16; ++j) + dest[j * pitch + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * pitch + i]); + } + } + break; + default: + printf("vp9_iht16x16_256_add_dspr2 : Invalid tx_type\n"); + break; + } +} + +void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + int16_t *outptr = out; + uint32_t i; + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. + idct16_1d_rows_dspr2(input, outptr, 4); + + outptr += 4; + for (i = 0; i < 6; ++i) { + __asm__ __volatile__ ( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 64(%[outptr]) \n\t" + "sw $zero, 96(%[outptr]) \n\t" + "sw $zero, 128(%[outptr]) \n\t" + "sw $zero, 160(%[outptr]) \n\t" + "sw $zero, 192(%[outptr]) \n\t" + "sw $zero, 224(%[outptr]) \n\t" + "sw $zero, 256(%[outptr]) \n\t" + "sw $zero, 288(%[outptr]) \n\t" + "sw $zero, 320(%[outptr]) \n\t" + "sw $zero, 352(%[outptr]) \n\t" + "sw $zero, 384(%[outptr]) \n\t" + "sw $zero, 416(%[outptr]) \n\t" + "sw $zero, 448(%[outptr]) \n\t" + "sw $zero, 480(%[outptr]) \n\t" + + : + : [outptr] "r" (outptr) + ); + + outptr += 2; + } + + // Then transform columns + idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + uint32_t pos = 45; + int32_t out; + int32_t r; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + + : + : [pos] "r" (pos) + ); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__ ( + "addi %[out], %[out], 32 \n\t" + "sra %[a1], %[out], 6 \n\t" + + : [out] "+r" (out), [a1] "=r" (a1) + : + ); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 16; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), +
[vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 16; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c new file mode 100644 index 000000000..5e92db3d2 --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c @@ -0,0 +1,1073 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; + int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26; + int16_t step1_27, step1_28, step1_29, step1_30, step1_31; + int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int16_t step2_28, step2_29, step2_30, step2_31; + int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27; + int16_t step3_28, step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int i, temp21; + uint8_t *dest_pix, *dest_pix1; + const int const_2_power_13 = 8192; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 32; ++i) { + dest_pix = dest + i; + dest_pix1 = dest + i + 31 * dest_stride; + + __asm__ __volatile__ ( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) \n\t" + "lh %[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), 
[load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), + [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), + [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, %[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd $ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), + [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), + [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 
\n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), + [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), + [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], %[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), + [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), + [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], 
%[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), + [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : 
[const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), + [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), + [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), + [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) + : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8), + [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), + [step2_11] "r" (step2_11), [step2_12] "r" (step2_12), + [step2_13] "r" (step2_13), [step2_14] "r" (step2_14), + [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64) + ); + + step2_18 = step1_17 - step1_18; + step2_29 = step1_30 - step1_29; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + + : [step3_18] "=r" (step3_18) + : [const_2_power_13] "r" (const_2_power_13), + [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; + step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_19 = step1_16 - step1_19; + step2_28 = step1_31 - step1_28; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + + : [step3_19] "=r" (step3_19) + : [const_2_power_13] "r" (const_2_power_13), + [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; + step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_16 = step1_16 + step1_19; + step3_17 = step1_17 + step1_18; + step3_30 = step1_29 
+ step1_30; + step3_31 = step1_28 + step1_31; + + step2_20 = step1_23 - step1_20; + step2_27 = step1_24 - step1_27; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" + "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac0, 31 \n\t" + + : [step3_20] "=r" (step3_20) + : [const_2_power_13] "r" (const_2_power_13), + [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; + step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_21 = step1_22 - step1_21; + step2_26 = step1_25 - step1_26; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" + "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac1, 31 \n\t" + + : [step3_21] "=r" (step3_21) + : [const_2_power_13] "r" (const_2_power_13), + [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; + step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_22 = step1_21 + step1_22; + step3_23 = step1_20 + step1_23; + step3_24 = step1_24 + step1_27; + step3_25 = step1_25 + step1_26; + + step2_16 = step3_16 + step3_23; + step2_17 = step3_17 + step3_22; + step2_18 = step3_18 + step3_21; + step2_19 = step3_19 + step3_20; + step2_20 = step3_19 - step3_20; + step2_21 = step3_18 - step3_21; + step2_22 = step3_17 - step3_22; + step2_23 = step3_16 - step3_23; + + step2_24 = step3_31 - step3_24; + step2_25 = step3_30 - step3_25; + step2_26 = step3_29 - step3_26; + step2_27 = step3_28 - step3_27; + step2_28 = step3_28 + step3_27; + step2_29 = step3_29 + step3_26; + step2_30 = step3_30 + step3_25; + step2_31 = step3_31 + step3_24; + + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_24_64] "r" (cospi_24_64), 
[cospi_8_64] "r" (cospi_8_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + step2_0 = step1_0 + step1_7; + step2_1 = step1_1 + step1_6; + step2_2 = step1_2 + step1_5; + step2_3 = step1_3 + step1_4; + step2_4 = step1_3 - step1_4; + step2_5 = step1_2 - step1_5; + step2_6 = step1_1 - step1_6; + step2_7 = step1_0 - step1_7; + + // stage 7 + step1_0 = step2_0 + step3_15; + step1_1 = step2_1 + step3_14; + step1_2 = step2_2 + step3_13; + step1_3 = step2_3 + step3_12; + step1_4 = step2_4 + step3_11; + step1_5 = step2_5 + step3_10; + step1_6 = step2_6 + step3_9; + step1_7 = step2_7 + step3_8; + step1_8 = step2_7 - step3_8; + step1_9 = step2_6 - step3_9; + step1_10 = step2_5 - step3_10; + step1_11 = step2_4 - step3_11; + step1_12 = step2_3 - step3_12; + step1_13 = step2_2 - step3_13; + step1_14 = step2_1 - step3_14; + step1_15 = step2_0 - step3_15; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) + : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20), + [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_20 + step2_27) * cospi_16_64; + step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_26], 
%[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_21], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) + : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26), + [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_21 + step2_26) * cospi_16_64; + step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) + : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25), + [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_22 + step2_25) * cospi_16_64; + step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_23], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) + : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24), + [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_23 + step2_24) * cospi_16_64; + step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_0], %[step2_31] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_1], %[step2_30] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_2], %[step2_29] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_3], %[step2_28] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), + [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), + [step2_28] "r" (step2_28), [step2_29] "r" (step2_29), + [step2_30] "r" (step2_30), [step2_31] "r" (step2_31) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6); + + __asm__ __volatile__ ( + "lbu 
%[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_4], %[step1_27] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_5], %[step1_26] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_6], %[step1_25] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_7], %[step1_24] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), + [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), + [step1_24] "r" (step1_24), [step1_25] "r" (step1_25), + [step1_26] "r" (step1_26), [step1_27] "r" (step1_27) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 
0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_8], %[step1_23] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_9], %[step1_22] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_10], %[step1_21] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_11], %[step1_20] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), + [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), + [step1_20] "r" (step1_20), [step1_21] "r" (step1_21), + [step1_22] "r" (step1_22), [step1_23] "r" (step1_23) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], 
%[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_12], %[step2_19] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_13], %[step2_18] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_14], %[step2_17] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_15], %[step2_16] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), + [step1_14] "r" (step1_14), [step1_15] "r" (step1_15), + [step2_16] "r" (step2_16), [step2_17] "r" (step2_17), + [step2_18] "r" (step2_18), [step2_19] "r" (step2_19) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + 
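+      /* The interleaved blocks above perform the final per-column
+       * reconstruction, dest[j] = dest[j] + ROUND_POWER_OF_TWO(sum, 6)
+       * clamped to [0, 255]: dest_pix walks the top rows downwards (the
+       * addi/sra pair does the rounding in the asm), dest_pix1 walks the
+       * bottom rows upwards over sums pre-rounded in C, and the lbux
+       * lookup through the clip table cm performs the clamp. */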
+ input += 32; + } +} +#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c new file mode 100644 index 000000000..d3aee73cb --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c @@ -0,0 +1,1013 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) { + int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int16_t step1_28, step1_29, step1_30, step1_31; + int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int16_t step2_28, step2_29, step2_30, step2_31; + int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int16_t step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int temp21; + int i; + const int const_2_power_13 = 8192; + const int32_t *input_int; + + for (i = 32; i--; ) { + input_int = (const int32_t *)input; + + if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | + input_int[4] | input_int[5] | input_int[6] | input_int[7] | + input_int[8] | input_int[9] | input_int[10] | input_int[11] | + input_int[12] | input_int[13] | input_int[14] | input_int[15])) { + input += 32; + + __asm__ __volatile__ ( + "sh $zero, 0(%[output]) \n\t" + "sh $zero, 64(%[output]) \n\t" + "sh $zero, 128(%[output]) \n\t" + "sh $zero, 192(%[output]) \n\t" + "sh $zero, 256(%[output]) \n\t" + "sh $zero, 320(%[output]) \n\t" + "sh $zero, 384(%[output]) \n\t" + "sh $zero, 448(%[output]) \n\t" + "sh $zero, 512(%[output]) \n\t" + "sh $zero, 576(%[output]) \n\t" + "sh $zero, 640(%[output]) \n\t" + "sh $zero, 704(%[output]) \n\t" + "sh $zero, 768(%[output]) \n\t" + "sh $zero, 832(%[output]) \n\t" + "sh $zero, 896(%[output]) \n\t" + "sh $zero, 960(%[output]) \n\t" + "sh $zero, 1024(%[output]) \n\t" + "sh $zero, 1088(%[output]) \n\t" + "sh $zero, 1152(%[output]) \n\t" + "sh $zero, 1216(%[output]) \n\t" + "sh $zero, 1280(%[output]) \n\t" + "sh $zero, 1344(%[output]) \n\t" + "sh $zero, 1408(%[output]) \n\t" + "sh $zero, 1472(%[output]) \n\t" + "sh $zero, 1536(%[output]) \n\t" + "sh $zero, 1600(%[output]) \n\t" + "sh $zero, 1664(%[output]) \n\t" + "sh $zero, 1728(%[output]) \n\t" + 
"sh $zero, 1792(%[output]) \n\t" + "sh $zero, 1856(%[output]) \n\t" + "sh $zero, 1920(%[output]) \n\t" + "sh $zero, 1984(%[output]) \n\t" + + : + : [output] "r" (output) + ); + + output += 1; + + continue; + } + + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 32)); + vp9_prefetch_load((const uint8_t *)(input + 48)); + + __asm__ __volatile__ ( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) \n\t" + "lh %[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), + [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), + [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, %[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd $ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], 
%[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), + [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), + [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), + [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), + [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp 
%[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], %[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), + [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), + [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), + [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" 
(cospi_18_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, %[temp0], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] 
"=&r" (temp0), [temp1] "=&r" (temp1), + [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), + [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), + [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), + [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) + : [const_2_power_13] "r" (const_2_power_13), + [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), + [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), + [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), + [step2_14] "r" (step2_14), [step2_15] "r" (step2_15), + [cospi_16_64] "r" (cospi_16_64) + ); + + step2_18 = step1_17 - step1_18; + step2_29 = step1_30 - step1_29; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + + : [step3_18] "=r" (step3_18) + : [const_2_power_13] "r" (const_2_power_13), + [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; + step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_19 = step1_16 - step1_19; + step2_28 = step1_31 - step1_28; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + + : [step3_19] "=r" (step3_19) + : [const_2_power_13] "r" (const_2_power_13), + [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; + step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_16 = step1_16 + step1_19; + step3_17 = step1_17 + step1_18; + step3_30 = step1_29 + step1_30; + step3_31 = step1_28 + step1_31; + + step2_20 = step1_23 - step1_20; + step2_27 = step1_24 - step1_27; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" + "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac0, 31 \n\t" + + : [step3_20] "=r" (step3_20) + : [const_2_power_13] "r" (const_2_power_13), + [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; + step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_21 = step1_22 - step1_21; + step2_26 = step1_25 - step1_26; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" + "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac1, 31 \n\t" + + : [step3_21] "=r" (step3_21) + : [const_2_power_13] "r" (const_2_power_13), + [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; + step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_22 = step1_21 + step1_22; + step3_23 = step1_20 + step1_23; + step3_24 = step1_24 + step1_27; + step3_25 = step1_25 + step1_26; + + step2_16 = step3_16 + step3_23; + step2_17 = step3_17 + step3_22; + step2_18 = step3_18 + step3_21; + step2_19 = step3_19 + step3_20; + step2_20 = step3_19 - step3_20; + step2_21 = step3_18 - step3_21; + 
step2_22 = step3_17 - step3_22; + step2_23 = step3_16 - step3_23; + + step2_24 = step3_31 - step3_24; + step2_25 = step3_30 - step3_25; + step2_26 = step3_29 - step3_26; + step2_27 = step3_28 - step3_27; + step2_28 = step3_28 + step3_27; + step2_29 = step3_29 + step3_26; + step2_30 = step3_30 + step3_25; + step2_31 = step3_31 + step3_24; + + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_16_64] "r" (cospi_16_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + + ); + + __asm__ __volatile__ ( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" 
+ + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + step2_0 = step1_0 + step1_7; + step2_1 = step1_1 + step1_6; + step2_2 = step1_2 + step1_5; + step2_3 = step1_3 + step1_4; + step2_4 = step1_3 - step1_4; + step2_5 = step1_2 - step1_5; + step2_6 = step1_1 - step1_6; + step2_7 = step1_0 - step1_7; + + step1_0 = step2_0 + step3_15; + step1_1 = step2_1 + step3_14; + step1_2 = step2_2 + step3_13; + step1_3 = step2_3 + step3_12; + step1_4 = step2_4 + step3_11; + step1_5 = step2_5 + step3_10; + step1_6 = step2_6 + step3_9; + step1_7 = step2_7 + step3_8; + step1_8 = step2_7 - step3_8; + step1_9 = step2_6 - step3_9; + step1_10 = step2_5 - step3_10; + step1_11 = step2_4 - step3_11; + step1_12 = step2_3 - step3_12; + step1_13 = step2_2 - step3_13; + step1_14 = step2_1 - step3_14; + step1_15 = step2_0 - step3_15; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) + : [const_2_power_13] "r" (const_2_power_13), + [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_20 + step2_27) * cospi_16_64; + step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_21], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) + : [const_2_power_13] "r" (const_2_power_13), + [step2_26] "r" (step2_26), [step2_21] "r" (step2_21), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_21 + step2_26) * cospi_16_64; + step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) + : [const_2_power_13] "r" (const_2_power_13), + [step2_25] "r" (step2_25), [step2_22] "r" (step2_22), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_22 + step2_25) * cospi_16_64; + step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_23], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) + : [const_2_power_13] "r" (const_2_power_13), + [step2_24] "r" (step2_24), [step2_23] "r" (step2_23), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_23 + step2_24) * cospi_16_64; + step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + // final stage + output[0 * 32] = step1_0 + step2_31; + output[1 * 32] = step1_1 + step2_30; + output[2 * 32] = 
step1_2 + step2_29; + output[3 * 32] = step1_3 + step2_28; + output[4 * 32] = step1_4 + step1_27; + output[5 * 32] = step1_5 + step1_26; + output[6 * 32] = step1_6 + step1_25; + output[7 * 32] = step1_7 + step1_24; + output[8 * 32] = step1_8 + step1_23; + output[9 * 32] = step1_9 + step1_22; + output[10 * 32] = step1_10 + step1_21; + output[11 * 32] = step1_11 + step1_20; + output[12 * 32] = step1_12 + step2_19; + output[13 * 32] = step1_13 + step2_18; + output[14 * 32] = step1_14 + step2_17; + output[15 * 32] = step1_15 + step2_16; + output[16 * 32] = step1_15 - step2_16; + output[17 * 32] = step1_14 - step2_17; + output[18 * 32] = step1_13 - step2_18; + output[19 * 32] = step1_12 - step2_19; + output[20 * 32] = step1_11 - step1_20; + output[21 * 32] = step1_10 - step1_21; + output[22 * 32] = step1_9 - step1_22; + output[23 * 32] = step1_8 - step1_23; + output[24 * 32] = step1_7 - step1_24; + output[25 * 32] = step1_6 - step1_25; + output[26 * 32] = step1_5 - step1_26; + output[27 * 32] = step1_4 - step1_27; + output[28 * 32] = step1_3 - step2_28; + output[29 * 32] = step1_2 - step2_29; + output[30 * 32] = step1_1 - step2_30; + output[31 * 32] = step1_0 - step2_31; + + input += 32; + output += 1; + } +} + +void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // Rows + idct32_1d_rows_dspr2(input, outptr); + + // Columns + vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + int r, out; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + + : + : [pos] "r" (pos) + ); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__ ( + "addi %[out], %[out], 32 \n\t" + "sra %[a1], %[out], 6 \n\t" + + : [out] "+r" (out), [a1] "=r" (a1) + : + ); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 32; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r" 
(t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [stride] "r" (stride), [vector_a1] "r" (vector_a1) + ); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 32; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [stride] "r" (stride), [vector_a1] "r" (vector_a1) + ); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c new file mode 100644 index 000000000..5b7aa5e71 --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) { + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; + const int const_2_power_13 = 8192; + int i; + + for (i = 4; i--; ) { + __asm__ __volatile__ ( + /* + temp_1 = (input[0] + input[2]) * cospi_16_64; + step_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[2]) * cospi_16_64; + step_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 4(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "extp %[step_0], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "extp %[step_1], $ac1, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + /* + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + step_2 = dct_const_round_shift(temp1); + */ + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "extp %[step_2], $ac0, 31 \n\t" + + /* + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step_3 = dct_const_round_shift(temp2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[step_3], $ac1, 31 \n\t" + + /* + output[0] = step_0 + step_3; + output[4] = step_1 + step_2; + output[8] = step_1 - step_2; + output[12] = step_0 - step_3; + */ + "add %[Temp0], %[step_0], %[step_3] \n\t" + "sh %[Temp0], 0(%[output]) \n\t" + + "add %[Temp1], %[step_1], %[step_2] \n\t" + "sh %[Temp1], 8(%[output]) \n\t" + + "sub %[Temp2], %[step_1], %[step_2] \n\t" + "sh %[Temp2], 16(%[output]) \n\t" + + "sub %[Temp3], %[step_0], %[step_3] \n\t" + "sh %[Temp3], 24(%[output]) \n\t" + + : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [step_0] "=&r" (step_0), [step_1] "=&r" (step_1), + [step_2] "=&r" (step_2), [step_3] "=&r" (step_3), + [output] "+r" (output) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), + [cospi_24_64] "r" (cospi_24_64), + [input] "r" (input) + ); + + input += 4; + output += 1; + } +} + +static void vp9_idct4_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; + const int const_2_power_13 = 8192; + int i; + uint8_t *dest_pix; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 4; ++i) { + dest_pix = (dest + i); + + __asm__ __volatile__ ( + /* + temp_1 = (input[0] + input[2]) * cospi_16_64; + step_0 = 
dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[2]) * cospi_16_64; + step_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 4(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "extp %[step_0], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "extp %[step_1], $ac1, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + /* + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + step_2 = dct_const_round_shift(temp1); + */ + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "extp %[step_2], $ac0, 31 \n\t" + + /* + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step_3 = dct_const_round_shift(temp2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[step_3], $ac1, 31 \n\t" + + /* + output[0] = step_0 + step_3; + output[4] = step_1 + step_2; + output[8] = step_1 - step_2; + output[12] = step_0 - step_3; + */ + "add %[Temp0], %[step_0], %[step_3] \n\t" + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step_1], %[step_2] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step_1], %[step_2] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step_0], %[step_3] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + + : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [step_0] "=&r" (step_0), [step_1] "=&r" (step_1), + [step_2] "=&r" (step_2), [step_3] "=&r" (step_3), + [dest_pix] "+r" (dest_pix) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), + [cospi_24_64] "r" (cospi_24_64), + [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) + ); + + input += 4; + } +} + +void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[4 * 4]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // Rows + vp9_idct4_1d_rows_dspr2(input, outptr); + + // Columns + vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); 
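
A note on the accumulator idiom that all of these DSPr2 transforms share: the "wrdsp %[pos], 1" above writes pos = 45 into the DSPControl pos field, each butterfly preloads an accumulator with const_2_power_13 = 8192 = 1 << (DCT_CONST_BITS - 1) via mtlo, and "extp ..., $acN, 31" then extracts the 32 accumulator bits [45:14], which amounts to an arithmetic right shift by DCT_CONST_BITS = 14. Taken together this is vp9's dct_const_round_shift() done in hardware. A minimal C model of one madd/extp pair, under those assumptions (the helper name is illustrative, not part of the patch):

#include <stdint.h>

/* Models: mtlo 8192; madd $ac, a, coeff; extp dst, $ac, 31 (pos == 45). */
static int32_t acc_round_shift_model(int32_t a, int32_t coeff) {
  int64_t acc = 1 << 13;        /* rounding bias, 1 << (DCT_CONST_BITS - 1) */
  acc += (int64_t)a * coeff;    /* 32x32 -> 64 multiply-accumulate          */
  return (int32_t)(acc >> 14);  /* keep bits [45:14]: dct_const_round_shift */
}
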
+}
+
+void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride) {
+  int a1, absa1;
+  int r;
+  int32_t out;
+  int t2, vector_a1, vector_a;
+  uint32_t pos = 45;
+  int16_t input_dc = input[0];
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+
+    :
+    : [pos] "r" (pos)
+  );
+
+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
+  __asm__ __volatile__ (
+    "addi     %[out],    %[out],    8       \n\t"
+    "sra      %[a1],     %[out],    4       \n\t"
+
+    : [out] "+r" (out), [a1] "=r" (a1)
+    :
+  );
+
+  if (a1 < 0) {
+    /* use quad-byte operations:
+     * input and output memory are four-byte aligned */
+    __asm__ __volatile__ (
+      "abs        %[absa1],     %[a1]         \n\t"
+      "replv.qb   %[vector_a1], %[absa1]      \n\t"
+
+      : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+      : [a1] "r" (a1)
+    );
+
+    for (r = 4; r--;) {
+      __asm__ __volatile__ (
+        "lw           %[t2],        0(%[dest])                      \n\t"
+        "subu_s.qb    %[vector_a],  %[t2],          %[vector_a1]    \n\t"
+        "sw           %[vector_a],  0(%[dest])                      \n\t"
+        "add          %[dest],      %[dest],        %[dest_stride]  \n\t"
+
+        : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+          [dest] "+&r" (dest)
+        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  } else {
+    /* use quad-byte operations:
+     * input and output memory are four-byte aligned */
+    __asm__ __volatile__ (
+      "replv.qb   %[vector_a1], %[a1]     \n\t"
+      : [vector_a1] "=r" (vector_a1)
+      : [a1] "r" (a1)
+    );
+
+    for (r = 4; r--;) {
+      __asm__ __volatile__ (
+        "lw           %[t2],        0(%[dest])                      \n\t"
+        "addu_s.qb    %[vector_a],  %[t2],          %[vector_a1]    \n\t"
+        "sw           %[vector_a],  0(%[dest])                      \n\t"
+        "add          %[dest],      %[dest],        %[dest_stride]  \n\t"
+
+        : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+          [dest] "+&r" (dest)
+        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  }
+}
+
+static void iadst4_1d_dspr2(const int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+  int x0, x1, x2, x3;
+
+  x0 = input[0];
+  x1 = input[1];
+  x2 = input[2];
+  x3 = input[3];
+
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = x0 - x2 + x3;
+
+  x0 = s0 + s3 + s5;
+  x1 = s1 - s4 - s6;
+  x2 = sinpi_3_9 * s7;
+  x3 = s2;
+
+  s0 = x0 + x3;
+  s1 = x1 + x3;
+  s2 = x2;
+  s3 = x0 + x1 - x3;
+
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth is 15b.
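
For reference, the DC-only fast path in vp9_idct4x4_1_add_dspr2 above is the quad-byte form of the scalar code below: replv.qb broadcasts the (absolute) DC offset into all four bytes of a word, and addu_s.qb / subu_s.qb apply it to four pixels at once with unsigned saturation, which is what lets the assembly drop the explicit clip_pixel(). A self-contained sketch, assuming DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64() expands to the same double multiply used by the generic vp9_idct4x4_1_add_c (the _ref name is illustrative):

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static const int cospi_16_64 = 11585;  /* value from vp9_idct.h */

static int dct_const_round_shift(int input) {
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}

static uint8_t clip_pixel(int val) {
  return (uint8_t)(val > 255 ? 255 : val < 0 ? 0 : val);
}

static void idct4x4_1_add_ref(const int16_t *input, uint8_t *dest,
                              int dest_stride) {
  int r, c;
  int out = dct_const_round_shift(input[0] * cospi_16_64);
  int a1;
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 4);         /* the "addi ..., 8; sra ..., 4" pair */
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c)
      dest[c] = clip_pixel(dest[c] + a1);  /* one addu_s.qb handles 4 bytes */
    dest += dest_stride;
  }
}
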
+  output[0] = dct_const_round_shift(s0);
+  output[1] = dct_const_round_shift(s1);
+  output[2] = dct_const_round_shift(s2);
+  output[3] = dct_const_round_shift(s3);
+}
+
+void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride, int tx_type) {
+  int i, j;
+  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+  int16_t *outptr = out;
+  int16_t temp_in[4 * 4], temp_out[4];
+  uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  switch (tx_type) {
+    case DCT_DCT:   // DCT in both horizontal and vertical
+      vp9_idct4_1d_rows_dspr2(input, outptr);
+      vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+      break;
+    case ADST_DCT:  // ADST in vertical, DCT in horizontal
+      vp9_idct4_1d_rows_dspr2(input, outptr);
+
+      outptr = out;
+
+      for (i = 0; i < 4; ++i) {
+        iadst4_1d_dspr2(outptr, temp_out);
+
+        for (j = 0; j < 4; ++j)
+          dest[j * dest_stride + i] =
+              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                         + dest[j * dest_stride + i]);
+
+        outptr += 4;
+      }
+      break;
+    case DCT_ADST:  // DCT in vertical, ADST in horizontal
+      for (i = 0; i < 4; ++i) {
+        iadst4_1d_dspr2(input, outptr);
+        input += 4;
+        outptr += 4;
+      }
+
+      for (i = 0; i < 4; ++i) {
+        for (j = 0; j < 4; ++j) {
+          temp_in[i * 4 + j] = out[j * 4 + i];
+        }
+      }
+      vp9_idct4_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+      break;
+    case ADST_ADST:  // ADST in both directions
+      for (i = 0; i < 4; ++i) {
+        iadst4_1d_dspr2(input, outptr);
+        input += 4;
+        outptr += 4;
+      }
+
+      for (i = 0; i < 4; ++i) {
+        for (j = 0; j < 4; ++j)
+          temp_in[j] = out[j * 4 + i];
+        iadst4_1d_dspr2(temp_in, temp_out);
+
+        for (j = 0; j < 4; ++j)
+          dest[j * dest_stride + i] =
+              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                         + dest[j * dest_stride + i]);
+      }
+      break;
+    default:
+      printf("vp9_iht4x4_16_add_dspr2 : Invalid tx_type\n");
+      break;
+  }
+}
+#endif  // #if HAVE_DSPR2
diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
new file mode 100644
index 000000000..93a08401d
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -0,0 +1,745 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + const int const_2_power_13 = 8192; + int Temp0, Temp1, Temp2, Temp3, Temp4; + int i; + + for (i = no_rows; i--; ) { + __asm__ __volatile__ ( + /* + temp_1 = (input[0] + input[4]) * cospi_16_64; + step2_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[4]) * cospi_16_64; + step2_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 8(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "extp %[Temp4], $ac0, 31 \n\t" + + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + /* + temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; + step2_2 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 4(%[input]) \n\t" + "lh %[Temp1], 12(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "extp %[Temp3], $ac0, 31 \n\t" + + /* + step1_1 = step2_1 + step2_2; + step1_2 = step2_1 - step2_2; + */ + "add %[step1_1], %[Temp2], %[Temp3] \n\t" + "sub %[step1_2], %[Temp2], %[Temp3] \n\t" + + /* + temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; + step2_3 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + /* + step1_0 = step2_0 + step2_3; + step1_3 = step2_0 - step2_3; + */ + "add %[step1_0], %[Temp4], %[Temp1] \n\t" + "sub %[step1_3], %[Temp4], %[Temp1] \n\t" + + /* + temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + step1_4 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 2(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp1], 14(%[input]) \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" + "extp %[step1_4], $ac0, 31 \n\t" + + /* + temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1_7 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" + "extp %[step1_7], $ac1, 31 \n\t" + + /* + temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + step1_5 = dct_const_round_shift(temp_1); + */ + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" + "extp %[step1_5], $ac0, 31 \n\t" + + /* + temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1_6 = dct_const_round_shift(temp_2); + */ + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac1, %[Temp0], 
%[cospi_20_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add %[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "sh %[Temp0], 0(%[output]) \n\t" + "add %[Temp1], %[step1_1], %[step1_6] \n\t" + "sh %[Temp1], 16(%[output]) \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "sh %[Temp0], 32(%[output]) \n\t" + "add %[Temp1], %[step1_3], %[step1_4] \n\t" + "sh %[Temp1], 48(%[output]) \n\t" + + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "sh %[Temp0], 64(%[output]) \n\t" + "sub %[Temp1], %[step1_2], %[step1_5] \n\t" + "sh %[Temp1], 80(%[output]) \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "sh %[Temp0], 96(%[output]) \n\t" + "sub %[Temp1], %[step1_0], %[step1_7] \n\t" + "sh %[Temp1], 112(%[output]) \n\t" + + : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), + [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), + [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), + [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), + [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [Temp4] "=&r" (Temp4) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_24_64] "r" (cospi_24_64), + [output] "r" (output), [input] "r" (input) + ); + + input += 8; + output += 1; + } +} + +static void idct8_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int Temp0, Temp1, Temp2, Temp3; + int i; + const int const_2_power_13 = 8192; + uint8_t *dest_pix; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 8; ++i) { + dest_pix = (dest + i); + + __asm__ __volatile__ ( + /* + temp_1 = (input[0] + input[4]) * cospi_16_64; + step2_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[4]) * cospi_16_64; + step2_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 8(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add 
%[Temp2], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "extp %[step1_6], $ac0, 31 \n\t" + + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + /* + temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; + step2_2 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 4(%[input]) \n\t" + "lh %[Temp1], 12(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "extp %[Temp3], $ac0, 31 \n\t" + + /* + step1_1 = step2_1 + step2_2; + step1_2 = step2_1 - step2_2; + */ + "add %[step1_1], %[Temp2], %[Temp3] \n\t" + "sub %[step1_2], %[Temp2], %[Temp3] \n\t" + + /* + temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; + step2_3 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + /* + step1_0 = step2_0 + step2_3; + step1_3 = step2_0 - step2_3; + */ + "add %[step1_0], %[step1_6], %[Temp1] \n\t" + "sub %[step1_3], %[step1_6], %[Temp1] \n\t" + + /* + temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + step1_4 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 2(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp1], 14(%[input]) \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" + "extp %[step1_4], $ac0, 31 \n\t" + + /* + temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1_7 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" + "extp %[step1_7], $ac1, 31 \n\t" + + /* + temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + step1_5 = dct_const_round_shift(temp_1); + */ + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" + "extp %[step1_5], $ac0, 31 \n\t" + + /* + temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1_6 = dct_const_round_shift(temp_2); + */ + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add %[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp 
%[step1_6], $ac1, 31 \n\t" + + /* add block */ + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_1], %[step1_6] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_3], %[step1_4] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_2], %[step1_5] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_0], %[step1_7] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + + : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), + [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), + [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), + [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), + [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [dest_pix] "+r" (dest_pix) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_24_64] "r" (cospi_24_64), + [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) + ); + + input += 8; + } +} + +void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + uint32_t pos 
= 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // First transform rows
+  idct8_1d_rows_dspr2(input, outptr, 8);
+
+  // Then transform columns and add to dest
+  idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+static void iadst8_1d_dspr2(const int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+  int x0, x1, x2, x3, x4, x5, x6, x7;
+
+  x0 = input[7];
+  x1 = input[0];
+  x2 = input[5];
+  x3 = input[2];
+  x4 = input[3];
+  x5 = input[4];
+  x6 = input[1];
+  x7 = input[6];
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = 0;
+    return;
+  }
+
+  // stage 1
+  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+
+  x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
+  x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
+  x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
+  x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
+  x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
+  x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
+  x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
+  x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
+  x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
+  x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
+  x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
+
+  // stage 3
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
+  x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
+  x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
+  x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
+
+  output[0] = x0;
+  output[1] = -x4;
+  output[2] = x6;
+  output[3] = -x2;
+  output[4] = x3;
+  output[5] = -x7;
+  output[6] = x5;
+  output[7] = -x1;
+}
+
+void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride, int tx_type) {
+  int i, j;
+  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+  int16_t *outptr = out;
+  int16_t temp_in[8 * 8], temp_out[8];
+  uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  switch (tx_type) {
+    case DCT_DCT:   // DCT in both horizontal and vertical
+      idct8_1d_rows_dspr2(input, outptr, 8);
+      idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+      break;
+    case ADST_DCT:  // ADST in vertical, DCT in horizontal
+      idct8_1d_rows_dspr2(input, outptr, 8);
+
+      for (i = 0; i < 8; ++i) {
+        iadst8_1d_dspr2(&out[i * 8], temp_out);
+
+        for (j = 0; j < 8; ++j)
+          dest[j * dest_stride + i] =
+              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                         + dest[j * dest_stride + i]);
+      }
+      break;
+    case DCT_ADST:  // DCT in vertical, ADST in horizontal
+      for (i = 0; i < 8; ++i) {
+        iadst8_1d_dspr2(input, outptr);
+        input += 8;
+        outptr += 8;
+      }
+
+      for (i = 0; i < 8; ++i) {
+        for (j = 0; j < 8; ++j) {
+          temp_in[i * 8 + j] = out[j * 8 + i];
+        }
+      }
+      idct8_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+      break;
+    case ADST_ADST:  // ADST in both directions
+      for (i = 0; i < 8; ++i) {
+        iadst8_1d_dspr2(input, outptr);
+        input += 8;
+        outptr += 8;
+      }
+
+      for (i = 0; i < 8; ++i) {
+        for (j = 0; j < 8; ++j)
+          temp_in[j] = out[j * 8 + i];
+
+        iadst8_1d_dspr2(temp_in, temp_out);
+
+        for (j = 0; j < 8; ++j)
+          dest[j * dest_stride + i] =
+              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                         + dest[j * dest_stride + i]);
+      }
+      break;
+    default:
+      printf("vp9_iht8x8_64_add_dspr2 : Invalid tx_type\n");
+      break;
+  }
+}
+
+void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest,
+                              int dest_stride) {
+  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+  int16_t *outptr = out;
+  uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // First transform rows
+  idct8_1d_rows_dspr2(input, outptr, 4);
+
+  outptr += 4;
+
+  __asm__ __volatile__ (
+      "sw  $zero,   0(%[outptr])  \n\t"
+      "sw  $zero,   4(%[outptr])  \n\t"
+      "sw  $zero,  16(%[outptr])  \n\t"
+      "sw  $zero,  20(%[outptr])  \n\t"
+      "sw  $zero,  32(%[outptr])  \n\t"
+      "sw  $zero,  36(%[outptr])  \n\t"
+      "sw  $zero,  48(%[outptr])  \n\t"
+      "sw  $zero,  52(%[outptr])  \n\t"
+      "sw  $zero,  64(%[outptr])  \n\t"
+      "sw  $zero,  68(%[outptr])  \n\t"
+      "sw  $zero,  80(%[outptr])  \n\t"
+      "sw  $zero,  84(%[outptr])  \n\t"
+      "sw  $zero,  96(%[outptr])  \n\t"
+      "sw  $zero, 100(%[outptr])  \n\t"
+      "sw  $zero, 112(%[outptr])  \n\t"
+      "sw  $zero, 116(%[outptr])  \n\t"
+
+      :
+      : [outptr] "r" (outptr)
+  );
+
+
+  // Then transform columns and add to dest
+  idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride) {
+  uint32_t pos = 45;
+  int32_t out;
+  int32_t r;
+  int32_t a1, absa1;
+  int32_t t1, t2, vector_a1, vector_1, vector_2;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+
+    :
+    : [pos] "r" (pos)
+  );
+
+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+  __asm__ __volatile__ (
+    "addi     %[out],     %[out],     16      \n\t"
+    "sra      %[a1],      %[out],     5       \n\t"
+
+    : [out] "+r" (out), [a1] "=r" (a1)
+    :
+  );
+
+  if (a1 < 0) {
+    /* use quad-byte operations:
+     * input and output memory are four-byte aligned */
+    __asm__ __volatile__ (
+      "abs        %[absa1],     %[a1]         \n\t"
+      "replv.qb   %[vector_a1], %[absa1]      \n\t"
+
+      : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+      : [a1] "r" (a1)
+    );
+
+    for (r = 8; r--;) {
+      __asm__ __volatile__ (
+        "lw           %[t1],        0(%[dest])                      \n\t"
+        "lw           %[t2],        4(%[dest])                      \n\t"
+        "subu_s.qb    %[vector_1],  %[t1],          %[vector_a1]    \n\t"
+        "subu_s.qb    %[vector_2],  %[t2],          %[vector_a1]    \n\t"
+        "sw           %[vector_1],  0(%[dest])                      \n\t"
+        "sw           %[vector_2],  4(%[dest])                      \n\t"
+        "add          %[dest],      %[dest],        %[dest_stride]  \n\t"
+
+        : [t1] "=&r" (t1), [t2] "=&r" (t2),
+          [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+          [dest] "+&r" (dest)
+        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  } else {
+    /* use quad-byte operations:
+     * input and output memory are four-byte aligned */
+    __asm__ __volatile__ (
+      "replv.qb   %[vector_a1], %[a1]     \n\t"
+
+      : [vector_a1] "=r" (vector_a1)
+      : [a1] "r" (a1)
+    );
+
+    for (r = 8; r--;) {
+      __asm__ __volatile__ (
+        "lw           %[t1],        0(%[dest])                      \n\t"
+        "lw           %[t2],        4(%[dest])                      \n\t"
+        "addu_s.qb    %[vector_1],  %[t1],          %[vector_a1]    \n\t"
+        "addu_s.qb    %[vector_2],  %[t2],          %[vector_a1]
\n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [dest] "+r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index 0f50f374d..0d65651f0 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -41,17 +41,12 @@ void vp9_free_frame_buffers(VP9_COMMON *cm) { vpx_free(cm->mip); vpx_free(cm->prev_mip); - vpx_free(cm->above_seg_context); vpx_free(cm->last_frame_seg_map); vpx_free(cm->mi_grid_base); vpx_free(cm->prev_mi_grid_base); - vpx_free(cm->above_context[0]); - for (i = 0; i < MAX_MB_PLANE; i++) - cm->above_context[i] = 0; cm->mip = NULL; cm->prev_mip = NULL; - cm->above_seg_context = NULL; cm->last_frame_seg_map = NULL; cm->mi_grid_base = NULL; cm->prev_mi_grid_base = NULL; @@ -85,7 +80,7 @@ static void setup_mi(VP9_COMMON *cm) { } int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { - int i, mi_cols; + int i; const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); @@ -140,21 +135,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { setup_mi(cm); - // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling - // information is exposed at this level - mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - - // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm - // block where mi unit size is 8x8. - cm->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE * - (2 * mi_cols), 1); - if (!cm->above_context[0]) - goto fail; - - cm->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1); - if (!cm->above_seg_context) - goto fail; - // Create the segmentation map structure and set to 0. cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1); if (!cm->last_frame_seg_map) @@ -186,18 +166,12 @@ void vp9_initialize_common() { } void vp9_update_frame_size(VP9_COMMON *cm) { - int i, mi_cols; const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, MI_SIZE_LOG2); const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, MI_SIZE_LOG2); set_mb_mi(cm, aligned_width, aligned_height); setup_mi(cm); - mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - for (i = 1; i < MAX_MB_PLANE; i++) - cm->above_context[i] = - cm->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols; - // Initialize the previous frame segment map to 0. 
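
Stepping out of the MIPS code for a moment: the vp9_alloccommon.c hunks above, paired with the vp9_blockd.h hunk below, move the above_context / above_seg_context arrays out of VP9_COMMON and into MACROBLOCKD, so the per-block entropy and partition contexts travel with the (per-tile) coding state instead of living in one global allocation. For reference, the allocation being deleted sized two ENTROPY_CONTEXTs per 8x8 mi unit (one per 4x4 transform block) per plane across an SB-aligned row. A sketch, assuming the usual vp9 headers for ENTROPY_CONTEXT, MAX_MB_PLANE and mi_cols_aligned_to_sb() are in scope (the helper name itself is illustrative):

#include <stddef.h>

/* Bytes the old VP9_COMMON above-context row occupied. */
static size_t above_context_bytes(int mi_cols) {
  const int aligned_mi_cols = mi_cols_aligned_to_sb(mi_cols);
  /* 2 contexts per mi unit: one per 4x4 txfm block in an 8x8 mi unit. */
  return sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE * 2 * aligned_mi_cols;
}
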
if (cm->last_frame_seg_map) vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 1a03269fb..d0d485272 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -74,10 +74,6 @@ typedef enum { MB_MODE_COUNT } MB_PREDICTION_MODE; -static INLINE int is_intra_mode(MB_PREDICTION_MODE mode) { - return mode <= TM_PRED; -} - static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { return mode >= NEARESTMV && mode <= NEWMV; } @@ -140,7 +136,7 @@ typedef struct { // Flags used for prediction status of various bit-stream signals unsigned char seg_id_predicted; - INTERPOLATIONFILTERTYPE interp_filter; + INTERPOLATION_TYPE interp_filter; BLOCK_SIZE sb_type; } MB_MODE_INFO; @@ -226,6 +222,13 @@ typedef struct macroblockd { unsigned char ab_index; // index of 4x4 block inside the 8x8 block int q_index; + + /* Y,U,V,(A) */ + ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; + ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; + + PARTITION_CONTEXT *above_seg_context; + PARTITION_CONTEXT left_seg_context[8]; } MACROBLOCKD; @@ -414,7 +417,7 @@ static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, *y = (raster_mb >> tx_cols_log2) << tx_size; } -static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize, +static void extend_for_intra(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize, int plane, int block, TX_SIZE tx_size) { struct macroblockd_plane *const pd = &xd->plane[plane]; uint8_t *const buf = pd->dst.buf; @@ -439,19 +442,22 @@ static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize, } if (xd->mb_to_bottom_edge < 0) { - const int bh = 4 << b_height_log2(plane_bsize); - const int umv_border_start = bh + (xd->mb_to_bottom_edge >> - (3 + pd->subsampling_y)); - int i; - const uint8_t c = buf[(umv_border_start - 1) * stride + x]; - uint8_t *d = &buf[umv_border_start * stride + x]; - - if (y + bh > umv_border_start) - for (i = 0; i < bh; ++i, d += stride) - *d = c; + if (xd->left_available || x >= 0) { + const int bh = 4 << b_height_log2(plane_bsize); + const int umv_border_start = + bh + (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)); + + if (y + bh > umv_border_start) { + const uint8_t c = buf[(umv_border_start - 1) * stride + x]; + uint8_t *d = &buf[umv_border_start * stride + x]; + int i; + for (i = 0; i < bh; ++i, d += stride) + *d = c; + } + } } } -static void set_contexts_on_border(MACROBLOCKD *xd, +static void set_contexts_on_border(const MACROBLOCKD *xd, struct macroblockd_plane *pd, BLOCK_SIZE plane_bsize, int tx_size_in_blocks, int has_eob, @@ -489,7 +495,7 @@ static void set_contexts_on_border(MACROBLOCKD *xd, L[pt] = 0; } -static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd, +static void set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, int aoff, int loff) { ENTROPY_CONTEXT *const A = pd->above_context + aoff; diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h index 3ac192b4a..36d1cdf14 100644 --- a/vp9/common/vp9_common.h +++ b/vp9/common/vp9_common.h @@ -84,9 +84,11 @@ static int get_unsigned_bits(unsigned int num_values) { } while (0) #endif -#define SYNC_CODE_0 0x49 -#define SYNC_CODE_1 0x83 -#define SYNC_CODE_2 0x42 +#define VP9_SYNC_CODE_0 0x49 +#define VP9_SYNC_CODE_1 0x83 +#define VP9_SYNC_CODE_2 0x42 + +#define VP9_FRAME_MARKER 0x2 #endif // VP9_COMMON_VP9_COMMON_H_ diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c index 2640ac72b..d3a867c3f 100644 --- 
a/vp9/common/vp9_entropy.c +++ b/vp9/common/vp9_entropy.c @@ -322,9 +322,8 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size]; unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = cm->counts.eob_branch[tx_size]; - int t, i, j, k, l; + int i, j, k, l, m; unsigned int branch_ct[UNCONSTRAINED_NODES][2]; - vp9_prob coef_probs[UNCONSTRAINED_NODES]; for (i = 0; i < BLOCK_TYPES; ++i) for (j = 0; j < REF_TYPES; ++j) @@ -332,15 +331,14 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { if (l >= 3 && k == 0) continue; - vp9_tree_probs_from_distribution(vp9_coefmodel_tree, coef_probs, - branch_ct, coef_counts[i][j][k][l], - 0); + vp9_tree_probs_from_distribution(vp9_coefmodel_tree, branch_ct, + coef_counts[i][j][k][l], 0); branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0]; - coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]); - for (t = 0; t < UNCONSTRAINED_NODES; ++t) - dst_coef_probs[i][j][k][l][t] = merge_probs( - pre_coef_probs[i][j][k][l][t], coef_probs[t], - branch_ct[t], count_sat, update_factor); + for (m = 0; m < UNCONSTRAINED_NODES; ++m) + dst_coef_probs[i][j][k][l][m] = merge_probs( + pre_coef_probs[i][j][k][l][m], + branch_ct[m], + count_sat, update_factor); } } diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index ec7d09a00..c58e852fe 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -153,8 +153,8 @@ typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS] void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full); -static int get_entropy_context(TX_SIZE tx_size, - ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) { +static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l) { ENTROPY_CONTEXT above_ec = 0, left_ec = 0; switch (tx_size) { @@ -163,16 +163,16 @@ static int get_entropy_context(TX_SIZE tx_size, left_ec = l[0] != 0; break; case TX_8X8: - above_ec = !!*(uint16_t *)a; - left_ec = !!*(uint16_t *)l; + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint16_t *)l; break; case TX_16X16: - above_ec = !!*(uint32_t *)a; - left_ec = !!*(uint32_t *)l; + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint32_t *)l; break; case TX_32X32: - above_ec = !!*(uint64_t *)a; - left_ec = !!*(uint64_t *)l; + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint64_t *)l; break; default: assert(!"Invalid transform size."); diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index 3347b35de..a963d55e6 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -161,51 +161,52 @@ static const vp9_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = { { 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm }; -static const vp9_prob default_partition_probs[FRAME_TYPES][PARTITION_CONTEXTS] +const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] + [PARTITION_TYPES - 1] = { + // 8x8 -> 4x4 + { 158, 97, 94 }, // a/l both not split + { 93, 24, 99 }, // a split, l not split + { 85, 119, 44 }, // l split, a not split + { 62, 59, 67 }, // a/l both split + // 16x16 -> 8x8 + { 149, 53, 53 }, // a/l both not split + { 94, 20, 48 }, // a split, l not split + { 83, 53, 24 }, // l split, a not split + { 52, 18, 18 }, // a/l both split + // 32x32 -> 16x16 + { 150, 40, 39 }, // a/l both not split + { 78, 12, 26 }, // a split, l not split + { 67, 33, 11 }, // l split, a not 
split + { 24, 7, 5 }, // a/l both split + // 64x64 -> 32x32 + { 174, 35, 49 }, // a/l both not split + { 68, 11, 27 }, // a split, l not split + { 57, 15, 9 }, // l split, a not split + { 12, 3, 3 }, // a/l both split +}; + +static const vp9_prob default_partition_probs[PARTITION_CONTEXTS] [PARTITION_TYPES - 1] = { - { // frame_type = keyframe - // 8x8 -> 4x4 - { 158, 97, 94 }, // a/l both not split - { 93, 24, 99 }, // a split, l not split - { 85, 119, 44 }, // l split, a not split - { 62, 59, 67 }, // a/l both split - // 16x16 -> 8x8 - { 149, 53, 53 }, // a/l both not split - { 94, 20, 48 }, // a split, l not split - { 83, 53, 24 }, // l split, a not split - { 52, 18, 18 }, // a/l both split - // 32x32 -> 16x16 - { 150, 40, 39 }, // a/l both not split - { 78, 12, 26 }, // a split, l not split - { 67, 33, 11 }, // l split, a not split - { 24, 7, 5 }, // a/l both split - // 64x64 -> 32x32 - { 174, 35, 49 }, // a/l both not split - { 68, 11, 27 }, // a split, l not split - { 57, 15, 9 }, // l split, a not split - { 12, 3, 3 }, // a/l both split - }, { // frame_type = interframe - // 8x8 -> 4x4 - { 199, 122, 141 }, // a/l both not split - { 147, 63, 159 }, // a split, l not split - { 148, 133, 118 }, // l split, a not split - { 121, 104, 114 }, // a/l both split - // 16x16 -> 8x8 - { 174, 73, 87 }, // a/l both not split - { 92, 41, 83 }, // a split, l not split - { 82, 99, 50 }, // l split, a not split - { 53, 39, 39 }, // a/l both split - // 32x32 -> 16x16 - { 177, 58, 59 }, // a/l both not split - { 68, 26, 63 }, // a split, l not split - { 52, 79, 25 }, // l split, a not split - { 17, 14, 12 }, // a/l both split - // 64x64 -> 32x32 - { 222, 34, 30 }, // a/l both not split - { 72, 16, 44 }, // a split, l not split - { 58, 32, 12 }, // l split, a not split - { 10, 7, 6 }, // a/l both split - } + // 8x8 -> 4x4 + { 199, 122, 141 }, // a/l both not split + { 147, 63, 159 }, // a split, l not split + { 148, 133, 118 }, // l split, a not split + { 121, 104, 114 }, // a/l both split + // 16x16 -> 8x8 + { 174, 73, 87 }, // a/l both not split + { 92, 41, 83 }, // a split, l not split + { 82, 99, 50 }, // l split, a not split + { 53, 39, 39 }, // a/l both split + // 32x32 -> 16x16 + { 177, 58, 59 }, // a/l both not split + { 68, 26, 63 }, // a split, l not split + { 52, 79, 25 }, // l split, a not split + { 17, 14, 12 }, // a/l both split + // 64x64 -> 32x32 + { 222, 34, 30 }, // a/l both not split + { 72, 16, 44 }, // a split, l not split + { 58, 32, 12 }, // l split, a not split + { 10, 7, 6 }, // a/l both split }; static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS] @@ -309,8 +310,8 @@ static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = { 192, 128, 64 }; -static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTERS+1] - [SWITCHABLE_FILTERS-1] = { +static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS - 1] = { { 235, 162, }, { 36, 255, }, { 34, 3, }, @@ -349,29 +350,15 @@ void vp9_entropy_mode_init() { #define COUNT_SAT 20 #define MAX_UPDATE_FACTOR 128 -static int update_ct(vp9_prob pre_prob, vp9_prob prob, - const unsigned int ct[2]) { - return merge_probs(pre_prob, prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); -} - -static int update_ct2(vp9_prob pre_prob, const unsigned int ct[2]) { - return merge_probs2(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); +static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) { + return merge_probs(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); } -static void 
update_mode_probs(int n_modes, - const vp9_tree_index *tree, - const unsigned int *cnt, - const vp9_prob *pre_probs, vp9_prob *dst_probs, - unsigned int tok0_offset) { -#define MAX_PROBS 32 - vp9_prob probs[MAX_PROBS]; - unsigned int branch_ct[MAX_PROBS][2]; - int t; - - assert(n_modes - 1 < MAX_PROBS); - vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset); - for (t = 0; t < n_modes - 1; ++t) - dst_probs[t] = update_ct(pre_probs[t], probs[t], branch_ct[t]); +static void adapt_probs(const vp9_tree_index *tree, + const vp9_prob *pre_probs, const unsigned int *counts, + unsigned int offset, vp9_prob *probs) { + tree_merge_probs(tree, pre_probs, counts, offset, + COUNT_SAT, MAX_UPDATE_FACTOR, probs); } void vp9_adapt_mode_probs(VP9_COMMON *cm) { @@ -381,46 +368,40 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { const FRAME_COUNTS *counts = &cm->counts; for (i = 0; i < INTRA_INTER_CONTEXTS; i++) - fc->intra_inter_prob[i] = update_ct2(pre_fc->intra_inter_prob[i], + fc->intra_inter_prob[i] = adapt_prob(pre_fc->intra_inter_prob[i], counts->intra_inter[i]); for (i = 0; i < COMP_INTER_CONTEXTS; i++) - fc->comp_inter_prob[i] = update_ct2(pre_fc->comp_inter_prob[i], + fc->comp_inter_prob[i] = adapt_prob(pre_fc->comp_inter_prob[i], counts->comp_inter[i]); for (i = 0; i < REF_CONTEXTS; i++) - fc->comp_ref_prob[i] = update_ct2(pre_fc->comp_ref_prob[i], + fc->comp_ref_prob[i] = adapt_prob(pre_fc->comp_ref_prob[i], counts->comp_ref[i]); for (i = 0; i < REF_CONTEXTS; i++) for (j = 0; j < 2; j++) - fc->single_ref_prob[i][j] = update_ct2(pre_fc->single_ref_prob[i][j], + fc->single_ref_prob[i][j] = adapt_prob(pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]); for (i = 0; i < INTER_MODE_CONTEXTS; i++) - update_mode_probs(INTER_MODES, vp9_inter_mode_tree, - counts->inter_mode[i], pre_fc->inter_mode_probs[i], - fc->inter_mode_probs[i], NEARESTMV); + adapt_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i], + counts->inter_mode[i], NEARESTMV, fc->inter_mode_probs[i]); for (i = 0; i < BLOCK_SIZE_GROUPS; i++) - update_mode_probs(INTRA_MODES, vp9_intra_mode_tree, - counts->y_mode[i], pre_fc->y_mode_prob[i], - fc->y_mode_prob[i], 0); + adapt_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i], + counts->y_mode[i], 0, fc->y_mode_prob[i]); for (i = 0; i < INTRA_MODES; ++i) - update_mode_probs(INTRA_MODES, vp9_intra_mode_tree, - counts->uv_mode[i], pre_fc->uv_mode_prob[i], - fc->uv_mode_prob[i], 0); + adapt_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i], + counts->uv_mode[i], 0, fc->uv_mode_prob[i]); for (i = 0; i < PARTITION_CONTEXTS; i++) - update_mode_probs(PARTITION_TYPES, vp9_partition_tree, - counts->partition[i], - pre_fc->partition_prob[INTER_FRAME][i], - fc->partition_prob[INTER_FRAME][i], 0); + adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i], + counts->partition[i], 0, fc->partition_prob[i]); if (cm->mcomp_filter_type == SWITCHABLE) { - for (i = 0; i <= SWITCHABLE_FILTERS; i++) - update_mode_probs(SWITCHABLE_FILTERS, vp9_switchable_interp_tree, - counts->switchable_interp[i], - pre_fc->switchable_interp_prob[i], - fc->switchable_interp_prob[i], 0); + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) + adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i], + counts->switchable_interp[i], 0, + fc->switchable_interp_prob[i]); } if (cm->tx_mode == TX_MODE_SELECT) { @@ -432,23 +413,23 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p); for (j = 0; j < 
TX_SIZES - 3; ++j) - fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j], + fc->tx_probs.p8x8[i][j] = adapt_prob(pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]); tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p); for (j = 0; j < TX_SIZES - 2; ++j) - fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j], + fc->tx_probs.p16x16[i][j] = adapt_prob(pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]); tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p); for (j = 0; j < TX_SIZES - 1; ++j) - fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j], + fc->tx_probs.p32x32[i][j] = adapt_prob(pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]); } } for (i = 0; i < MBSKIP_CONTEXTS; ++i) - fc->mbskip_probs[i] = update_ct2(pre_fc->mbskip_probs[i], + fc->mbskip_probs[i] = adapt_prob(pre_fc->mbskip_probs[i], counts->mbskip[i]); } diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h index ab37b75c6..38b419948 100644 --- a/vp9/common/vp9_entropymode.h +++ b/vp9/common/vp9_entropymode.h @@ -16,6 +16,7 @@ #define TX_SIZE_CONTEXTS 2 #define SWITCHABLE_FILTERS 3 // number of switchable filters +#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) // #define MODE_STATS @@ -37,6 +38,9 @@ extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES] [INTRA_MODES - 1]; +extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] + [PARTITION_TYPES - 1]; + extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)]; extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c index f70b571ef..b061cdb38 100644 --- a/vp9/common/vp9_entropymv.c +++ b/vp9/common/vp9_entropymv.c @@ -191,60 +191,47 @@ void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) { } static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) { - return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR); + return merge_probs(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR); } -static unsigned int adapt_probs(unsigned int i, - vp9_tree tree, - vp9_prob this_probs[], - const vp9_prob last_probs[], - const unsigned int num_events[]) { - const unsigned int left = tree[i] <= 0 - ? num_events[-tree[i]] - : adapt_probs(tree[i], tree, this_probs, last_probs, num_events); - - const unsigned int right = tree[i + 1] <= 0 - ? 
num_events[-tree[i + 1]] - : adapt_probs(tree[i + 1], tree, this_probs, last_probs, num_events); - const unsigned int ct[2] = { left, right }; - this_probs[i >> 1] = adapt_prob(last_probs[i >> 1], ct); - return left + right; +static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, + const unsigned int *counts, vp9_prob *probs) { + tree_merge_probs(tree, pre_probs, counts, 0, + MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR, probs); } - void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { int i, j; - const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; - - nmv_context *ctx = &cm->fc.nmvc; - const nmv_context *pre_ctx = &pre_fc->nmvc; - const nmv_context_counts *cts = &cm->counts.mv; + nmv_context *fc = &cm->fc.nmvc; + const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc; + const nmv_context_counts *counts = &cm->counts.mv; - adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints); + adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, + fc->joints); for (i = 0; i < 2; ++i) { - ctx->comps[i].sign = adapt_prob(pre_ctx->comps[i].sign, cts->comps[i].sign); - adapt_probs(0, vp9_mv_class_tree, ctx->comps[i].classes, - pre_ctx->comps[i].classes, cts->comps[i].classes); - adapt_probs(0, vp9_mv_class0_tree, ctx->comps[i].class0, - pre_ctx->comps[i].class0, cts->comps[i].class0); + nmv_component *comp = &fc->comps[i]; + const nmv_component *pre_comp = &pre_fc->comps[i]; + const nmv_component_counts *c = &counts->comps[i]; + + comp->sign = adapt_prob(pre_comp->sign, c->sign); + adapt_probs(vp9_mv_class_tree, pre_comp->classes, c->classes, + comp->classes); + adapt_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0, comp->class0); for (j = 0; j < MV_OFFSET_BITS; ++j) - ctx->comps[i].bits[j] = adapt_prob(pre_ctx->comps[i].bits[j], - cts->comps[i].bits[j]); + comp->bits[j] = adapt_prob(pre_comp->bits[j], c->bits[j]); for (j = 0; j < CLASS0_SIZE; ++j) - adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].class0_fp[j], - pre_ctx->comps[i].class0_fp[j], cts->comps[i].class0_fp[j]); + adapt_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j], c->class0_fp[j], + comp->class0_fp[j]); - adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].fp, pre_ctx->comps[i].fp, - cts->comps[i].fp); + adapt_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp); if (allow_hp) { - ctx->comps[i].class0_hp = adapt_prob(pre_ctx->comps[i].class0_hp, - cts->comps[i].class0_hp); - ctx->comps[i].hp = adapt_prob(pre_ctx->comps[i].hp, cts->comps[i].hp); + comp->class0_hp = adapt_prob(pre_comp->class0_hp, c->class0_hp); + comp->hp = adapt_prob(pre_comp->hp, c->hp); } } } diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h index 768ff2c94..1651b9050 100644 --- a/vp9/common/vp9_enums.h +++ b/vp9/common/vp9_enums.h @@ -76,4 +76,15 @@ typedef enum { ADST_ADST = 3 // ADST in both directions } TX_TYPE; +typedef enum { + UNKNOWN = 0, + BT_601 = 1, // YUV + BT_709 = 2, // YUV + SMPTE_170 = 3, // YUV + SMPTE_240 = 4, // YUV + RESERVED_1 = 5, + RESERVED_2 = 6, + SRGB = 7 // RGB +} COLOR_SPACE; + #endif // VP9_COMMON_VP9_ENUMS_H_ diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c index cedd44cad..79ace147c 100644 --- a/vp9/common/vp9_filter.c +++ b/vp9/common/vp9_filter.c @@ -97,19 +97,15 @@ DECLARE_ALIGNED(256, const subpel_kernel, { 0, -3, 1, 38, 64, 32, -1, -3} }; -const subpel_kernel *vp9_get_filter_kernel(INTERPOLATIONFILTERTYPE type) { - switch (type) { - case EIGHTTAP: - return vp9_sub_pel_filters_8; - case EIGHTTAP_SMOOTH: - return 
vp9_sub_pel_filters_8lp; - case EIGHTTAP_SHARP: - return vp9_sub_pel_filters_8s; - case BILINEAR: - return vp9_bilinear_filters; - default: - assert(!"Invalid filter type."); - return NULL; - } + +static const subpel_kernel* vp9_filter_kernels[4] = { + vp9_sub_pel_filters_8, + vp9_sub_pel_filters_8lp, + vp9_sub_pel_filters_8s, + vp9_bilinear_filters +}; + +const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type) { + return vp9_filter_kernels[type]; } diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h index 302945374..b1e7e6499 100644 --- a/vp9/common/vp9_filter.h +++ b/vp9/common/vp9_filter.h @@ -27,7 +27,7 @@ typedef enum { EIGHTTAP_SHARP = 2, BILINEAR = 3, SWITCHABLE = 4 /* should be the last one */ -} INTERPOLATIONFILTERTYPE; +} INTERPOLATION_TYPE; typedef int16_t subpel_kernel[SUBPEL_TAPS]; @@ -36,10 +36,9 @@ struct subpix_fn_table { const subpel_kernel *filter_y; }; -const subpel_kernel *vp9_get_filter_kernel(INTERPOLATIONFILTERTYPE type); +const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type); extern const subpel_kernel vp9_bilinear_filters[SUBPEL_SHIFTS]; -extern const subpel_kernel vp9_sub_pel_filters_6[SUBPEL_SHIFTS]; extern const subpel_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS]; extern const subpel_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]; extern const subpel_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]; diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c index 592ef6afa..b91c50143 100644 --- a/vp9/common/vp9_findnearmv.c +++ b/vp9/common/vp9_findnearmv.c @@ -35,6 +35,7 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, } void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *const tile, int_mv *dst_nearest, int_mv *dst_near, int block_idx, int ref_idx, @@ -46,7 +47,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, assert(ref_idx == 0 || ref_idx == 1); assert(MAX_MV_REF_CANDIDATES == 2); // makes code here slightly easier - vp9_find_mv_refs_idx(cm, xd, mi, xd->last_mi, + vp9_find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi, mi->mbmi.ref_frame[ref_idx], mv_list, block_idx, mi_row, mi_col); diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h index d161d1b3e..2362caa41 100644 --- a/vp9/common/vp9_findnearmv.h +++ b/vp9/common/vp9_findnearmv.h @@ -34,8 +34,8 @@ static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); } -void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, - MACROBLOCKD *xd, +void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *const tile, int_mv *dst_nearest, int_mv *dst_near, int block_idx, int ref_idx, @@ -43,42 +43,30 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mi, const MODE_INFO *left_mi, int b) { - // FIXME(rbultje, jingning): temporary hack because jenkins doesn't - // understand this condition. This will go away soon. - if (b == 0 || b == 2) { - /* On L edge, get from MB to left of us */ - if (!left_mi) + if (!left_mi || is_inter_block(&left_mi->mbmi)) return DC_PRED; - if (is_inter_block(&left_mi->mbmi)) - return DC_PRED; - else - return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode - : left_mi->mbmi.mode; + return left_mi->mbmi.sb_type < BLOCK_8X8 ? 
left_mi->bmi[b + 1].as_mode + : left_mi->mbmi.mode; + } else { + assert(b == 1 || b == 3); + return cur_mi->bmi[b - 1].as_mode; } - assert(b == 1 || b == 3); - return cur_mi->bmi[b - 1].as_mode; } -static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, - const MODE_INFO *above_mb, int b) { - const MODE_INFO *mi = cur_mb; - - if (!(b >> 1)) { - /* On top edge, get from MB above us */ - mi = above_mb; - if (!mi) +static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *above_mi, int b) { + if (b == 0 || b == 1) { + if (!above_mi || is_inter_block(&above_mi->mbmi)) return DC_PRED; - if (is_inter_block(&mi->mbmi)) - return DC_PRED; - else - return mi->mbmi.sb_type < BLOCK_8X8 ? (mi->bmi + 2 + b)->as_mode - : mi->mbmi.mode; + return above_mi->mbmi.sb_type < BLOCK_8X8 ? above_mi->bmi[b + 2].as_mode + : above_mi->mbmi.mode; + } else { + assert(b == 2 || b == 3); + return cur_mi->bmi[b - 2].as_mode; } - - return (mi->bmi + b - 2)->as_mode; } #endif // VP9_COMMON_VP9_FINDNEARMV_H_ diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 52b039d99..ea8683ea1 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -1280,6 +1280,31 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { } } +void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) { + int16_t out[32 * 32] = {0}; + int16_t *outptr = out; + int i, j; + int16_t temp_in[32], temp_out[32]; + + // Rows + // only upper-left 8x8 has non-zero coeff + for (i = 0; i < 8; ++i) { + idct32_1d(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + idct32_1d(temp_in, temp_out); + for (j = 0; j < 32; ++j) + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * stride + i]); + } +} + void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) { int i, j; int a1; @@ -1350,6 +1375,9 @@ void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride, if (eob) { if (eob == 1) vp9_idct32x32_1_add(input, dest, stride); + else if (eob <= 34) + // non-zero coeff only in upper-left 8x8 + vp9_idct32x32_34_add(input, dest, stride); else vp9_idct32x32_1024_add(input, dest, stride); } diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 85ac6d2bf..218e12e62 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -16,12 +16,6 @@ #include "vp9/common/vp9_seg_common.h" -struct loop_filter_info { - const uint8_t *mblim; - const uint8_t *lim; - const uint8_t *hev_thr; -}; - // This structure holds bit masks for all 8x8 blocks in a 64x64 region. // Each 1 bit represents a position in which we want to apply the loop filter. 
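To make the mask layout described above concrete: a 64x64 superblock is an 8x8 grid of 8x8 blocks, so a single 64-bit word can flag every block position, and the filter pass simply walks the set bits. A minimal sketch of consuming such a row-major mask follows; the callback shape is illustrative only, not an actual libvpx API:

#include <stdint.h>

/* Invoke fn for every 8x8 block whose bit is set; bit (r * 8 + c)
 * corresponds to the block at row r, column c of the superblock. */
static void for_each_set_block(uint64_t mask,
                               void (*fn)(int r, int c, void *ctx),
                               void *ctx) {
  int bit;
  for (bit = 0; mask; ++bit, mask >>= 1) {
    if (mask & 1)
      fn(bit >> 3, bit & 7, ctx);  /* row = bit / 8, col = bit % 8 */
  }
}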
// Left_ entries refer to whether we apply a filter on the border to the @@ -259,8 +253,8 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { if (block_inside_limit < 1) block_inside_limit = 1; - vpx_memset(lfi->lim[lvl], block_inside_limit, SIMD_WIDTH); - vpx_memset(lfi->mblim[lvl], (2 * (lvl + 2) + block_inside_limit), + vpx_memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH); + vpx_memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit), SIMD_WIDTH); } } @@ -268,7 +262,7 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { void vp9_loop_filter_init(VP9_COMMON *cm) { loop_filter_info_n *lfi = &cm->lf_info; struct loopfilter *lf = &cm->lf; - int i; + int lvl; // init limits for given sharpness update_sharpness(lfi, lf->sharpness_level); @@ -278,8 +272,8 @@ void vp9_loop_filter_init(VP9_COMMON *cm) { lf_init_lut(lfi); // init hev threshold const vectors - for (i = 0; i < 4; i++) - vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH); + for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) + vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH); } void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { @@ -330,16 +324,14 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { static int build_lfi(const loop_filter_info_n *lfi_n, const MB_MODE_INFO *mbmi, - struct loop_filter_info *lfi) { + const loop_filter_thresh **lfi) { const int seg = mbmi->segment_id; const int ref = mbmi->ref_frame[0]; const int mode = lfi_n->mode_lf_lut[mbmi->mode]; const int filter_level = lfi_n->lvl[seg][ref][mode]; if (filter_level > 0) { - lfi->mblim = lfi_n->mblim[filter_level]; - lfi->lim = lfi_n->lim[filter_level]; - lfi->hev_thr = lfi_n->hev_thr[filter_level >> 4]; + *lfi = &lfi_n->lfthr[filter_level]; return 1; } else { return 0; @@ -351,11 +343,13 @@ static void filter_selectively_vert(uint8_t *s, int pitch, unsigned int mask_8x8, unsigned int mask_4x4, unsigned int mask_4x4_int, - const struct loop_filter_info *lfi) { + const loop_filter_thresh **p_lfi) { unsigned int mask; for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; mask >>= 1) { + const loop_filter_thresh *lfi = *p_lfi; + if (mask & 1) { if (mask_16x16 & 1) { vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim, @@ -379,7 +373,7 @@ static void filter_selectively_vert(uint8_t *s, int pitch, vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); s += 8; - lfi++; + p_lfi++; mask_16x16 >>= 1; mask_8x8 >>= 1; mask_4x4 >>= 1; @@ -393,12 +387,14 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, unsigned int mask_4x4, unsigned int mask_4x4_int, int only_4x4_1, - const struct loop_filter_info *lfi) { + const loop_filter_thresh **p_lfi) { unsigned int mask; int count; for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; mask >>= count) { + const loop_filter_thresh *lfi = *p_lfi; + count = 1; if (mask & 1) { if (!only_4x4_1) { @@ -432,7 +428,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, lfi->lim, lfi->hev_thr, 1); } s += 8 * count; - lfi += count; + p_lfi += count; mask_16x16 >>= count; mask_8x8 >>= count; mask_4x4 >>= count; @@ -805,7 +801,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm, unsigned int mask_8x8[MI_BLOCK_SIZE] = {0}; unsigned int mask_4x4[MI_BLOCK_SIZE] = {0}; unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0}; - struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; + const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; int r, c; for (r = 
0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { @@ -834,7 +830,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm, const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; // Filter level can vary per MI - if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x))) + if (!build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x])) continue; // Build masks based on the transform size of each block @@ -925,7 +921,7 @@ static void filter_block_plane(VP9_COMMON *const cm, struct buf_2d *const dst = &plane->dst; uint8_t* const dst0 = dst->buf; unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0}; - struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; + const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; int r, c; int row_shift = 3 - ss_x; int row_mask = 0xff >> (ss_x << 2); @@ -938,8 +934,8 @@ static void filter_block_plane(VP9_COMMON *const cm, // Determine the vertical edges that need filtering for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { const MODE_INFO *mi = mi_8x8[c]; - if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x))) - continue; + + build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]); } if (!plane->plane_type) { mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y); diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h index c698090d8..62389ea5e 100644 --- a/vp9/common/vp9_loopfilter.h +++ b/vp9/common/vp9_loopfilter.h @@ -46,12 +46,13 @@ struct loopfilter { // Need to align this structure so when it is declared and // passed it can be loaded into vector registers. typedef struct { - DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, - mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); - DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, - lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); - DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, - hev_thr[4][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]); +} loop_filter_thresh; + +typedef struct { + loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1]; uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS]; uint8_t mode_lf_lut[MB_MODE_COUNT]; } loop_filter_info_n; diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c index 659079639..8df8aec84 100644 --- a/vp9/common/vp9_mvref_common.c +++ b/vp9/common/vp9_mvref_common.c @@ -170,17 +170,19 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, // Checks that the given mi_row, mi_col and search point // are inside the borders of the tile. -static INLINE int is_inside(const VP9_COMMON *cm, int mi_col, int mi_row, +static INLINE int is_inside(const TileInfo *const tile, + int mi_col, int mi_row, int mi_rows, const MV *mv) { return !(mi_row + mv->row < 0 || - mi_col + mv->col < cm->cur_tile_mi_col_start || - mi_row + mv->row >= cm->mi_rows || - mi_col + mv->col >= cm->cur_tile_mi_col_end); + mi_col + mv->col < tile->mi_col_start || + mi_row + mv->row >= mi_rows || + mi_col + mv->col >= tile->mi_col_end); } // This function searches the neighbourhood of a given MB/SB // to try and find candidate reference vectors. void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, MODE_INFO *mi, const MODE_INFO *prev_mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, @@ -201,7 +203,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, // and we also need to keep a mode count. 
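The is_inside() predicate used by the candidate scans below now takes a TileInfo instead of reading cur_tile_mi_col_* out of VP9_COMMON: rows are clamped against the whole frame while columns are clamped against the tile, because VP9 tiles split the frame column-wise and prediction must not reach across a tile's vertical edges. A worked call, with hypothetical tile geometry and treating the static helper as directly callable:

/* Hypothetical geometry: the tile covers mi columns [256, 512) of a frame
 * 135 MI rows tall. A candidate two MI columns left of mi_col 256 lands at
 * 254: inside the frame, but outside the tile. */
TileInfo tile = { 0, 135, 256, 512 };  /* mi_row_start/end, mi_col_start/end */
MV mv = { 0, -2 };                     /* search offset in MI units */
int usable = is_inside(&tile, 256, 10, 135, &mv);  /* 254 < 256 -> returns 0 */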
for (i = 0; i < 2; ++i) { const MV *const mv_ref = &mv_ref_search[i]; - if (is_inside(cm, mi_col, mi_row, mv_ref)) { + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { const MODE_INFO *const candidate_mi = xd->mi_8x8[mv_ref->col + mv_ref->row * xd->mode_info_stride]; const MB_MODE_INFO *const candidate = &candidate_mi->mbmi; @@ -228,7 +230,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, // mode counts. for (; i < MVREF_NEIGHBOURS; ++i) { const MV *const mv_ref = &mv_ref_search[i]; - if (is_inside(cm, mi_col, mi_row, mv_ref)) { + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col + mv_ref->row * xd->mode_info_stride]->mbmi; @@ -258,7 +260,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, if (different_ref_found) { for (i = 0; i < MVREF_NEIGHBOURS; ++i) { const MV *mv_ref = &mv_ref_search[i]; - if (is_inside(cm, mi_col, mi_row, mv_ref)) { + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col + mv_ref->row * xd->mode_info_stride]->mbmi; diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h index 39ebdb078..ce4c55983 100644 --- a/vp9/common/vp9_mvref_common.h +++ b/vp9/common/vp9_mvref_common.h @@ -15,6 +15,7 @@ #define VP9_COMMON_VP9_MVREF_COMMON_H_ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, MODE_INFO *mi, const MODE_INFO *prev_mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, @@ -22,11 +23,12 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, int mi_row, int mi_col); static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, MODE_INFO *mi, const MODE_INFO *prev_mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, int mi_row, int mi_col) { - vp9_find_mv_refs_idx(cm, xd, mi, prev_mi, ref_frame, + vp9_find_mv_refs_idx(cm, xd, tile, mi, prev_mi, ref_frame, mv_ref_list, -1, mi_row, mi_col); } diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index f2244e555..a2af57acf 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -19,6 +19,7 @@ #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_tile_common.h" #if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" @@ -40,9 +41,9 @@ typedef struct frame_contexts { vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1]; vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; - vp9_prob partition_prob[FRAME_TYPES][PARTITION_CONTEXTS][PARTITION_TYPES - 1]; + vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1]; vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES]; - vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1] + vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] [SWITCHABLE_FILTERS - 1]; vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; @@ -61,7 +62,7 @@ typedef struct { vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES]; unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES] [COEF_BANDS][PREV_COEF_CONTEXTS]; - unsigned int switchable_interp[SWITCHABLE_FILTERS + 1] + unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS] [SWITCHABLE_FILTERS]; unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES]; unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; @@ -90,6 
+91,8 @@ typedef struct VP9Common { DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][8]); #endif + COLOR_SPACE color_space; + int width; int height; int display_width; @@ -115,6 +118,7 @@ typedef struct VP9Common { // Each frame can reference ALLOWED_REFS_PER_FRAME buffers int active_ref_idx[ALLOWED_REFS_PER_FRAME]; struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME]; + struct scale_factors_common active_ref_scale_comm[ALLOWED_REFS_PER_FRAME]; int new_fb_idx; YV12_BUFFER_CONFIG post_proc_buffer; @@ -171,7 +175,7 @@ typedef struct VP9Common { // Persistent mb segment id map used in prediction. unsigned char *last_frame_seg_map; - INTERPOLATIONFILTERTYPE mcomp_filter_type; + INTERPOLATION_TYPE mcomp_filter_type; loop_filter_info_n lf_info; @@ -182,14 +186,6 @@ typedef struct VP9Common { struct loopfilter lf; struct segmentation seg; - /* Y,U,V */ - ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; - ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; - - // partition contexts - PARTITION_CONTEXT *above_seg_context; - PARTITION_CONTEXT left_seg_context[8]; - // Context probabilities for reference frame prediction int allow_comp_inter_inter; MV_REFERENCE_FRAME comp_fixed_ref; @@ -212,10 +208,19 @@ typedef struct VP9Common { int frame_parallel_decoding_mode; int log2_tile_cols, log2_tile_rows; - int cur_tile_mi_col_start, cur_tile_mi_col_end; - int cur_tile_mi_row_start, cur_tile_mi_row_end; } VP9_COMMON; +// ref == 0 => LAST_FRAME +// ref == 1 => GOLDEN_FRAME +// ref == 2 => ALTREF_FRAME +static YV12_BUFFER_CONFIG *get_frame_ref_buffer(VP9_COMMON *cm, int ref) { + return &cm->yv12_fb[cm->active_ref_idx[ref]]; +} + +static YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { + return &cm->yv12_fb[cm->new_fb_idx]; +} + static int get_free_fb(VP9_COMMON *cm) { int i; for (i = 0; i < NUM_YV12_BUFFERS; i++) @@ -240,47 +245,38 @@ static int mi_cols_aligned_to_sb(int n_mis) { return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2); } -static INLINE void set_skip_context(VP9_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col) { +static INLINE const vp9_prob* get_partition_probs(VP9_COMMON *cm, int ctx) { + return cm->frame_type == KEY_FRAME ? 
vp9_kf_partition_probs[ctx] + : cm->fc.partition_prob[ctx]; +} + +static INLINE void set_skip_context( + MACROBLOCKD *xd, + ENTROPY_CONTEXT *above_context[MAX_MB_PLANE], + ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16], + int mi_row, int mi_col) { const int above_idx = mi_col * 2; const int left_idx = (mi_row * 2) & 15; int i; for (i = 0; i < MAX_MB_PLANE; i++) { struct macroblockd_plane *const pd = &xd->plane[i]; - pd->above_context = cm->above_context[i] + (above_idx >> pd->subsampling_x); - pd->left_context = cm->left_context[i] + (left_idx >> pd->subsampling_y); + pd->above_context = above_context[i] + (above_idx >> pd->subsampling_x); + pd->left_context = left_context[i] + (left_idx >> pd->subsampling_y); } } -// return the node index in the prob tree for binary coding -static int check_bsize_coverage(int bs, int mi_rows, int mi_cols, - int mi_row, int mi_col) { - const int r = (mi_row + bs < mi_rows); - const int c = (mi_col + bs < mi_cols); - - if (r && c) - return 0; - - if (c && !r) - return 1; // only allow horizontal/split partition types - - if (r && !c) - return 2; // only allow vertical/split partition types - - return -1; -} - -static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int bh, - int mi_col, int bw) { +static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, + int mi_row, int bh, + int mi_col, int bw, + int mi_rows, int mi_cols) { xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); - xd->mb_to_bottom_edge = ((cm->mi_rows - bh - mi_row) * MI_SIZE) * 8; + xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8; xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); - xd->mb_to_right_edge = ((cm->mi_cols - bw - mi_col) * MI_SIZE) * 8; + xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8; // Are edges available for intra prediction? 
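The edge values computed in set_mi_row_col() are in 1/8-pel units: mi_row * MI_SIZE converts MI units to pixels (MI_SIZE is 8), and the trailing * 8 moves into the eighth-pel domain that motion vectors live in, so clamp_mv() can compare an MV against the edges with no further scaling. A worked example with illustrative block coordinates:

/* Block at mi_row = 4 with bh = 2 in a frame of 100 MI rows (MI_SIZE == 8): */
int mb_to_top_edge    = -((4 * 8) * 8);           /* -256: 32 px above, in 1/8 pel */
int mb_to_bottom_edge = ((100 - 2 - 4) * 8) * 8;  /*  6016: 752 px below */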
xd->up_available = (mi_row != 0); - xd->left_available = (mi_col > cm->cur_tile_mi_col_start); + xd->left_available = (mi_col > tile->mi_col_start); } static void set_prev_mi(VP9_COMMON *cm) { @@ -299,12 +295,14 @@ static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) { return cm->frame_type == KEY_FRAME || cm->intra_only; } -static INLINE void update_partition_context(VP9_COMMON *cm, - int mi_row, int mi_col, - BLOCK_SIZE sb_type, - BLOCK_SIZE sb_size) { - PARTITION_CONTEXT *above_ctx = cm->above_seg_context + mi_col; - PARTITION_CONTEXT *left_ctx = cm->left_seg_context + (mi_row & MI_MASK); +static INLINE void update_partition_context( + PARTITION_CONTEXT *above_seg_context, + PARTITION_CONTEXT left_seg_context[8], + int mi_row, int mi_col, + BLOCK_SIZE sb_type, + BLOCK_SIZE sb_size) { + PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col; + PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK); const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2; const int bwl = b_width_log2(sb_type); @@ -323,11 +321,13 @@ static INLINE void update_partition_context(VP9_COMMON *cm, vpx_memset(left_ctx, pcvalue[bhl == bsl], bs); } -static INLINE int partition_plane_context(const VP9_COMMON *cm, - int mi_row, int mi_col, - BLOCK_SIZE sb_type) { - const PARTITION_CONTEXT *above_ctx = cm->above_seg_context + mi_col; - const PARTITION_CONTEXT *left_ctx = cm->left_seg_context + (mi_row & MI_MASK); +static INLINE int partition_plane_context( + const PARTITION_CONTEXT *above_seg_context, + const PARTITION_CONTEXT left_seg_context[8], + int mi_row, int mi_col, + BLOCK_SIZE sb_type) { + const PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col; + const PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK); int bsl = mi_width_log2(sb_type), bs = 1 << bsl; int above = 0, left = 0, i; diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c index be42c56b5..6018e1775 100644 --- a/vp9/common/vp9_pred_common.c +++ b/vp9/common/vp9_pred_common.c @@ -35,14 +35,14 @@ unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { // left of the entries corresponding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. // left - const int left_mv_pred = left_in_image ? is_inter_mode(left_mi->mbmi.mode) + const int left_mv_pred = left_in_image ? is_inter_block(&left_mi->mbmi) : 0; const int left_interp = left_in_image && left_mv_pred ? left_mi->mbmi.interp_filter : SWITCHABLE_FILTERS; // above - const int above_mv_pred = above_in_image ? is_inter_mode(above_mi->mbmi.mode) + const int above_mv_pred = above_in_image ? is_inter_block(&above_mi->mbmi) : 0; const int above_interp = above_in_image && above_mv_pred ?
above_mi->mbmi.interp_filter @@ -403,8 +403,8 @@ void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag) { int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col) { const int mi_offset = mi_row * cm->mi_cols + mi_col; - const int bw = 1 << mi_width_log2(bsize); - const int bh = 1 << mi_height_log2(bsize); + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; const int xmis = MIN(cm->mi_cols - mi_col, bw); const int ymis = MIN(cm->mi_rows - mi_row, bh); int x, y, segment_id = INT_MAX; diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h index a869dc0a6..19032bf62 100644 --- a/vp9/common/vp9_pred_common.h +++ b/vp9/common/vp9_pred_common.h @@ -127,14 +127,14 @@ static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd, return get_tx_probs(bsize, context, tx_probs); } -static void update_tx_counts(BLOCK_SIZE bsize, uint8_t context, - TX_SIZE tx_size, struct tx_counts *tx_counts) { - if (bsize >= BLOCK_32X32) - tx_counts->p32x32[context][tx_size]++; - else if (bsize >= BLOCK_16X16) - tx_counts->p16x16[context][tx_size]++; +static unsigned int *get_tx_counts(BLOCK_SIZE bsize, uint8_t context, + struct tx_counts *tx_counts) { + if (bsize < BLOCK_16X16) + return tx_counts->p8x8[context]; + else if (bsize < BLOCK_32X32) + return tx_counts->p16x16[context]; else - tx_counts->p8x8[context][tx_size]++; + return tx_counts->p32x32[context]; } #endif // VP9_COMMON_VP9_PRED_COMMON_H_ diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index 6f16ac70a..1c96788db 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -21,7 +21,7 @@ #include "vp9/common/vp9_reconintra.h" void vp9_setup_interp_filters(MACROBLOCKD *xd, - INTERPOLATIONFILTERTYPE mcomp_filter_type, + INTERPOLATION_TYPE mcomp_filter_type, VP9_COMMON *cm) { if (xd->mi_8x8 && xd->mi_8x8[0]) { MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; @@ -40,6 +40,24 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd, assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0); } +static void inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const MV32 *mv, + const struct scale_factors *scale, + int w, int h, int ref, + const struct subpix_fn_table *subpix, + int xs, int ys) { + const int subpel_x = mv->col & SUBPEL_MASK; + const int subpel_y = mv->row & SUBPEL_MASK; + + src += (mv->row >> SUBPEL_BITS) * src_stride + (mv->col >> SUBPEL_BITS); + scale->sfc->predict[subpel_x != 0][subpel_y != 0][ref]( + src, src_stride, dst, dst_stride, + subpix->filter_x[subpel_x], xs, + subpix->filter_y[subpel_y], ys, + w, h); +} + void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const MV *src_mv, @@ -50,16 +68,11 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride, const int is_q4 = precision == MV_PRECISION_Q4; const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, is_q4 ? 
src_mv->col : src_mv->col * 2 }; - const MV32 mv = scale->scale_mv(&mv_q4, scale); - const int subpel_x = mv.col & SUBPEL_MASK; - const int subpel_y = mv.row & SUBPEL_MASK; + const struct scale_factors_common *sfc = scale->sfc; + const MV32 mv = sfc->scale_mv(&mv_q4, scale); - src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS); - scale->predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, - subpix->filter_x[subpel_x], scale->x_step_q4, - subpix->filter_y[subpel_y], scale->y_step_q4, - w, h); + inter_predictor(src, src_stride, dst, dst_stride, &mv, scale, + w, h, ref, subpix, sfc->x_step_q4, sfc->y_step_q4); } static INLINE int round_mv_comp_q4(int value) { @@ -133,10 +146,6 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, struct scale_factors *const scale = &xd->scale_factor[ref]; struct buf_2d *const pre_buf = &pd->pre[ref]; struct buf_2d *const dst_buf = &pd->dst; - - const uint8_t *const pre = pre_buf->buf + scaled_buffer_offset(x, y, - pre_buf->stride, scale); - uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the @@ -152,15 +161,32 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, // scaling case. It needs to be done on the scaled MV, not the pre-scaling // MV. Note however that it performs the subsampling aware scaling so // that the result is always q4. - const MV res_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, - pd->subsampling_x, - pd->subsampling_y); - - scale->set_scaled_offsets(scale, arg->y + y, arg->x + x); - vp9_build_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, - &res_mv, scale, - 4 << pred_w, 4 << pred_h, ref, - &xd->subpix, MV_PRECISION_Q4); + // mv_precision precision is MV_PRECISION_Q4. 
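inter_predictor() above splits each q4 motion-vector component into an integer pixel offset and a sub-pixel phase: with SUBPEL_BITS == 4 there are 16 phases per pixel, mv >> SUBPEL_BITS advances the source pointer, and mv & SUBPEL_MASK picks the interpolation kernel. A standalone illustration:

#include <stdio.h>

#define SUBPEL_BITS 4                        /* q4: 16 subpel positions per pixel */
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)

int main(void) {
  const int mv_col = 37;                     /* q4 units: 2 full pixels + phase 5 */
  printf("pixel offset %d, subpel phase %d\n",
         mv_col >> SUBPEL_BITS, mv_col & SUBPEL_MASK);  /* prints 2, 5 */
  return 0;
}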
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, + pd->subsampling_x, + pd->subsampling_y); + + uint8_t *pre; + MV32 scaled_mv; + int xs, ys; + + if (vp9_is_scaled(scale->sfc)) { + pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, scale); + scale->sfc->set_scaled_offsets(scale, arg->y + y, arg->x + x); + scaled_mv = scale->sfc->scale_mv(&mv_q4, scale); + xs = scale->sfc->x_step_q4; + ys = scale->sfc->y_step_q4; + } else { + pre = pre_buf->buf + (y * pre_buf->stride + x); + scaled_mv.row = mv_q4.row; + scaled_mv.col = mv_q4.col; + xs = ys = 16; + } + + inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, + &scaled_mv, scale, + 4 << pred_w, 4 << pred_h, ref, + &xd->subpix, xs, ys); } } @@ -220,15 +246,17 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, void vp9_setup_scale_factors(VP9_COMMON *cm, int i) { const int ref = cm->active_ref_idx[i]; struct scale_factors *const sf = &cm->active_ref_scale[i]; + struct scale_factors_common *const sfc = &cm->active_ref_scale_comm[i]; if (ref >= NUM_YV12_BUFFERS) { vp9_zero(*sf); + vp9_zero(*sfc); } else { YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref]; - vp9_setup_scale_factors_for_frame(sf, + vp9_setup_scale_factors_for_frame(sf, sfc, fb->y_crop_width, fb->y_crop_height, cm->width, cm->height); - if (vp9_is_scaled(sf)) + if (vp9_is_scaled(sfc)) vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y); } } diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h index 504b79356..2c8a6e4d9 100644 --- a/vp9/common/vp9_reconinter.h +++ b/vp9/common/vp9_reconinter.h @@ -25,7 +25,7 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize); void vp9_setup_interp_filters(MACROBLOCKD *xd, - INTERPOLATIONFILTERTYPE filter, + INTERPOLATION_TYPE filter, VP9_COMMON *cm); void vp9_build_inter_predictor(const uint8_t *src, int src_stride, @@ -38,8 +38,10 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride, static int scaled_buffer_offset(int x_offset, int y_offset, int stride, const struct scale_factors *scale) { - const int x = scale ? scale->scale_value_x(x_offset, scale) : x_offset; - const int y = scale ? scale->scale_value_y(y_offset, scale) : y_offset; + const int x = scale ? scale->sfc->scale_value_x(x_offset, scale->sfc) : + x_offset; + const int y = scale ? 
scale->sfc->scale_value_y(y_offset, scale->sfc) : + y_offset; return y * stride + x; } diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c index bd609dcf0..eb643b090 100644 --- a/vp9/common/vp9_reconintra.c +++ b/vp9/common/vp9_reconintra.c @@ -369,7 +369,7 @@ static void build_intra_predictors(const uint8_t *ref, int ref_stride, } } -void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in, +void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in, TX_SIZE tx_size, int mode, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride) { diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h index e9d0dbf04..6e3f55c4d 100644 --- a/vp9/common/vp9_reconintra.h +++ b/vp9/common/vp9_reconintra.h @@ -14,8 +14,8 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" -void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in, - TX_SIZE tx_size, int mode, - const uint8_t *ref, int ref_stride, - uint8_t *dst, int dst_stride); +void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in, + TX_SIZE tx_size, int mode, + const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride); #endif // VP9_COMMON_VP9_RECONINTRA_H_ diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index df92b5882..debec6154 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -22,10 +22,11 @@ forward_decls vp9_common_forward_decls # x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly. [ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse_x86inc=sse && - sse2_x86inc=sse2 && ssse3_x86inc=ssse3 + sse2_x86inc=sse2 && ssse3_x86inc=ssse3 && avx_x86inc=avx && avx2_x86inc=avx2 # this variable is for functions that are 64 bit only. 
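In vp9_rtcd_defs.sh, each prototype line declares a function signature and the matching specialize line lists the SIMD flavors that have hand-optimized implementations; the build then generates a header with one declaration per flavor plus a pointer that vp9_rtcd() binds at startup from CPU feature detection. A rough, hand-written approximation of what gets generated for one entry (not the literal generator output):

void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest,
                             int dest_stride);

/* Bound once at runtime to the best implementation the CPU supports. */
extern void (*vp9_idct4x4_16_add)(const int16_t *input, uint8_t *dest,
                                  int dest_stride);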
-[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3 +[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && + ssse3_x86_64=ssse3 && avx_x86_64=avx && avx2_x86_64=avx2 # # RECON @@ -157,7 +158,7 @@ prototype void vp9_d63_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const specialize vp9_d63_predictor_32x32 $ssse3_x86inc prototype void vp9_h_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_h_predictor_32x32 $ssse3 x86inc +specialize vp9_h_predictor_32x32 $ssse3_x86inc prototype void vp9_d117_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_32x32 @@ -199,7 +200,7 @@ prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8 specialize vp9_loop_filter_vertical_edge mmx neon prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mb_lpf_horizontal_edge_w sse2 neon +specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" specialize vp9_mbloop_filter_horizontal_edge sse2 neon @@ -268,43 +269,46 @@ specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2 # dct # prototype void vp9_idct4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct4x4_1_add sse2 neon +specialize vp9_idct4x4_1_add sse2 neon dspr2 prototype void vp9_idct4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct4x4_16_add sse2 neon +specialize vp9_idct4x4_16_add sse2 neon dspr2 prototype void vp9_idct8x8_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct8x8_1_add sse2 neon +specialize vp9_idct8x8_1_add sse2 neon dspr2 prototype void vp9_idct8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct8x8_64_add sse2 neon +specialize vp9_idct8x8_64_add sse2 neon dspr2 prototype void vp9_idct8x8_10_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct8x8_10_add sse2 neon +specialize vp9_idct8x8_10_add sse2 neon dspr2 prototype void vp9_idct16x16_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct16x16_1_add sse2 neon +specialize vp9_idct16x16_1_add sse2 neon dspr2 prototype void vp9_idct16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct16x16_256_add sse2 neon +specialize vp9_idct16x16_256_add sse2 neon dspr2 prototype void vp9_idct16x16_10_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct16x16_10_add sse2 neon +specialize vp9_idct16x16_10_add sse2 neon dspr2 prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct32x32_1024_add sse2 neon +specialize vp9_idct32x32_1024_add sse2 neon dspr2 + +prototype void vp9_idct32x32_34_add "const int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct32x32_34_add sse2 prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct32x32_1_add sse2 +specialize vp9_idct32x32_1_add sse2 dspr2 prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_iht4x4_16_add sse2 neon +specialize vp9_iht4x4_16_add sse2 neon dspr2 prototype void 
vp9_iht8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_iht8x8_64_add sse2 neon +specialize vp9_iht8x8_64_add sse2 neon dspr2 prototype void vp9_iht16x16_256_add "const int16_t *input, uint8_t *output, int pitch, int tx_type" -specialize vp9_iht16x16_256_add sse2 +specialize vp9_iht16x16_256_add sse2 dspr2 # dct and add @@ -668,10 +672,10 @@ specialize vp9_block_error $sse2_x86inc prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride" specialize vp9_subtract_block $sse2_x86inc -prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" +prototype void vp9_quantize_b "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" specialize vp9_quantize_b $ssse3_x86_64 -prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" +prototype void vp9_quantize_b_32x32 "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" specialize vp9_quantize_b_32x32 $ssse3_x86_64 # @@ -686,32 +690,32 @@ if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then fi # fdct functions -prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type" specialize vp9_short_fht4x4 sse2 -prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type" specialize vp9_short_fht8x8 sse2 -prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type" specialize vp9_short_fht16x16 sse2 -prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int stride" -specialize vp9_short_fdct8x8 sse2 +prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride" +specialize vp9_fwht4x4 -prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int stride" -specialize vp9_short_fdct4x4 sse2 +prototype void vp9_fdct4x4 "const int16_t *input, int16_t *output, int stride" +specialize vp9_fdct4x4 sse2 -prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int stride" -specialize vp9_short_fdct32x32 sse2 +prototype void vp9_fdct8x8 "const int16_t *input, int16_t *output, int 
stride" +specialize vp9_fdct8x8 sse2 -prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int stride" -specialize vp9_short_fdct32x32_rd sse2 +prototype void vp9_fdct16x16 "const int16_t *input, int16_t *output, int stride" +specialize vp9_fdct16x16 sse2 -prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int stride" -specialize vp9_short_fdct16x16 sse2 +prototype void vp9_fdct32x32 "const int16_t *input, int16_t *output, int stride" +specialize vp9_fdct32x32 sse2 -prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_walsh4x4 +prototype void vp9_fdct32x32_rd "const int16_t *input, int16_t *output, int stride" +specialize vp9_fdct32x32_rd sse2 # # Motion search diff --git a/vp9/common/vp9_scale.c b/vp9/common/vp9_scale.c index 989206c60..3f0994f80 100644 --- a/vp9/common/vp9_scale.c +++ b/vp9/common/vp9_scale.c @@ -12,23 +12,23 @@ #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_scale.h" -static INLINE int scaled_x(int val, const struct scale_factors *scale) { - return val * scale->x_scale_fp >> REF_SCALE_SHIFT; +static INLINE int scaled_x(int val, const struct scale_factors_common *sfc) { + return val * sfc->x_scale_fp >> REF_SCALE_SHIFT; } -static INLINE int scaled_y(int val, const struct scale_factors *scale) { - return val * scale->y_scale_fp >> REF_SCALE_SHIFT; +static INLINE int scaled_y(int val, const struct scale_factors_common *sfc) { + return val * sfc->y_scale_fp >> REF_SCALE_SHIFT; } -static int unscaled_value(int val, const struct scale_factors *scale) { - (void) scale; +static int unscaled_value(int val, const struct scale_factors_common *sfc) { + (void) sfc; return val; } static MV32 scaled_mv(const MV *mv, const struct scale_factors *scale) { const MV32 res = { - scaled_y(mv->row, scale) + scale->y_offset_q4, - scaled_x(mv->col, scale) + scale->x_offset_q4 + scaled_y(mv->row, scale->sfc) + scale->y_offset_q4, + scaled_x(mv->col, scale->sfc) + scale->x_offset_q4 }; return res; } @@ -43,8 +43,8 @@ static MV32 unscaled_mv(const MV *mv, const struct scale_factors *scale) { static void set_offsets_with_scaling(struct scale_factors *scale, int row, int col) { - scale->x_offset_q4 = scaled_x(col << SUBPEL_BITS, scale) & SUBPEL_MASK; - scale->y_offset_q4 = scaled_y(row << SUBPEL_BITS, scale) & SUBPEL_MASK; + scale->x_offset_q4 = scaled_x(col << SUBPEL_BITS, scale->sfc) & SUBPEL_MASK; + scale->y_offset_q4 = scaled_y(row << SUBPEL_BITS, scale->sfc) & SUBPEL_MASK; } static void set_offsets_without_scaling(struct scale_factors *scale, @@ -70,31 +70,30 @@ static int check_scale_factors(int other_w, int other_h, } void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, + struct scale_factors_common *scale_comm, int other_w, int other_h, int this_w, int this_h) { if (!check_scale_factors(other_w, other_h, this_w, this_h)) { - scale->x_scale_fp = REF_INVALID_SCALE; - scale->y_scale_fp = REF_INVALID_SCALE; + scale_comm->x_scale_fp = REF_INVALID_SCALE; + scale_comm->y_scale_fp = REF_INVALID_SCALE; return; } - scale->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); - scale->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); - scale->x_step_q4 = scaled_x(16, scale); - scale->y_step_q4 = scaled_y(16, scale); - scale->x_offset_q4 = 0; // calculated per block - scale->y_offset_q4 = 0; // calculated per block + scale_comm->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); + scale_comm->y_scale_fp = get_fixed_point_scale_factor(other_h, 
this_h); + scale_comm->x_step_q4 = scaled_x(16, scale_comm); + scale_comm->y_step_q4 = scaled_y(16, scale_comm); - if (vp9_is_scaled(scale)) { - scale->scale_value_x = scaled_x; - scale->scale_value_y = scaled_y; - scale->set_scaled_offsets = set_offsets_with_scaling; - scale->scale_mv = scaled_mv; + if (vp9_is_scaled(scale_comm)) { + scale_comm->scale_value_x = scaled_x; + scale_comm->scale_value_y = scaled_y; + scale_comm->set_scaled_offsets = set_offsets_with_scaling; + scale_comm->scale_mv = scaled_mv; } else { - scale->scale_value_x = unscaled_value; - scale->scale_value_y = unscaled_value; - scale->set_scaled_offsets = set_offsets_without_scaling; - scale->scale_mv = unscaled_mv; + scale_comm->scale_value_x = unscaled_value; + scale_comm->scale_value_y = unscaled_value; + scale_comm->set_scaled_offsets = set_offsets_without_scaling; + scale_comm->scale_mv = unscaled_mv; } // TODO(agrange): Investigate the best choice of functions to use here @@ -103,44 +102,48 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, // applied in one direction only, and not at all for 0,0, seems to give the // best quality, but it may be worth trying an additional mode that does // do the filtering on full-pel. - if (scale->x_step_q4 == 16) { - if (scale->y_step_q4 == 16) { + if (scale_comm->x_step_q4 == 16) { + if (scale_comm->y_step_q4 == 16) { // No scaling in either direction. - scale->predict[0][0][0] = vp9_convolve_copy; - scale->predict[0][0][1] = vp9_convolve_avg; - scale->predict[0][1][0] = vp9_convolve8_vert; - scale->predict[0][1][1] = vp9_convolve8_avg_vert; - scale->predict[1][0][0] = vp9_convolve8_horiz; - scale->predict[1][0][1] = vp9_convolve8_avg_horiz; + scale_comm->predict[0][0][0] = vp9_convolve_copy; + scale_comm->predict[0][0][1] = vp9_convolve_avg; + scale_comm->predict[0][1][0] = vp9_convolve8_vert; + scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert; + scale_comm->predict[1][0][0] = vp9_convolve8_horiz; + scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz; } else { // No scaling in x direction. Must always scale in the y direction. - scale->predict[0][0][0] = vp9_convolve8_vert; - scale->predict[0][0][1] = vp9_convolve8_avg_vert; - scale->predict[0][1][0] = vp9_convolve8_vert; - scale->predict[0][1][1] = vp9_convolve8_avg_vert; - scale->predict[1][0][0] = vp9_convolve8; - scale->predict[1][0][1] = vp9_convolve8_avg; + scale_comm->predict[0][0][0] = vp9_convolve8_vert; + scale_comm->predict[0][0][1] = vp9_convolve8_avg_vert; + scale_comm->predict[0][1][0] = vp9_convolve8_vert; + scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert; + scale_comm->predict[1][0][0] = vp9_convolve8; + scale_comm->predict[1][0][1] = vp9_convolve8_avg; } } else { - if (scale->y_step_q4 == 16) { + if (scale_comm->y_step_q4 == 16) { // No scaling in the y direction. Must always scale in the x direction. - scale->predict[0][0][0] = vp9_convolve8_horiz; - scale->predict[0][0][1] = vp9_convolve8_avg_horiz; - scale->predict[0][1][0] = vp9_convolve8; - scale->predict[0][1][1] = vp9_convolve8_avg; - scale->predict[1][0][0] = vp9_convolve8_horiz; - scale->predict[1][0][1] = vp9_convolve8_avg_horiz; + scale_comm->predict[0][0][0] = vp9_convolve8_horiz; + scale_comm->predict[0][0][1] = vp9_convolve8_avg_horiz; + scale_comm->predict[0][1][0] = vp9_convolve8; + scale_comm->predict[0][1][1] = vp9_convolve8_avg; + scale_comm->predict[1][0][0] = vp9_convolve8_horiz; + scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz; } else { // Must always scale in both directions. 
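The predict[2][2][2] table being filled in here is indexed as predict[subpel_x != 0][subpel_y != 0][avg], which is why each branch only installs the convolve variants its step sizes make reachable: with a unit step (x_step_q4 == 16) and zero phase a direction needs no filtering, so the [0][0] entries can degenerate to a copy or an average. The dispatch at the call site in inter_predictor() then reduces to:

const int subpel_x = mv->col & SUBPEL_MASK;  /* phase selects the kernel */
const int subpel_y = mv->row & SUBPEL_MASK;
/* e.g. unscaled, zero phase, single ref: predict[0][0][0] == vp9_convolve_copy */
scale->sfc->predict[subpel_x != 0][subpel_y != 0][ref](
    src, src_stride, dst, dst_stride,
    subpix->filter_x[subpel_x], xs,
    subpix->filter_y[subpel_y], ys, w, h);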
- scale->predict[0][0][0] = vp9_convolve8; - scale->predict[0][0][1] = vp9_convolve8_avg; - scale->predict[0][1][0] = vp9_convolve8; - scale->predict[0][1][1] = vp9_convolve8_avg; - scale->predict[1][0][0] = vp9_convolve8; - scale->predict[1][0][1] = vp9_convolve8_avg; + scale_comm->predict[0][0][0] = vp9_convolve8; + scale_comm->predict[0][0][1] = vp9_convolve8_avg; + scale_comm->predict[0][1][0] = vp9_convolve8; + scale_comm->predict[0][1][1] = vp9_convolve8_avg; + scale_comm->predict[1][0][0] = vp9_convolve8; + scale_comm->predict[1][0][1] = vp9_convolve8_avg; } } // 2D subpel motion always gets filtered in both directions - scale->predict[1][1][0] = vp9_convolve8; - scale->predict[1][1][1] = vp9_convolve8_avg; + scale_comm->predict[1][1][0] = vp9_convolve8; + scale_comm->predict[1][1][1] = vp9_convolve8_avg; + + scale->sfc = scale_comm; + scale->x_offset_q4 = 0; // calculated per block + scale->y_offset_q4 = 0; // calculated per block } diff --git a/vp9/common/vp9_scale.h b/vp9/common/vp9_scale.h index ece011477..1437fcd9c 100644 --- a/vp9/common/vp9_scale.h +++ b/vp9/common/vp9_scale.h @@ -18,34 +18,40 @@ #define REF_NO_SCALE (1 << REF_SCALE_SHIFT) #define REF_INVALID_SCALE -1 -struct scale_factors { +struct scale_factors; +struct scale_factors_common { int x_scale_fp; // horizontal fixed point scale factor int y_scale_fp; // vertical fixed point scale factor - int x_offset_q4; int x_step_q4; - int y_offset_q4; int y_step_q4; - int (*scale_value_x)(int val, const struct scale_factors *scale); - int (*scale_value_y)(int val, const struct scale_factors *scale); + int (*scale_value_x)(int val, const struct scale_factors_common *sfc); + int (*scale_value_y)(int val, const struct scale_factors_common *sfc); void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col); MV32 (*scale_mv)(const MV *mv, const struct scale_factors *scale); convolve_fn_t predict[2][2][2]; // horiz, vert, avg }; +struct scale_factors { + int x_offset_q4; + int y_offset_q4; + const struct scale_factors_common *sfc; +}; + void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, + struct scale_factors_common *scale_comm, int other_w, int other_h, int this_w, int this_h); -static int vp9_is_valid_scale(const struct scale_factors *sf) { - return sf->x_scale_fp != REF_INVALID_SCALE && - sf->y_scale_fp != REF_INVALID_SCALE; +static int vp9_is_valid_scale(const struct scale_factors_common *sfc) { + return sfc->x_scale_fp != REF_INVALID_SCALE && + sfc->y_scale_fp != REF_INVALID_SCALE; } -static int vp9_is_scaled(const struct scale_factors *sf) { - return sf->x_scale_fp != REF_NO_SCALE || - sf->y_scale_fp != REF_NO_SCALE; +static int vp9_is_scaled(const struct scale_factors_common *sfc) { + return sfc->x_scale_fp != REF_NO_SCALE || + sfc->y_scale_fp != REF_NO_SCALE; } #endif // VP9_COMMON_VP9_SCALE_H_ diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h index a5c8463d5..14a1a7eb0 100644 --- a/vp9/common/vp9_scan.h +++ b/vp9/common/vp9_scan.h @@ -191,8 +191,7 @@ static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) { } static INLINE int get_coef_context(const int16_t *neighbors, - uint8_t *token_cache, - int c) { + const uint8_t *token_cache, int c) { return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] + token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1; } diff --git a/vp9/common/vp9_tile_common.c b/vp9/common/vp9_tile_common.c index 1791c1a8f..e3035d076 100644 --- a/vp9/common/vp9_tile_common.c +++ b/vp9/common/vp9_tile_common.c @@ -10,6 +10,8 @@ #include 
"vp9/common/vp9_tile_common.h" +#include "vp9/common/vp9_onyxc_int.h" + #define MIN_TILE_WIDTH_B64 4 #define MAX_TILE_WIDTH_B64 64 @@ -17,8 +19,8 @@ static int to_sbs(n_mis) { return mi_cols_aligned_to_sb(n_mis) >> MI_BLOCK_SIZE_LOG2; } -static void vp9_get_tile_offsets(int *min_tile_off, int *max_tile_off, - int tile_idx, int log2_n_tiles, int n_mis) { +static void get_tile_offsets(int *min_tile_off, int *max_tile_off, + int tile_idx, int log2_n_tiles, int n_mis) { const int n_sbs = to_sbs(n_mis); const int sb_off1 = (tile_idx * n_sbs) >> log2_n_tiles; const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles; @@ -27,17 +29,14 @@ static void vp9_get_tile_offsets(int *min_tile_off, int *max_tile_off, *max_tile_off = MIN(sb_off2 << 3, n_mis); } -void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) { - vp9_get_tile_offsets(&cm->cur_tile_mi_col_start, &cm->cur_tile_mi_col_end, - tile_col_idx, cm->log2_tile_cols, cm->mi_cols); -} - -void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx) { - vp9_get_tile_offsets(&cm->cur_tile_mi_row_start, &cm->cur_tile_mi_row_end, - tile_row_idx, cm->log2_tile_rows, cm->mi_rows); +void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, + int row_idx, int col_idx) { + get_tile_offsets(&tile->mi_row_start, &tile->mi_row_end, + row_idx, cm->log2_tile_rows, cm->mi_rows); + get_tile_offsets(&tile->mi_col_start, &tile->mi_col_end, + col_idx, cm->log2_tile_cols, cm->mi_cols); } - void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, int *max_log2_tile_cols) { const int sb_cols = to_sbs(mi_cols); diff --git a/vp9/common/vp9_tile_common.h b/vp9/common/vp9_tile_common.h index 6d14560b9..a110abbdb 100644 --- a/vp9/common/vp9_tile_common.h +++ b/vp9/common/vp9_tile_common.h @@ -11,11 +11,17 @@ #ifndef VP9_COMMON_VP9_TILE_COMMON_H_ #define VP9_COMMON_VP9_TILE_COMMON_H_ -#include "vp9/common/vp9_onyxc_int.h" +struct VP9Common; -void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx); +typedef struct TileInfo { + int mi_row_start, mi_row_end; + int mi_col_start, mi_col_end; +} TileInfo; -void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx); +// initializes 'tile->mi_(row|col)_(start|end)' for (row_idx, col_idx) based on +// 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)' +void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm, + int row_idx, int col_idx); void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, int *max_log2_tile_cols); diff --git a/vp9/common/vp9_treecoder.c b/vp9/common/vp9_treecoder.c index da1213d71..1805fb4d8 100644 --- a/vp9/common/vp9_treecoder.c +++ b/vp9/common/vp9_treecoder.c @@ -40,9 +40,7 @@ void vp9_tokens_from_tree_offset(struct vp9_token *p, vp9_tree t, tree2tok(p - offset, t, 0, 0, 0); } -static unsigned int convert_distribution(unsigned int i, - vp9_tree tree, - vp9_prob probs[], +static unsigned int convert_distribution(unsigned int i, vp9_tree tree, unsigned int branch_ct[][2], const unsigned int num_events[], unsigned int tok0_offset) { @@ -51,24 +49,25 @@ static unsigned int convert_distribution(unsigned int i, if (tree[i] <= 0) { left = num_events[-tree[i] - tok0_offset]; } else { - left = convert_distribution(tree[i], tree, probs, branch_ct, - num_events, tok0_offset); + left = convert_distribution(tree[i], tree, branch_ct, num_events, + tok0_offset); } if (tree[i + 1] <= 0) right = num_events[-tree[i + 1] - tok0_offset]; else - right = convert_distribution(tree[i + 1], tree, probs, branch_ct, - num_events, tok0_offset); + right = convert_distribution(tree[i + 1], 
tree, branch_ct, num_events, + tok0_offset); - probs[i>>1] = get_binary_prob(left, right); - branch_ct[i>>1][0] = left; - branch_ct[i>>1][1] = right; + branch_ct[i >> 1][0] = left; + branch_ct[i >> 1][1] = right; return left + right; } -void vp9_tree_probs_from_distribution(vp9_tree tree, vp9_prob probs[/* n-1 */], +void vp9_tree_probs_from_distribution(vp9_tree tree, unsigned int branch_ct[/* n-1 */][2], const unsigned int num_events[/* n */], unsigned int tok0_offset) { - convert_distribution(0, tree, probs, branch_ct, num_events, tok0_offset); + convert_distribution(0, tree, branch_ct, num_events, tok0_offset); } + + diff --git a/vp9/common/vp9_treecoder.h b/vp9/common/vp9_treecoder.h index 4ba171f46..9c776d61c 100644 --- a/vp9/common/vp9_treecoder.h +++ b/vp9/common/vp9_treecoder.h @@ -50,11 +50,11 @@ void vp9_tokens_from_tree_offset(struct vp9_token*, vp9_tree, int offset); probability updates. */ void vp9_tree_probs_from_distribution(vp9_tree tree, - vp9_prob probs[ /* n - 1 */ ], unsigned int branch_ct[ /* n - 1 */ ][2], const unsigned int num_events[ /* n */ ], unsigned int tok0_offset); + static INLINE vp9_prob clip_prob(int p) { return (p > 255) ? 255u : (p < 1) ? 1u : p; } @@ -81,21 +81,46 @@ static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) { return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); } -static INLINE vp9_prob merge_probs(vp9_prob pre_prob, vp9_prob prob, +static INLINE vp9_prob merge_probs(vp9_prob pre_prob, const unsigned int ct[2], unsigned int count_sat, unsigned int max_update_factor) { + const vp9_prob prob = get_binary_prob(ct[0], ct[1]); const unsigned int count = MIN(ct[0] + ct[1], count_sat); const unsigned int factor = max_update_factor * count / count_sat; return weighted_prob(pre_prob, prob, factor); } -static INLINE vp9_prob merge_probs2(vp9_prob pre_prob, - const unsigned int ct[2], - unsigned int count_sat, - unsigned int max_update_factor) { - return merge_probs(pre_prob, get_binary_prob(ct[0], ct[1]), ct, count_sat, - max_update_factor); +static unsigned int tree_merge_probs_impl(unsigned int i, + const vp9_tree_index *tree, + const vp9_prob *pre_probs, + const unsigned int *counts, + unsigned int count_sat, + unsigned int max_update_factor, + vp9_prob *probs) { + const int l = tree[i]; + const unsigned int left_count = (l <= 0) + ? counts[-l] + : tree_merge_probs_impl(l, tree, pre_probs, counts, + count_sat, max_update_factor, probs); + const int r = tree[i + 1]; + const unsigned int right_count = (r <= 0) + ? 
counts[-r] + : tree_merge_probs_impl(r, tree, pre_probs, counts, + count_sat, max_update_factor, probs); + const unsigned int ct[2] = { left_count, right_count }; + probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct, + count_sat, max_update_factor); + return left_count + right_count; +} + +static void tree_merge_probs(const vp9_tree_index *tree, + const vp9_prob *pre_probs, + const unsigned int *counts, int offset, + unsigned int count_sat, + unsigned int max_update_factor, vp9_prob *probs) { + tree_merge_probs_impl(0, tree, pre_probs, &counts[-offset], + count_sat, max_update_factor, probs); } diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index cfec36b42..ccf5aac17 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -415,7 +415,7 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, res3 = _mm_packs_epi32(tmp6, tmp7); \ } -#define IDCT8x8_1D \ +#define IDCT8_1D \ /* Stage1 */ \ { \ const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ @@ -525,12 +525,12 @@ void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { // 2-D for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2() + // 8x8 Transpose is copied from vp9_fdct8x8_sse2() TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); // 4-stage 1D idct8x8 - IDCT8x8_1D + IDCT8_1D } // Final rounding and shift @@ -638,12 +638,12 @@ static void idct8_1d_sse2(__m128i *in) { in6 = in[6]; in7 = in[7]; - // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2() + // 8x8 Transpose is copied from vp9_fdct8x8_sse2() TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); // 4-stage 1D idct8x8 - IDCT8x8_1D + IDCT8_1D in[0] = in0; in[1] = in1; in[2] = in2; @@ -1068,7 +1068,7 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { in4, in5, in6, in7) // 1D idct8x8 - IDCT8x8_1D + IDCT8_1D // Final rounding and shift in0 = _mm_adds_epi16(in0, final_rounding); @@ -1099,7 +1099,7 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, in7); } -#define IDCT16x16_1D \ +#define IDCT16_1D \ /* Stage2 */ \ { \ const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \ @@ -1321,7 +1321,7 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, in12, in13, in14, in15); } - IDCT16x16_1D + IDCT16_1D // Stage7 if (i == 0) { @@ -2703,7 +2703,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; - IDCT16x16_1D + IDCT16_1D // Stage7 in0 = _mm_add_epi16(stp2_0, stp1_15); @@ -2785,6 +2785,698 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, input += 8; \ } \ +#define IDCT32_1D \ +/* Stage1 */ \ +{ \ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \ + const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \ + const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \ + \ + const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \ + const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \ + const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \ + const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); 
\ + const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \ + \ + const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \ + const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \ + \ + MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ + stp1_17, stp1_30) \ + MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ + stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ + stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ + stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ +} \ +\ +/* Stage2 */ \ +{ \ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \ + const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \ + const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \ + \ + const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \ + const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ + stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ + stp2_14) \ + MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ + stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ + stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ + \ + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ + stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ + \ + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ + stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ + stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ +} \ +\ +/* Stage3 */ \ +{ \ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \ + const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \ + const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + \ + MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ + stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ + stp1_6) \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = 
_mm_add_epi16(stp2_11, stp2_10); \ + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ + stp1_18, stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ + stp1_22, stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ +} \ +\ +/* Stage4 */ \ +{ \ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \ + const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \ + const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ + stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ + stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ + stp2_10, stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ +} \ +\ +/* Stage5 */ \ +{ \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, 
stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} \ +\ +/* Stage6 */ \ +{ \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ + stp2_13, stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ +} \ +\ +/* Stage7 */ \ +{ \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = 
_mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} + +// Only upper-left 8x8 has non-zero coeff +void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + + // idct constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); 
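+  // Editorial note (sketch of the butterfly pattern, not part of the
+  // original change): each stgN constant packs two 16-bit cosines into
+  // every 32-bit lane, so a single _mm_madd_epi16 on an unpacked input
+  // pair computes one butterfly term per lane. For example, with stg1_0:
+  //   lo = _mm_unpacklo_epi16(in1, in31);   // in1[i], in31[i], ...
+  //   t  = _mm_madd_epi16(lo, stg1_0);      // in1*cospi_31_64 - in31*cospi_1_64
+  //   t  = _mm_srai_epi32(_mm_add_epi32(t, rounding), DCT_CONST_BITS);
+  // which is what MULTIPLICATION_AND_ADD in the IDCT32_1D stages above
+  // expands to for each output.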
+  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
+          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
+          in24, in25, in26, in27, in28, in29, in30, in31;
+  __m128i col[128];
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+          stp1_30, stp1_31;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+          stp2_30, stp2_31;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i, j, i32;
+
+  // We work on an 8x32 block each time, and loop 8 times for the 2-D
+  // 32x32 idct.
+  for (i = 0; i < 8; i++) {
+    i32 = (i << 5);
+    if (i == 0) {
+      // First 1-D idct: first 8 rows
+      // Load input data.
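+      // Editorial note (per the LOAD_DQCOEFF definition earlier in this
+      // file): each invocation pulls eight int16 coefficients into one xmm
+      // register and advances `input` by 8, so every group of four loads
+      // (inN, inN+8, inN+16, inN+24) consumes one complete 32-coefficient
+      // row. The eight groups below therefore cover rows 0-7, the only
+      // rows that can hold non-zero coefficients in the _34 case.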
+ LOAD_DQCOEFF(in0, input); + LOAD_DQCOEFF(in8, input); + LOAD_DQCOEFF(in16, input); + LOAD_DQCOEFF(in24, input); + LOAD_DQCOEFF(in1, input); + LOAD_DQCOEFF(in9, input); + LOAD_DQCOEFF(in17, input); + LOAD_DQCOEFF(in25, input); + LOAD_DQCOEFF(in2, input); + LOAD_DQCOEFF(in10, input); + LOAD_DQCOEFF(in18, input); + LOAD_DQCOEFF(in26, input); + LOAD_DQCOEFF(in3, input); + LOAD_DQCOEFF(in11, input); + LOAD_DQCOEFF(in19, input); + LOAD_DQCOEFF(in27, input); + + LOAD_DQCOEFF(in4, input); + LOAD_DQCOEFF(in12, input); + LOAD_DQCOEFF(in20, input); + LOAD_DQCOEFF(in28, input); + LOAD_DQCOEFF(in5, input); + LOAD_DQCOEFF(in13, input); + LOAD_DQCOEFF(in21, input); + LOAD_DQCOEFF(in29, input); + LOAD_DQCOEFF(in6, input); + LOAD_DQCOEFF(in14, input); + LOAD_DQCOEFF(in22, input); + LOAD_DQCOEFF(in30, input); + LOAD_DQCOEFF(in7, input); + LOAD_DQCOEFF(in15, input); + LOAD_DQCOEFF(in23, input); + LOAD_DQCOEFF(in31, input); + + // Transpose 32x8 block to 8x32 block + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, + in18, in19, in20, in21, in22, in23); + TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, + in26, in27, in28, in29, in30, in31); + } else if (i < 4) { + // First 1-D idct: next 24 zero-coeff rows + col[i32 + 0] = _mm_setzero_si128(); + col[i32 + 1] = _mm_setzero_si128(); + col[i32 + 2] = _mm_setzero_si128(); + col[i32 + 3] = _mm_setzero_si128(); + col[i32 + 4] = _mm_setzero_si128(); + col[i32 + 5] = _mm_setzero_si128(); + col[i32 + 6] = _mm_setzero_si128(); + col[i32 + 7] = _mm_setzero_si128(); + col[i32 + 8] = _mm_setzero_si128(); + col[i32 + 9] = _mm_setzero_si128(); + col[i32 + 10] = _mm_setzero_si128(); + col[i32 + 11] = _mm_setzero_si128(); + col[i32 + 12] = _mm_setzero_si128(); + col[i32 + 13] = _mm_setzero_si128(); + col[i32 + 14] = _mm_setzero_si128(); + col[i32 + 15] = _mm_setzero_si128(); + col[i32 + 16] = _mm_setzero_si128(); + col[i32 + 17] = _mm_setzero_si128(); + col[i32 + 18] = _mm_setzero_si128(); + col[i32 + 19] = _mm_setzero_si128(); + col[i32 + 20] = _mm_setzero_si128(); + col[i32 + 21] = _mm_setzero_si128(); + col[i32 + 22] = _mm_setzero_si128(); + col[i32 + 23] = _mm_setzero_si128(); + col[i32 + 24] = _mm_setzero_si128(); + col[i32 + 25] = _mm_setzero_si128(); + col[i32 + 26] = _mm_setzero_si128(); + col[i32 + 27] = _mm_setzero_si128(); + col[i32 + 28] = _mm_setzero_si128(); + col[i32 + 29] = _mm_setzero_si128(); + col[i32 + 30] = _mm_setzero_si128(); + col[i32 + 31] = _mm_setzero_si128(); + continue; + } else { + // Second 1-D idct + j = i - 4; + + // Transpose 32x8 block to 8x32 block + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, + in5, in6, in7); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, + in11, in12, in13, in14, in15); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, + in19, in20, in21, in22, in23); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], 
col[j * 8 + 7], in24, in25, in26, in27,
+                    in28, in29, in30, in31);
+    }
+
+    IDCT32_1D
+
+    // final stage
+    if (i < 4) {
+      // 1-D: Store 32 intermediate results for each 8x32 block.
+      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+    } else {
+      const __m128i zero = _mm_setzero_si128();
+
+      // 2-D: Calculate the results and store them to destination.
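+      // Editorial note (scalar equivalent, for reference): the adds/subs
+      // below are the last butterfly stage, and the round-and-shift that
+      // follows computes out = (x + 32) >> 6, matching the C reference's
+      // ROUND_POWER_OF_TWO(x, 6). RECON_AND_STORE then widens the
+      // destination pixels to 16 bits, adds the residual, and packs back
+      // with unsigned saturation.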
+ in0 = _mm_add_epi16(stp1_0, stp1_31); + in1 = _mm_add_epi16(stp1_1, stp1_30); + in2 = _mm_add_epi16(stp1_2, stp1_29); + in3 = _mm_add_epi16(stp1_3, stp1_28); + in4 = _mm_add_epi16(stp1_4, stp1_27); + in5 = _mm_add_epi16(stp1_5, stp1_26); + in6 = _mm_add_epi16(stp1_6, stp1_25); + in7 = _mm_add_epi16(stp1_7, stp1_24); + in8 = _mm_add_epi16(stp1_8, stp1_23); + in9 = _mm_add_epi16(stp1_9, stp1_22); + in10 = _mm_add_epi16(stp1_10, stp1_21); + in11 = _mm_add_epi16(stp1_11, stp1_20); + in12 = _mm_add_epi16(stp1_12, stp1_19); + in13 = _mm_add_epi16(stp1_13, stp1_18); + in14 = _mm_add_epi16(stp1_14, stp1_17); + in15 = _mm_add_epi16(stp1_15, stp1_16); + in16 = _mm_sub_epi16(stp1_15, stp1_16); + in17 = _mm_sub_epi16(stp1_14, stp1_17); + in18 = _mm_sub_epi16(stp1_13, stp1_18); + in19 = _mm_sub_epi16(stp1_12, stp1_19); + in20 = _mm_sub_epi16(stp1_11, stp1_20); + in21 = _mm_sub_epi16(stp1_10, stp1_21); + in22 = _mm_sub_epi16(stp1_9, stp1_22); + in23 = _mm_sub_epi16(stp1_8, stp1_23); + in24 = _mm_sub_epi16(stp1_7, stp1_24); + in25 = _mm_sub_epi16(stp1_6, stp1_25); + in26 = _mm_sub_epi16(stp1_5, stp1_26); + in27 = _mm_sub_epi16(stp1_4, stp1_27); + in28 = _mm_sub_epi16(stp1_3, stp1_28); + in29 = _mm_sub_epi16(stp1_2, stp1_29); + in30 = _mm_sub_epi16(stp1_1, stp1_30); + in31 = _mm_sub_epi16(stp1_0, stp1_31); + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + in8 = _mm_adds_epi16(in8, final_rounding); + in9 = _mm_adds_epi16(in9, final_rounding); + in10 = _mm_adds_epi16(in10, final_rounding); + in11 = _mm_adds_epi16(in11, final_rounding); + in12 = _mm_adds_epi16(in12, final_rounding); + in13 = _mm_adds_epi16(in13, final_rounding); + in14 = _mm_adds_epi16(in14, final_rounding); + in15 = _mm_adds_epi16(in15, final_rounding); + in16 = _mm_adds_epi16(in16, final_rounding); + in17 = _mm_adds_epi16(in17, final_rounding); + in18 = _mm_adds_epi16(in18, final_rounding); + in19 = _mm_adds_epi16(in19, final_rounding); + in20 = _mm_adds_epi16(in20, final_rounding); + in21 = _mm_adds_epi16(in21, final_rounding); + in22 = _mm_adds_epi16(in22, final_rounding); + in23 = _mm_adds_epi16(in23, final_rounding); + in24 = _mm_adds_epi16(in24, final_rounding); + in25 = _mm_adds_epi16(in25, final_rounding); + in26 = _mm_adds_epi16(in26, final_rounding); + in27 = _mm_adds_epi16(in27, final_rounding); + in28 = _mm_adds_epi16(in28, final_rounding); + in29 = _mm_adds_epi16(in29, final_rounding); + in30 = _mm_adds_epi16(in30, final_rounding); + in31 = _mm_adds_epi16(in31, final_rounding); + + in0 = _mm_srai_epi16(in0, 6); + in1 = _mm_srai_epi16(in1, 6); + in2 = _mm_srai_epi16(in2, 6); + in3 = _mm_srai_epi16(in3, 6); + in4 = _mm_srai_epi16(in4, 6); + in5 = _mm_srai_epi16(in5, 6); + in6 = _mm_srai_epi16(in6, 6); + in7 = _mm_srai_epi16(in7, 6); + in8 = _mm_srai_epi16(in8, 6); + in9 = _mm_srai_epi16(in9, 6); + in10 = _mm_srai_epi16(in10, 6); + in11 = _mm_srai_epi16(in11, 6); + in12 = _mm_srai_epi16(in12, 6); + in13 = _mm_srai_epi16(in13, 6); + in14 = _mm_srai_epi16(in14, 6); + in15 = _mm_srai_epi16(in15, 6); + in16 = _mm_srai_epi16(in16, 6); + in17 = _mm_srai_epi16(in17, 6); + in18 = _mm_srai_epi16(in18, 6); + in19 = _mm_srai_epi16(in19, 6); + in20 = _mm_srai_epi16(in20, 6); + in21 = 
_mm_srai_epi16(in21, 6); + in22 = _mm_srai_epi16(in22, 6); + in23 = _mm_srai_epi16(in23, 6); + in24 = _mm_srai_epi16(in24, 6); + in25 = _mm_srai_epi16(in25, 6); + in26 = _mm_srai_epi16(in26, 6); + in27 = _mm_srai_epi16(in27, 6); + in28 = _mm_srai_epi16(in28, 6); + in29 = _mm_srai_epi16(in29, 6); + in30 = _mm_srai_epi16(in30, 6); + in31 = _mm_srai_epi16(in31, 6); + + RECON_AND_STORE(dest, in0); + RECON_AND_STORE(dest, in1); + RECON_AND_STORE(dest, in2); + RECON_AND_STORE(dest, in3); + RECON_AND_STORE(dest, in4); + RECON_AND_STORE(dest, in5); + RECON_AND_STORE(dest, in6); + RECON_AND_STORE(dest, in7); + RECON_AND_STORE(dest, in8); + RECON_AND_STORE(dest, in9); + RECON_AND_STORE(dest, in10); + RECON_AND_STORE(dest, in11); + RECON_AND_STORE(dest, in12); + RECON_AND_STORE(dest, in13); + RECON_AND_STORE(dest, in14); + RECON_AND_STORE(dest, in15); + RECON_AND_STORE(dest, in16); + RECON_AND_STORE(dest, in17); + RECON_AND_STORE(dest, in18); + RECON_AND_STORE(dest, in19); + RECON_AND_STORE(dest, in20); + RECON_AND_STORE(dest, in21); + RECON_AND_STORE(dest, in22); + RECON_AND_STORE(dest, in23); + RECON_AND_STORE(dest, in24); + RECON_AND_STORE(dest, in25); + RECON_AND_STORE(dest, in26); + RECON_AND_STORE(dest, in27); + RECON_AND_STORE(dest, in28); + RECON_AND_STORE(dest, in29); + RECON_AND_STORE(dest, in30); + RECON_AND_STORE(dest, in31); + + dest += 8 - (stride * 32); + } + } +} + void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -3009,336 +3701,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, in28, in29, in30, in31); } - // Stage1 - { - const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); - const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); - const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); - const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); - - const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); - const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); - const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); - const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); - - const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); - const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); - const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); - const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); - - const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); - const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); - const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); - const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); - - MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, - stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, - stp1_17, stp1_30) - MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, - stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, - stp1_19, stp1_28) - MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, - stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, - stp1_21, stp1_26) - MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, - stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, - stp1_23, stp1_24) - } - - // Stage2 - { - const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); - const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); - const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); - const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); - - const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); - const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); - const __m128i lo_26_6 = 
_mm_unpacklo_epi16(in26, in6); - const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); - - MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, - stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, - stp2_14) - MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, - stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, - stp2_11, stp2_12) - - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); - - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); - - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); - - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); - stp2_31 = _mm_add_epi16(stp1_31, stp1_30); - } - - // Stage3 - { - const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); - const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); - const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); - const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); - - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); - - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); - - MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, - stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, - stp1_6) - - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); - - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, - stp1_18, stp1_29) - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, - stp1_22, stp1_25) - - stp1_16 = stp2_16; - stp1_31 = stp2_31; - stp1_19 = stp2_19; - stp1_20 = stp2_20; - stp1_23 = stp2_23; - stp1_24 = stp2_24; - stp1_27 = stp2_27; - stp1_28 = stp2_28; - } - - // Stage4 - { - const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); - const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); - const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); - const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); - - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); - - MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, - stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, - stp2_2, stp2_3) - - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); - stp2_6 = 
_mm_sub_epi16(stp1_7, stp1_6); - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); - - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, - stp2_10, stp2_13) - - stp2_8 = stp1_8; - stp2_15 = stp1_15; - stp2_11 = stp1_11; - stp2_12 = stp1_12; - - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); - - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); - } - - // Stage5 - { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); - - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); - - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); - - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); - - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp0, tmp1); - stp1_6 = _mm_packs_epi32(tmp2, tmp3); - - stp1_4 = stp2_4; - stp1_7 = stp2_7; - - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); - - stp1_16 = stp2_16; - stp1_17 = stp2_17; - - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, - stp1_19, stp1_28) - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, - stp1_21, stp1_26) - - stp1_22 = stp2_22; - stp1_23 = stp2_23; - stp1_24 = stp2_24; - stp1_25 = stp2_25; - stp1_30 = stp2_30; - stp1_31 = stp2_31; - } - - // Stage6 - { - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); - const __m128i hi_11_12 = 
_mm_unpackhi_epi16(stp1_11, stp1_12); - - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); - - stp2_8 = stp1_8; - stp2_9 = stp1_9; - stp2_14 = stp1_14; - stp2_15 = stp1_15; - - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, - stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, - stp2_13, stp2_11, stp2_12) - - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); - - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); - } - - // Stage7 - { - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); - - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); - - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); - - stp1_16 = stp2_16; - stp1_17 = stp2_17; - stp1_18 = stp2_18; - stp1_19 = stp2_19; - - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, - stp1_21, stp1_26) - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, - stp1_23, stp1_24) - - stp1_28 = stp2_28; - stp1_29 = stp2_29; - stp1_30 = stp2_30; - stp1_31 = stp2_31; - } + IDCT32_1D // final stage if (i < 4) { diff --git a/vp9/common/x86/vp9_intrapred_ssse3.asm b/vp9/common/x86/vp9_intrapred_ssse3.asm index 568e2080e..88df9b2d1 100644 --- a/vp9/common/x86/vp9_intrapred_ssse3.asm +++ b/vp9/common/x86/vp9_intrapred_ssse3.asm @@ -991,7 +991,7 @@ cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset lea dst8q, [dst8q+strideq*4] ; output 2nd half of 3rd 8 lines and half of 4th 8 lines - mova m0, [sh_b23456789abcdefff] + mova m0, [GLOBAL(sh_b23456789abcdefff)] mova [dstq +16], m7 mova [dst8q 
], m7 pshufb m7, m0 diff --git a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c new file mode 100644 index 000000000..3c5cb8ffd --- /dev/null +++ b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c @@ -0,0 +1,943 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <immintrin.h> /* AVX2 */ + +static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p, + const unsigned char *_blimit, const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; + + const __m128i thresh = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _thresh[0])); + const __m128i limit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _limit[0])); + const __m128i blimit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _blimit[0])); + + q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p)); + q4p4 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p))); + q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p)); + q3p3 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p))); + q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p)); + q2p2 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p))); + q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p)); + q1p1 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p)); + q0p0 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), + _mm_subs_epu8(q0p0, q1p1)); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), + _mm_subs_epu8(p0q0, q0p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), + _mm_subs_epu8(p1q1, q1p1)); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), + _mm_subs_epu8(q1p1, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), + _mm_subs_epu8(q2p2, q3p3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i 
t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); + + /* Filter1 >> 3 */ + filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); + qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128( + _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); + // loopfilter done + + { + __m128i work; + flat = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), + _mm_subs_epu8(q0p0, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), + _mm_subs_epu8(q0p0, q3p3))); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p)); + q5p5 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q5p5), + (__m64 *) (s + 5 * p))); + + q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p)); + q6p6 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q6p6), + (__m64 *) (s + 6 * p))); + + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), + _mm_subs_epu8(q0p0, q4p4)), + _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), + _mm_subs_epu8(q0p0, q5p5))); + + q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p)); + q7p7 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q7p7), + (__m64 *) (s + 7 * p))); + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), + _mm_subs_epu8(q0p0, q6p6)), + _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), + _mm_subs_epu8(q0p0, q7p7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero); + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = 
_mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, + _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, + _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = _mm_add_epi16(eight, + _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16(four, + _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), + 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), + 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), + 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, 
p4_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), + 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), + 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), + 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), + 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6); + _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5); + _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4); + _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3); + _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2); + _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 = _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1); + _mm_storeh_pi((__m64 *) (s + 1 * p), 
_mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0); + _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0)); + } +} + +static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p, + const unsigned char *_blimit, const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + + const __m128i thresh = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _thresh[0])); + const __m128i limit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _limit[0])); + const __m128i blimit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _blimit[0])); + + p4 = _mm_loadu_si128((__m128i *) (s - 5 * p)); + p3 = _mm_loadu_si128((__m128i *) (s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *) (s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *) (s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *) (s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *) (s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *) (s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *) (s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *) (s + 3 * p)); + q4 = _mm_loadu_si128((__m128i *) (s + 4 * p)); + + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, + flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, + flat2_q5, 
flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, + flat_q2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + /* Filter1 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + + /* Filter2 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + // loopfilter done + + { + __m128i work; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), + _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + p5 = _mm_loadu_si128((__m128i *) (s - 6 * p)); + q5 = _mm_loadu_si128((__m128i *) (s + 5 * p)); + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), + _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); + + flat2 = _mm_max_epu8(work, flat2); + p6 = _mm_loadu_si128((__m128i *) (s - 7 * p)); + q6 = _mm_loadu_si128((__m128i *) (s + 6 * p)); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), + _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); + + flat2 = _mm_max_epu8(work, flat2); + + p7 = _mm_loadu_si128((__m128i *) (s - 8 * p)); + q7 = _mm_loadu_si128((__m128i *) (s + 7 * p)); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), + _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m256i eight = _mm256_set1_epi16(8); + const __m256i four = _mm256_set1_epi16(4); + __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, + q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, + p256_0, q256_0; + __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, + pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, + res_q; + + p256_7 = _mm256_cvtepu8_epi16(p7); + 
p256_6 = _mm256_cvtepu8_epi16(p6); + p256_5 = _mm256_cvtepu8_epi16(p5); + p256_4 = _mm256_cvtepu8_epi16(p4); + p256_3 = _mm256_cvtepu8_epi16(p3); + p256_2 = _mm256_cvtepu8_epi16(p2); + p256_1 = _mm256_cvtepu8_epi16(p1); + p256_0 = _mm256_cvtepu8_epi16(p0); + q256_0 = _mm256_cvtepu8_epi16(q0); + q256_1 = _mm256_cvtepu8_epi16(q1); + q256_2 = _mm256_cvtepu8_epi16(q2); + q256_3 = _mm256_cvtepu8_epi16(q3); + q256_4 = _mm256_cvtepu8_epi16(q4); + q256_5 = _mm256_cvtepu8_epi16(q5); + q256_6 = _mm256_cvtepu8_epi16(q6); + q256_7 = _mm256_cvtepu8_epi16(q7); + + pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), + _mm256_add_epi16(p256_4, p256_3)); + pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), + _mm256_add_epi16(q256_4, q256_3)); + + pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0, + _mm256_add_epi16(p256_2, p256_1)); + pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0, + _mm256_add_epi16(q256_2, q256_1)); + pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + + pixelFilter_p = _mm256_add_epi16(eight, + _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); + + pixetFilter_p2p1p0 = _mm256_add_epi16(four, + _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(p256_7, p256_0)), 4); + + flat2_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(q256_7, q256_0)), 4); + + flat2_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(p256_3, p256_0)), 3); + + flat_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(q256_3, q256_0)), 3); + + flat_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(p256_7, p256_7); + + sum_q7 = _mm256_add_epi16(q256_7, q256_7); + + sum_p3 = _mm256_add_epi16(p256_3, p256_3); + + sum_q3 = _mm256_add_epi16(q256_3, q256_3); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_1)), 4); + + flat2_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_1)), 4); + + flat2_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); + + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_1)), 3); + + flat_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_1)), 3); + + flat_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = 
_mm256_add_epi16(sum_q7, q256_7); + + sum_p3 = _mm256_add_epi16(sum_p3, p256_3); + + sum_q3 = _mm256_add_epi16(sum_q3, q256_3); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_2)), 4); + + flat2_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_2)), 4); + + flat2_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); + + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_2)), 3); + + flat_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_2)), 3); + + flat_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_3)), 4); + + flat2_p3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_3)), 4); + + flat2_q3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_4)), 4); + + flat2_p4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_4)), 4); + + flat2_q4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_5)), 4); + + flat2_p5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_5)), 4); + + flat2_q5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_6)), 4); + + flat2_p6 = 
_mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_6)), 4); + + flat2_q6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + p2 = _mm_andnot_si128(flat, p2); + flat_p2 = _mm_and_si128(flat, flat_p2); + p2 = _mm_or_si128(flat_p2, p2); + + p1 = _mm_andnot_si128(flat, ps1); + flat_p1 = _mm_and_si128(flat, flat_p1); + p1 = _mm_or_si128(flat_p1, p1); + + p0 = _mm_andnot_si128(flat, ps0); + flat_p0 = _mm_and_si128(flat, flat_p0); + p0 = _mm_or_si128(flat_p0, p0); + + q0 = _mm_andnot_si128(flat, qs0); + flat_q0 = _mm_and_si128(flat, flat_q0); + q0 = _mm_or_si128(flat_q0, q0); + + q1 = _mm_andnot_si128(flat, qs1); + flat_q1 = _mm_and_si128(flat, flat_q1); + q1 = _mm_or_si128(flat_q1, q1); + + q2 = _mm_andnot_si128(flat, q2); + flat_q2 = _mm_and_si128(flat, flat_q2); + q2 = _mm_or_si128(flat_q2, q2); + + p6 = _mm_andnot_si128(flat2, p6); + flat2_p6 = _mm_and_si128(flat2, flat2_p6); + p6 = _mm_or_si128(flat2_p6, p6); + _mm_storeu_si128((__m128i *) (s - 7 * p), p6); + + p5 = _mm_andnot_si128(flat2, p5); + flat2_p5 = _mm_and_si128(flat2, flat2_p5); + p5 = _mm_or_si128(flat2_p5, p5); + _mm_storeu_si128((__m128i *) (s - 6 * p), p5); + + p4 = _mm_andnot_si128(flat2, p4); + flat2_p4 = _mm_and_si128(flat2, flat2_p4); + p4 = _mm_or_si128(flat2_p4, p4); + _mm_storeu_si128((__m128i *) (s - 5 * p), p4); + + p3 = _mm_andnot_si128(flat2, p3); + flat2_p3 = _mm_and_si128(flat2, flat2_p3); + p3 = _mm_or_si128(flat2_p3, p3); + _mm_storeu_si128((__m128i *) (s - 4 * p), p3); + + p2 = _mm_andnot_si128(flat2, p2); + flat2_p2 = _mm_and_si128(flat2, flat2_p2); + p2 = _mm_or_si128(flat2_p2, p2); + _mm_storeu_si128((__m128i *) (s - 3 * p), p2); + + p1 = _mm_andnot_si128(flat2, p1); + flat2_p1 = _mm_and_si128(flat2, flat2_p1); + p1 = _mm_or_si128(flat2_p1, p1); + _mm_storeu_si128((__m128i *) (s - 2 * p), p1); + + p0 = _mm_andnot_si128(flat2, p0); + flat2_p0 = _mm_and_si128(flat2, flat2_p0); + p0 = _mm_or_si128(flat2_p0, p0); + _mm_storeu_si128((__m128i *) (s - 1 * p), p0); + + q0 = _mm_andnot_si128(flat2, q0); + flat2_q0 = _mm_and_si128(flat2, flat2_q0); + q0 = _mm_or_si128(flat2_q0, q0); + _mm_storeu_si128((__m128i *) (s - 0 * p), q0); + + q1 = _mm_andnot_si128(flat2, q1); + flat2_q1 = _mm_and_si128(flat2, flat2_q1); + q1 = _mm_or_si128(flat2_q1, q1); + _mm_storeu_si128((__m128i *) (s + 1 * p), q1); + + q2 = _mm_andnot_si128(flat2, q2); + flat2_q2 = _mm_and_si128(flat2, flat2_q2); + q2 = _mm_or_si128(flat2_q2, q2); + _mm_storeu_si128((__m128i *) (s + 2 * p), q2); + + q3 = _mm_andnot_si128(flat2, q3); + flat2_q3 = _mm_and_si128(flat2, flat2_q3); + q3 = _mm_or_si128(flat2_q3, q3); + _mm_storeu_si128((__m128i *) (s + 3 * p), q3); + + q4 = _mm_andnot_si128(flat2, q4); + flat2_q4 = _mm_and_si128(flat2, flat2_q4); + q4 = _mm_or_si128(flat2_q4, q4); + _mm_storeu_si128((__m128i *) (s + 4 * p), q4); + + q5 = _mm_andnot_si128(flat2, q5); + flat2_q5 = _mm_and_si128(flat2, flat2_q5); + q5 = _mm_or_si128(flat2_q5, q5); + _mm_storeu_si128((__m128i *) (s + 5 * p), q5); + + q6 = _mm_andnot_si128(flat2, q6); + flat2_q6 = _mm_and_si128(flat2, flat2_q6); + q6 = _mm_or_si128(flat2_q6, q6); + _mm_storeu_si128((__m128i *) (s + 6 * p), q6); + } +} + +void vp9_mb_lpf_horizontal_edge_w_avx2(unsigned char *s, int p, + const unsigned char *_blimit, const unsigned char *_limit, + const 
unsigned char *_thresh, int count) { + if (count == 1) + mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh); + else + mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh); +} diff --git a/vp9/decoder/vp9_dboolhuff.h b/vp9/decoder/vp9_dboolhuff.h index c86451649..fd8e74ca4 100644 --- a/vp9/decoder/vp9_dboolhuff.h +++ b/vp9/decoder/vp9_dboolhuff.h @@ -44,7 +44,7 @@ static int vp9_read(vp9_reader *br, int probability) { VP9_BD_VALUE bigsplit; int count; unsigned int range; - unsigned int split = 1 + (((br->range - 1) * probability) >> 8); + unsigned int split = ((br->range * probability) + (256 - probability)) >> 8; if (br->count < 0) vp9_reader_fill(br); diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 33793eee0..9792d2c6d 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -72,7 +72,7 @@ static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, } if (!cm->frame_parallel_decoding_mode) - update_tx_counts(bsize, context, tx_size, &cm->counts.tx); + ++get_tx_counts(bsize, context, &cm->counts.tx)[tx_size]; return tx_size; } @@ -91,8 +91,8 @@ static TX_SIZE read_tx_size(VP9_COMMON *const cm, MACROBLOCKD *const xd, static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize, int mi_row, int mi_col, int segment_id) { const int mi_offset = mi_row * cm->mi_cols + mi_col; - const int bw = 1 << mi_width_log2(bsize); - const int bh = 1 << mi_height_log2(bsize); + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; const int xmis = MIN(cm->mi_cols - mi_col, bw); const int ymis = MIN(cm->mi_rows - mi_row, bh); int x, y; @@ -149,16 +149,17 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd, return segment_id; } -static uint8_t read_skip_coeff(VP9_COMMON *const cm, MACROBLOCKD *const xd, - int segment_id, vp9_reader *r) { - int skip_coeff = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); - if (!skip_coeff) { +static int read_skip_coeff(VP9_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, vp9_reader *r) { + if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 1; + } else { const int ctx = vp9_get_pred_context_mbskip(xd); - skip_coeff = vp9_read(r, vp9_get_pred_prob_mbskip(cm, xd)); + const int skip = vp9_read(r, cm->fc.mbskip_probs[ctx]); if (!cm->frame_parallel_decoding_mode) - ++cm->counts.mbskip[ctx][skip_coeff]; + ++cm->counts.mbskip[ctx][skip]; + return skip; } - return skip_coeff; } static void read_intra_frame_mode_info(VP9_COMMON *const cm, @@ -311,7 +312,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, } -static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type( +static INLINE INTERPOLATION_TYPE read_switchable_filter_type( VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_reader *r) { const int ctx = vp9_get_pred_context_switchable_interp(xd); const int type = treed_read(r, vp9_switchable_interp_tree, @@ -414,6 +415,7 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd, static void read_inter_block_mode_info(VP9_COMMON *const cm, MACROBLOCKD *const xd, + const TileInfo *const tile, MODE_INFO *const mi, int mi_row, int mi_col, vp9_reader *r) { MB_MODE_INFO *const mbmi = &mi->mbmi; @@ -430,7 +432,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, ref0 = mbmi->ref_frame[0]; is_compound = has_second_ref(mbmi); - vp9_find_mv_refs(cm, xd, mi, xd->last_mi, ref0, mbmi->ref_mvs[ref0], + vp9_find_mv_refs(cm, xd, tile, mi, 
xd->last_mi, ref0, mbmi->ref_mvs[ref0], mi_row, mi_col); inter_mode_ctx = mbmi->mode_context[ref0]; @@ -456,7 +458,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, if (is_compound) { const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1]; - vp9_find_mv_refs(cm, xd, mi, xd->last_mi, + vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi, ref1, mbmi->ref_mvs[ref1], mi_row, mi_col); if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) { @@ -482,12 +484,12 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, b_mode = read_inter_mode(cm, r, inter_mode_ctx); if (b_mode == NEARESTMV || b_mode == NEARMV) { - vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest[0], + vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, &nearest[0], &nearmv[0], j, 0, mi_row, mi_col); if (is_compound) - vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest[1], + vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, &nearest[1], &nearmv[1], j, 1, mi_row, mi_col); } @@ -523,6 +525,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, static void read_inter_frame_mode_info(VP9_COMMON *const cm, MACROBLOCKD *const xd, + const TileInfo *const tile, MODE_INFO *const mi, int mi_row, int mi_col, vp9_reader *r) { MB_MODE_INFO *const mbmi = &mi->mbmi; @@ -537,17 +540,18 @@ static void read_inter_frame_mode_info(VP9_COMMON *const cm, !mbmi->skip_coeff || !inter_block, r); if (inter_block) - read_inter_block_mode_info(cm, xd, mi, mi_row, mi_col, r); + read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r); else read_intra_block_mode_info(cm, mi, r); } void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r) { MODE_INFO *const mi = xd->mi_8x8[0]; const BLOCK_SIZE bsize = mi->mbmi.sb_type; - const int bw = 1 << mi_width_log2(bsize); - const int bh = 1 << mi_height_log2(bsize); + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; const int y_mis = MIN(bh, cm->mi_rows - mi_row); const int x_mis = MIN(bw, cm->mi_cols - mi_col); int x, y, z; @@ -555,7 +559,7 @@ void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd, if (frame_is_intra_only(cm)) read_intra_frame_mode_info(cm, xd, mi, mi_row, mi_col, r); else - read_inter_frame_mode_info(cm, xd, mi, mi_row, mi_col, r); + read_inter_frame_mode_info(cm, xd, tile, mi, mi_row, mi_col, r); for (y = 0, z = 0; y < y_mis; y++, z += cm->mode_info_stride) { for (x = !y; x < x_mis; x++) { diff --git a/vp9/decoder/vp9_decodemv.h b/vp9/decoder/vp9_decodemv.h index cec99f253..8e9ae4a54 100644 --- a/vp9/decoder/vp9_decodemv.h +++ b/vp9/decoder/vp9_decodemv.h @@ -14,7 +14,10 @@ #include "vp9/decoder/vp9_onyxd_int.h" #include "vp9/decoder/vp9_dboolhuff.h" +struct TileInfo; + void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd, + const struct TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r); #endif // VP9_DECODER_VP9_DECODEMV_H_ diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index 3ee8ba41d..4746a3abd 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -37,10 +37,44 @@ #include "vp9/decoder/vp9_thread.h" #include "vp9/decoder/vp9_treereader.h" +typedef struct TileWorkerData { + VP9_COMMON *cm; + vp9_reader bit_reader; + DECLARE_ALIGNED(16, MACROBLOCKD, xd); + DECLARE_ALIGNED(16, unsigned char, token_cache[1024]); +} TileWorkerData; + static int read_be32(const uint8_t *p) { return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; } +static int is_compound_prediction_allowed(const VP9_COMMON *cm) { + int i; + for (i 
= 1; i < ALLOWED_REFS_PER_FRAME; ++i) + if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) + return 1; + + return 0; +} + +static void setup_compound_prediction(VP9_COMMON *cm) { + if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[GOLDEN_FRAME]) { + cm->comp_fixed_ref = ALTREF_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = GOLDEN_FRAME; + } else if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[ALTREF_FRAME]) { + cm->comp_fixed_ref = GOLDEN_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } else { + cm->comp_fixed_ref = LAST_FRAME; + cm->comp_var_ref[0] = GOLDEN_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } +} + // len == 0 is not allowed static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) { return start + len > start && start + len <= end; @@ -76,7 +110,7 @@ static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) { static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) { int i, j; - for (j = 0; j < SWITCHABLE_FILTERS + 1; ++j) + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]); } @@ -98,8 +132,11 @@ static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) { static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) { int i; - cm->comp_pred_mode = cm->allow_comp_inter_inter ? read_comp_pred_mode(r) - : SINGLE_PREDICTION_ONLY; + const int compound_allowed = is_compound_prediction_allowed(cm); + cm->comp_pred_mode = compound_allowed ? read_comp_pred_mode(r) + : SINGLE_PREDICTION_ONLY; + if (compound_allowed) + setup_compound_prediction(cm); if (cm->comp_pred_mode == HYBRID_PREDICTION) for (i = 0; i < COMP_INTER_CONTEXTS; i++) @@ -169,11 +206,49 @@ static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) { xd->plane[i].dequant = cm->uv_dequant[q_index]; } -static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - MACROBLOCKD* const xd = arg; +// Allocate storage for each tile column. +// TODO(jzern): when max_threads <= 1 the same storage could be used for each +// tile. +static void alloc_tile_storage(VP9D_COMP *pbi, int tile_cols) { + VP9_COMMON *const cm = &pbi->common; + const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + int i, tile_col; + + CHECK_MEM_ERROR(cm, pbi->mi_streams, + vpx_realloc(pbi->mi_streams, tile_cols * + sizeof(*pbi->mi_streams))); + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileInfo tile; + + vp9_tile_init(&tile, cm, 0, tile_col); + pbi->mi_streams[tile_col] = + &cm->mi[cm->mi_rows * tile.mi_col_start]; + } + + // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm + // block where mi unit size is 8x8. + CHECK_MEM_ERROR(cm, pbi->above_context[0], + vpx_realloc(pbi->above_context[0], + sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * + 2 * aligned_mi_cols)); + for (i = 1; i < MAX_MB_PLANE; ++i) { + pbi->above_context[i] = pbi->above_context[0] + + i * sizeof(*pbi->above_context[0]) * + 2 * aligned_mi_cols; + } + + // This is sized based on the entire frame. Each tile operates within its + // column bounds. 
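
setup_compound_prediction above encodes a simple rule: compound prediction is allowed only when the three references do not all share the same sign bias, and when it is, the two references that agree in temporal direction become the variable refs while the odd one out becomes the fixed ref. A toy harness, assuming the usual LAST/GOLDEN/ALTREF numbering (this is illustrative, not libvpx code):

#include <stdio.h>

enum { NONE = 0, LAST = 1, GOLDEN = 2, ALTREF = 3 };

/* Returns the frame whose sign bias differs from the other two. */
static int fixed_ref(const int sign_bias[4]) {
  if (sign_bias[LAST] == sign_bias[GOLDEN]) return ALTREF;
  if (sign_bias[LAST] == sign_bias[ALTREF]) return GOLDEN;
  return LAST;
}

int main(void) {
  const int sign_bias[4] = { 0, 0, 0, 1 };          /* only ALTREF inverted */
  printf("fixed ref = %d\n", fixed_ref(sign_bias)); /* prints 3 (ALTREF) */
  return 0;
}
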
+ CHECK_MEM_ERROR(cm, pbi->above_seg_context, + vpx_realloc(pbi->above_seg_context, + sizeof(*pbi->above_seg_context) * + aligned_mi_cols)); +} + +static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block); + int16_t* const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const int stride = pd->dst.stride; const int eob = pd->eobs[block]; if (eob > 0) { @@ -186,40 +261,53 @@ static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize, case TX_4X4: tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block); if (tx_type == DCT_DCT) - xd->itxm_add(qcoeff, dst, stride, eob); + xd->itxm_add(dqcoeff, dst, stride, eob); else - vp9_iht4x4_add(tx_type, qcoeff, dst, stride, eob); + vp9_iht4x4_16_add(dqcoeff, dst, stride, tx_type); break; case TX_8X8: tx_type = get_tx_type_8x8(pd->plane_type, xd); - vp9_iht8x8_add(tx_type, qcoeff, dst, stride, eob); + vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob); break; case TX_16X16: tx_type = get_tx_type_16x16(pd->plane_type, xd); - vp9_iht16x16_add(tx_type, qcoeff, dst, stride, eob); + vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob); break; case TX_32X32: tx_type = DCT_DCT; - vp9_idct32x32_add(qcoeff, dst, stride, eob); + vp9_idct32x32_add(dqcoeff, dst, stride, eob); break; default: assert(!"Invalid transform size"); } if (eob == 1) { - *((int32_t *)qcoeff) = 0; + vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0])); } else { if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10) - vpx_memset(qcoeff, 0, 4 * (4 << tx_size) * sizeof(qcoeff[0])); + vpx_memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0])); + else if (tx_size == TX_32X32 && eob <= 34) + vpx_memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0])); else - vpx_memset(qcoeff, 0, (16 << (tx_size << 1)) * sizeof(qcoeff[0])); + vpx_memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0])); } } } -static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - MACROBLOCKD* const xd = arg; +struct intra_args { + VP9_COMMON *cm; + MACROBLOCKD *xd; + vp9_reader *r; + unsigned char* token_cache; +}; + +static void predict_and_reconstruct_intra_block(int plane, int block, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct intra_args *const args = arg; + VP9_COMMON *const cm = args->cm; + MACROBLOCKD *const xd = args->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; MODE_INFO *const mi = xd->mi_8x8[0]; const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, @@ -238,31 +326,37 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, b_width_log2(plane_bsize), tx_size, mode, dst, pd->dst.stride, dst, pd->dst.stride); - if (!mi->mbmi.skip_coeff) - decode_block(plane, block, plane_bsize, tx_size, arg); + if (!mi->mbmi.skip_coeff) { + vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size, + args->r, args->token_cache); + inverse_transform_block(xd, plane, block, plane_bsize, tx_size); + } } -static int decode_tokens(VP9_COMMON *const cm, MACROBLOCKD *const xd, - BLOCK_SIZE bsize, vp9_reader *r) { - MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - - if (mbmi->skip_coeff) { - reset_skip_context(xd, bsize); - return -1; - } else { - if (cm->seg.enabled) - setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id, - cm->base_qindex)); - - // TODO(dkovalev) if (!vp9_reader_has_error(r)) - 
return vp9_decode_tokens(cm, xd, &cm->seg, r, bsize); - } +struct inter_args { + VP9_COMMON *cm; + MACROBLOCKD *xd; + vp9_reader *r; + int *eobtotal; + unsigned char* token_cache; +}; + +static void reconstruct_inter_block(int plane, int block, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct inter_args *args = arg; + VP9_COMMON *const cm = args->cm; + MACROBLOCKD *const xd = args->xd; + + *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block, + plane_bsize, tx_size, + args->r, args->token_cache); + inverse_transform_block(xd, plane, block, plane_bsize, tx_size); } -static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize, - int mi_row, int mi_col) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; +static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, + const TileInfo *const tile, + BLOCK_SIZE bsize, int mi_row, int mi_col) { const int bh = num_8x8_blocks_high_lookup[bsize]; const int bw = num_8x8_blocks_wide_lookup[bsize]; const int offset = mi_row * cm->mode_info_stride + mi_col; @@ -281,143 +375,163 @@ static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize, // cannot be used. xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL; - set_skip_context(cm, xd, mi_row, mi_col); + set_skip_context(xd, xd->above_context, xd->left_context, mi_row, mi_col); // Distance of Mb to the various image edges. These are specified to 8th pel // as they are always compared to values that are in 1/8th pel units - set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw); + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); - setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], mi_row, mi_col); + setup_dst_planes(xd, get_frame_new_buffer(cm), mi_row, mi_col); } static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd, int idx, int mi_row, int mi_col) { MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; const int ref = mbmi->ref_frame[idx] - LAST_FRAME; - const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[ref]]; - const struct scale_factors *sf = &cm->active_ref_scale[ref]; - if (!vp9_is_valid_scale(sf)) + const YV12_BUFFER_CONFIG *cfg = get_frame_ref_buffer(cm, ref); + const struct scale_factors_common *sfc = &cm->active_ref_scale_comm[ref]; + if (!vp9_is_valid_scale(sfc)) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid scale factors"); - xd->scale_factor[idx] = *sf; - setup_pre_planes(xd, idx, cfg, mi_row, mi_col, sf); + xd->scale_factor[idx].sfc = sfc; + setup_pre_planes(xd, idx, cfg, mi_row, mi_col, &xd->scale_factor[idx]); xd->corrupted |= cfg->corrupted; } -static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, - vp9_reader *r, BLOCK_SIZE bsize, int index) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; +static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd, + const TileInfo *const tile, + int mi_row, int mi_col, + vp9_reader *r, BLOCK_SIZE bsize, + unsigned char *token_cache) { const int less8x8 = bsize < BLOCK_8X8; MB_MODE_INFO *mbmi; - int eobtotal; - - if (less8x8) - if (index > 0) - return; - set_offsets(pbi, bsize, mi_row, mi_col); - vp9_read_mode_info(cm, xd, mi_row, mi_col, r); + set_offsets(cm, xd, tile, bsize, mi_row, mi_col); + vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r); if (less8x8) bsize = BLOCK_8X8; // Has to be called after set_offsets mbmi = &xd->mi_8x8[0]->mbmi; - eobtotal = decode_tokens(cm, xd, bsize, r); - if (!is_inter_block(mbmi)) { - // Intra reconstruction - foreach_transformed_block(xd, bsize, 
decode_block_intra, xd); + if (mbmi->skip_coeff) { + reset_skip_context(xd, bsize); } else { - // Inter reconstruction - const int decode_blocks = (eobtotal > 0); - - if (!less8x8) { - assert(mbmi->sb_type == bsize); - if (eobtotal == 0) - mbmi->skip_coeff = 1; // skip loopfilter - } + if (cm->seg.enabled) + setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id, + cm->base_qindex)); + } + if (!is_inter_block(mbmi)) { + struct intra_args arg = { cm, xd, r, token_cache }; + foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block, + &arg); + } else { + // Setup set_ref(cm, xd, 0, mi_row, mi_col); if (has_second_ref(mbmi)) set_ref(cm, xd, 1, mi_row, mi_col); xd->subpix.filter_x = xd->subpix.filter_y = vp9_get_filter_kernel(mbmi->interp_filter); + + // Prediction vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - if (decode_blocks) - foreach_transformed_block(xd, bsize, decode_block, xd); + // Reconstruction + if (!mbmi->skip_coeff) { + int eobtotal = 0; + struct inter_args arg = { cm, xd, r, &eobtotal, token_cache }; + foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg); + if (!less8x8 && eobtotal == 0) + mbmi->skip_coeff = 1; // skip loopfilter + } } + xd->corrupted |= vp9_reader_has_error(r); } -static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, - vp9_reader* r, BLOCK_SIZE bsize, int index) { - VP9_COMMON *const cm = &pbi->common; +static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs, + int mi_row, int mi_col, BLOCK_SIZE bsize, + vp9_reader *r) { + const int ctx = partition_plane_context(xd->above_seg_context, + xd->left_seg_context, + mi_row, mi_col, bsize); + const vp9_prob *const probs = get_partition_probs(cm, ctx); + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + PARTITION_TYPE p; + + if (has_rows && has_cols) + p = treed_read(r, vp9_partition_tree, probs); + else if (!has_rows && has_cols) + p = vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ; + else if (has_rows && !has_cols) + p = vp9_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT; + else + p = PARTITION_SPLIT; + + if (!cm->frame_parallel_decoding_mode) + ++cm->counts.partition[ctx][p]; + + return p; +} + +static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd, + const TileInfo *const tile, + int mi_row, int mi_col, + vp9_reader* r, BLOCK_SIZE bsize, + unsigned char *token_cache) { const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2; - PARTITION_TYPE partition = PARTITION_NONE; + PARTITION_TYPE partition; BLOCK_SIZE subsize; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - if (bsize < BLOCK_8X8) { - if (index > 0) - return; - } else { - int pl; - const int idx = check_bsize_coverage(hbs, cm->mi_rows, cm->mi_cols, - mi_row, mi_col); - pl = partition_plane_context(cm, mi_row, mi_col, bsize); - - if (idx == 0) - partition = treed_read(r, vp9_partition_tree, - cm->fc.partition_prob[cm->frame_type][pl]); - else if (idx > 0 && - !vp9_read(r, cm->fc.partition_prob[cm->frame_type][pl][idx])) - partition = (idx == 1) ? 
PARTITION_HORZ : PARTITION_VERT; - else - partition = PARTITION_SPLIT; - - if (!cm->frame_parallel_decoding_mode) - ++cm->counts.partition[pl][partition]; - } - + partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r); subsize = get_subsize(bsize, partition); - - switch (partition) { - case PARTITION_NONE: - decode_modes_b(pbi, mi_row, mi_col, r, subsize, 0); - break; - case PARTITION_HORZ: - decode_modes_b(pbi, mi_row, mi_col, r, subsize, 0); - if (mi_row + hbs < cm->mi_rows) - decode_modes_b(pbi, mi_row + hbs, mi_col, r, subsize, 1); - break; - case PARTITION_VERT: - decode_modes_b(pbi, mi_row, mi_col, r, subsize, 0); - if (mi_col + hbs < cm->mi_cols) - decode_modes_b(pbi, mi_row, mi_col + hbs, r, subsize, 1); - break; - case PARTITION_SPLIT: { - int n; - for (n = 0; n < 4; n++) { - const int j = n >> 1, i = n & 1; - decode_modes_sb(pbi, mi_row + j * hbs, mi_col + i * hbs, - r, subsize, n); - } - } break; - default: - assert(!"Invalid partition type"); + if (subsize < BLOCK_8X8) { + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); + } else { + switch (partition) { + case PARTITION_NONE: + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); + break; + case PARTITION_HORZ: + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); + if (mi_row + hbs < cm->mi_rows) + decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize, + token_cache); + break; + case PARTITION_VERT: + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); + if (mi_col + hbs < cm->mi_cols) + decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize, + token_cache); + break; + case PARTITION_SPLIT: + decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize, + token_cache); + decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize, + token_cache); + decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize, + token_cache); + decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize, + token_cache); + break; + default: + assert(!"Invalid partition type"); + } } // update partition context if (bsize >= BLOCK_8X8 && (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) - update_partition_context(cm, mi_row, mi_col, subsize, bsize); + update_partition_context(xd->above_seg_context, xd->left_seg_context, + mi_row, mi_col, subsize, bsize); } static void setup_token_decoder(const uint8_t *data, @@ -453,16 +567,10 @@ static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs, static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode, vp9_reader *r) { - read_coef_probs_common(fc->coef_probs[TX_4X4], r); - - if (tx_mode > ONLY_4X4) - read_coef_probs_common(fc->coef_probs[TX_8X8], r); - - if (tx_mode > ALLOW_8X8) - read_coef_probs_common(fc->coef_probs[TX_16X16], r); - - if (tx_mode > ALLOW_16X16) - read_coef_probs_common(fc->coef_probs[TX_32X32], r); + const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; + TX_SIZE tx_size; + for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) + read_coef_probs_common(fc->coef_probs[tx_size], r); } static void setup_segmentation(struct segmentation *seg, @@ -549,9 +657,8 @@ static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) { return old != *delta_q; } -static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { - MACROBLOCKD *const xd = &pbi->mb; - VP9_COMMON *const cm = &pbi->common; +static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd, + struct vp9_read_bit_buffer *rb) { int update = 0; 
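
The read_coef_probs rewrite above replaces a cascade of "if (tx_mode > ...)" checks with a loop bounded by a per-mode maximum transform size. The lookup table it relies on looks like the following sketch (enum ordering mirrors vp9's; treat the reproduction as illustrative):

typedef enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_SIZES } TX_SIZE;
typedef enum { ONLY_4X4, ALLOW_8X8, ALLOW_16X16, ALLOW_32X32,
               TX_MODE_SELECT, TX_MODES } TX_MODE;

static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
  TX_4X4,    /* ONLY_4X4 */
  TX_8X8,    /* ALLOW_8X8 */
  TX_16X16,  /* ALLOW_16X16 */
  TX_32X32,  /* ALLOW_32X32 */
  TX_32X32,  /* TX_MODE_SELECT: every size is available */
};
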
cm->base_qindex = vp9_rb_read_literal(rb, QINDEX_BITS); @@ -569,12 +676,12 @@ static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { xd->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; } -static INTERPOLATIONFILTERTYPE read_interp_filter_type( - struct vp9_read_bit_buffer *rb) { - const INTERPOLATIONFILTERTYPE literal_to_type[] = { EIGHTTAP_SMOOTH, - EIGHTTAP, - EIGHTTAP_SHARP, - BILINEAR }; +static INTERPOLATION_TYPE read_interp_filter_type( + struct vp9_read_bit_buffer *rb) { + const INTERPOLATION_TYPE literal_to_type[] = { EIGHTTAP_SMOOTH, + EIGHTTAP, + EIGHTTAP_SHARP, + BILINEAR }; return vp9_rb_read_bit(rb) ? SWITCHABLE : literal_to_type[vp9_rb_read_literal(rb, 2)]; } @@ -620,7 +727,7 @@ static void apply_frame_size(VP9D_COMP *pbi, int width, int height) { vp9_update_frame_size(cm); } - vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx], cm->width, cm->height, + vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, VP9BORDERINPIXELS); } @@ -641,7 +748,7 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi, int found = 0, i; for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) { if (vp9_rb_read_bit(rb)) { - YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[i]]; + YV12_BUFFER_CONFIG *const cfg = get_frame_ref_buffer(cm, i); width = cfg->y_crop_width; height = cfg->y_crop_height; found = 1; @@ -660,18 +767,28 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi, setup_display_size(cm, rb); } -static void decode_tile(VP9D_COMP *pbi, vp9_reader *r, int tile_col) { +static void setup_tile_context(VP9D_COMP *const pbi, MACROBLOCKD *const xd, + int tile_col) { + int i; + xd->mi_stream = pbi->mi_streams[tile_col]; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + xd->above_context[i] = pbi->above_context[i]; + } + // see note in alloc_tile_storage(). 
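
read_interp_filter_type above is worth a second look: the 2-bit literal in the bitstream does not index the filter enum directly; a small table reorders it, with EIGHTTAP_SMOOTH at literal 0. A self-contained sketch in which read_bit/read_literal are stand-ins of mine for the vp9_rb_* readers:

typedef enum { EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP, BILINEAR,
               SWITCHABLE } INTERPOLATION_TYPE;

static INTERPOLATION_TYPE read_interp(int (*read_bit)(void),
                                      int (*read_literal)(int bits)) {
  static const INTERPOLATION_TYPE literal_to_type[4] = {
    EIGHTTAP_SMOOTH, EIGHTTAP, EIGHTTAP_SHARP, BILINEAR
  };
  /* one bit selects SWITCHABLE; otherwise two bits pick a fixed filter */
  return read_bit() ? SWITCHABLE : literal_to_type[read_literal(2)];
}
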
+ xd->above_seg_context = pbi->above_seg_context; +} + +static void decode_tile(VP9D_COMP *pbi, const TileInfo *const tile, + vp9_reader *r) { const int num_threads = pbi->oxcf.max_threads; VP9_COMMON *const cm = &pbi->common; int mi_row, mi_col; - YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[cm->new_fb_idx]; MACROBLOCKD *xd = &pbi->mb; - xd->mi_stream = pbi->mi_streams[tile_col]; - if (pbi->do_loopfilter_inline) { LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; - lf_data->frame_buffer = fb; + lf_data->frame_buffer = get_frame_new_buffer(cm); lf_data->cm = cm; lf_data->xd = pbi->mb; lf_data->stop = 0; @@ -679,14 +796,15 @@ static void decode_tile(VP9D_COMP *pbi, vp9_reader *r, int tile_col) { vp9_loop_filter_frame_init(cm, cm->lf.filter_level); } - for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end; + for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += MI_BLOCK_SIZE) { // For a SB there are 2 left contexts, each pertaining to a MB row within - vp9_zero(cm->left_context); - vp9_zero(cm->left_seg_context); - for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + vp9_zero(xd->left_context); + vp9_zero(xd->left_seg_context); + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) - decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_64X64, 0); + decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64, + pbi->token_cache); if (pbi->do_loopfilter_inline) { const int lf_start = mi_row - MI_BLOCK_SIZE; @@ -696,7 +814,7 @@ static void decode_tile(VP9D_COMP *pbi, vp9_reader *r, int tile_col) { if (lf_start < 0) continue; // decoding has completed: finish up the loop filter in this thread. - if (mi_row + MI_BLOCK_SIZE >= cm->cur_tile_mi_row_end) continue; + if (mi_row + MI_BLOCK_SIZE >= tile->mi_row_end) continue; vp9_worker_sync(&pbi->lf_worker); lf_data->start = lf_start; @@ -735,10 +853,32 @@ static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { cm->log2_tile_rows += vp9_rb_read_bit(rb); } +// Reads the next tile returning its size and adjusting '*data' accordingly +// based on 'is_last'. +static size_t get_tile(const uint8_t *const data_end, + int is_last, + struct vpx_internal_error_info *error_info, + const uint8_t **data) { + size_t size; + + if (!is_last) { + if (!read_is_valid(*data, 4, data_end)) + vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile length"); + + size = read_be32(*data); + *data += 4; + } else { + size = data_end - *data; + } + return size; +} + static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { vp9_reader residual_bc; VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; const uint8_t *const data_end = pbi->source + pbi->source_sz; const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); @@ -748,70 +888,57 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { // Note: this memset assumes above_context[0], [1] and [2] // are allocated as part of the same buffer. 
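
The get_tile helper introduced above centralizes tile framing that used to be open-coded in two places: every tile except the last carries a 4-byte big-endian length prefix, and the last tile simply runs to the end of the packet. Equivalent standalone logic, with the error handling omitted:

#include <stddef.h>
#include <stdint.h>

static uint32_t be32(const uint8_t *p) {
  return ((uint32_t)p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
}

/* Returns the next tile's size and advances *data past the size field. */
static size_t next_tile_size(const uint8_t *data_end, int is_last,
                             const uint8_t **data) {
  size_t size;
  if (is_last) {
    size = (size_t)(data_end - *data);  /* implicit: rest of the packet */
  } else {
    size = be32(*data);                 /* explicit big-endian length */
    *data += 4;
  }
  return size;
}
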
- vpx_memset(cm->above_context[0], 0, - sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE * (2 * aligned_mi_cols)); + vpx_memset(pbi->above_context[0], 0, + sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * + 2 * aligned_mi_cols); - vpx_memset(cm->above_seg_context, 0, - sizeof(PARTITION_CONTEXT) * aligned_mi_cols); + vpx_memset(pbi->above_seg_context, 0, + sizeof(*pbi->above_seg_context) * aligned_mi_cols); if (pbi->oxcf.inv_tile_order) { const uint8_t *data_ptr2[4][1 << 6]; vp9_reader bc_bak = {0}; - // pre-initialize the offsets, we're going to read in inverse order + // pre-initialize the offsets, we're going to decode in inverse order data_ptr2[0][0] = data; for (tile_row = 0; tile_row < tile_rows; tile_row++) { - if (tile_row) { - const int size = read_be32(data_ptr2[tile_row - 1][tile_cols - 1]); - data_ptr2[tile_row - 1][tile_cols - 1] += 4; - data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][tile_cols - 1] + size; - } - - for (tile_col = 1; tile_col < tile_cols; tile_col++) { - const int size = read_be32(data_ptr2[tile_row][tile_col - 1]); - data_ptr2[tile_row][tile_col - 1] += 4; - data_ptr2[tile_row][tile_col] = - data_ptr2[tile_row][tile_col - 1] + size; + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + const int last_tile = + tile_row == tile_rows - 1 && tile_col == tile_cols - 1; + const size_t size = get_tile(data_end, last_tile, &cm->error, &data); + data_ptr2[tile_row][tile_col] = data; + data += size; } } for (tile_row = 0; tile_row < tile_rows; tile_row++) { - vp9_get_tile_row_offsets(cm, tile_row); for (tile_col = tile_cols - 1; tile_col >= 0; tile_col--) { - vp9_get_tile_col_offsets(cm, tile_col); + TileInfo tile; + + vp9_tile_init(&tile, cm, tile_row, tile_col); setup_token_decoder(data_ptr2[tile_row][tile_col], data_end, data_end - data_ptr2[tile_row][tile_col], &cm->error, &residual_bc); - decode_tile(pbi, &residual_bc, tile_col); + setup_tile_context(pbi, xd, tile_col); + decode_tile(pbi, &tile, &residual_bc); if (tile_row == tile_rows - 1 && tile_col == tile_cols - 1) bc_bak = residual_bc; } } residual_bc = bc_bak; } else { - int has_more; - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - vp9_get_tile_row_offsets(cm, tile_row); for (tile_col = 0; tile_col < tile_cols; tile_col++) { - size_t size; + const int last_tile = + tile_row == tile_rows - 1 && tile_col == tile_cols - 1; + const size_t size = get_tile(data_end, last_tile, &cm->error, &data); + TileInfo tile; - vp9_get_tile_col_offsets(cm, tile_col); - - has_more = tile_col < tile_cols - 1 || tile_row < tile_rows - 1; - if (has_more) { - if (!read_is_valid(data, 4, data_end)) - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Truncated packet or corrupt tile length"); - - size = read_be32(data); - data += 4; - } else { - size = data_end - data; - } + vp9_tile_init(&tile, cm, tile_row, tile_col); setup_token_decoder(data, data_end, size, &cm->error, &residual_bc); - decode_tile(pbi, &residual_bc, tile_col); + setup_tile_context(pbi, xd, tile_col); + decode_tile(pbi, &tile, &residual_bc); data += size; } } @@ -820,10 +947,113 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { return vp9_reader_find_end(&residual_bc); } +static int tile_worker_hook(void *arg1, void *arg2) { + TileWorkerData *tile_data = (TileWorkerData*)arg1; + const TileInfo *const tile = (TileInfo*)arg2; + int mi_row, mi_col; + + for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; + mi_row += MI_BLOCK_SIZE) { + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + 
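
tile_worker_hook, whose body this hunk sits in, follows the vpx worker contract: a hook receives two opaque pointers (here the per-tile scratch and the TileInfo) and returns nonzero on success, which is how per-tile corruption is folded into a single frame status at sync time. A minimal sketch; FakeTileData is a placeholder type of mine:

typedef int (*VP9WorkerHook)(void *data1, void *data2);

typedef struct { int corrupted; } FakeTileData;

static int example_hook(void *arg1, void *arg2) {
  FakeTileData *const td = (FakeTileData *)arg1;
  (void)arg2;             /* the patch passes the TileInfo here */
  /* ... decode one tile, setting td->corrupted on bitstream errors ... */
  return !td->corrupted;  /* zero marks the whole frame corrupted */
}

In decode_tiles_mt below, the main thread runs the last job of each batch itself via vp9_worker_execute, launches the rest with vp9_worker_launch, then syncs and ORs each failure into pbi->mb.corrupted.
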
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; + mi_col += MI_BLOCK_SIZE) { + decode_modes_sb(tile_data->cm, &tile_data->xd, tile, + mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64, + tile_data->token_cache); + } + } + return !tile_data->xd.corrupted; +} + +static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) { + VP9_COMMON *const cm = &pbi->common; + const uint8_t *const data_end = pbi->source + pbi->source_sz; + const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols); + int tile_col = 0; + + assert(tile_rows == 1); + (void)tile_rows; + + if (num_workers > pbi->num_tile_workers) { + int i; + CHECK_MEM_ERROR(cm, pbi->tile_workers, + vpx_realloc(pbi->tile_workers, + num_workers * sizeof(*pbi->tile_workers))); + for (i = pbi->num_tile_workers; i < num_workers; ++i) { + VP9Worker *const worker = &pbi->tile_workers[i]; + ++pbi->num_tile_workers; + + vp9_worker_init(worker); + worker->hook = (VP9WorkerHook)tile_worker_hook; + CHECK_MEM_ERROR(cm, worker->data1, + vpx_memalign(32, sizeof(TileWorkerData))); + CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo))); + if (i < num_workers - 1 && !vp9_worker_reset(worker)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Tile decoder thread creation failed"); + } + } + } + + // Note: this memset assumes above_context[0], [1] and [2] + // are allocated as part of the same buffer. + vpx_memset(pbi->above_context[0], 0, + sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * + 2 * aligned_mi_cols); + vpx_memset(pbi->above_seg_context, 0, + sizeof(*pbi->above_seg_context) * aligned_mi_cols); + + while (tile_col < tile_cols) { + int i; + for (i = 0; i < num_workers && tile_col < tile_cols; ++i) { + VP9Worker *const worker = &pbi->tile_workers[i]; + TileWorkerData *const tile_data = (TileWorkerData*)worker->data1; + TileInfo *const tile = (TileInfo*)worker->data2; + const size_t size = + get_tile(data_end, tile_col == tile_cols - 1, &cm->error, &data); + + tile_data->cm = cm; + tile_data->xd = pbi->mb; + tile_data->xd.corrupted = 0; + vp9_tile_init(tile, tile_data->cm, 0, tile_col); + + setup_token_decoder(data, data_end, size, &cm->error, + &tile_data->bit_reader); + setup_tile_context(pbi, &tile_data->xd, tile_col); + + worker->had_error = 0; + if (i == num_workers - 1 || tile_col == tile_cols - 1) { + vp9_worker_execute(worker); + } else { + vp9_worker_launch(worker); + } + + data += size; + ++tile_col; + } + + for (; i > 0; --i) { + VP9Worker *const worker = &pbi->tile_workers[i - 1]; + pbi->mb.corrupted |= !vp9_worker_sync(worker); + } + } + + { + const int final_worker = (tile_cols + num_workers - 1) % num_workers; + TileWorkerData *const tile_data = + (TileWorkerData*)pbi->tile_workers[final_worker].data1; + return vp9_reader_find_end(&tile_data->bit_reader); + } +} + static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { - if (vp9_rb_read_literal(rb, 8) != SYNC_CODE_0 || - vp9_rb_read_literal(rb, 8) != SYNC_CODE_1 || - vp9_rb_read_literal(rb, 8) != SYNC_CODE_2) { + if (vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_0 || + vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_1 || + vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_2) { vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid frame sync code"); } @@ -834,34 +1064,6 @@ static void error_handler(void *data, size_t bit_offset) { 
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet"); } -static void setup_inter_inter(VP9_COMMON *cm) { - int i; - - cm->allow_comp_inter_inter = 0; - for (i = 1; i < ALLOWED_REFS_PER_FRAME; ++i) - cm->allow_comp_inter_inter |= - cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]; - - if (cm->allow_comp_inter_inter) { - // which one is always-on in comp inter-inter? - if (cm->ref_frame_sign_bias[LAST_FRAME] == - cm->ref_frame_sign_bias[GOLDEN_FRAME]) { - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; - } else if (cm->ref_frame_sign_bias[LAST_FRAME] == - cm->ref_frame_sign_bias[ALTREF_FRAME]) { - cm->comp_fixed_ref = GOLDEN_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = ALTREF_FRAME; - } else { - cm->comp_fixed_ref = LAST_FRAME; - cm->comp_var_ref[0] = GOLDEN_FRAME; - cm->comp_var_ref[1] = ALTREF_FRAME; - } - } -} - #define RESERVED \ if (vp9_rb_read_bit(rb)) \ vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, \ @@ -875,7 +1077,7 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, cm->last_frame_type = cm->frame_type; - if (vp9_rb_read_literal(rb, 2) != 0x2) + if (vp9_rb_read_literal(rb, 2) != VP9_FRAME_MARKER) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid frame marker"); @@ -896,12 +1098,10 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, cm->error_resilient_mode = vp9_rb_read_bit(rb); if (cm->frame_type == KEY_FRAME) { - int csp; - check_sync_code(cm, rb); - csp = vp9_rb_read_literal(rb, 3); // colorspace - if (csp != 7) { // != sRGB + cm->color_space = vp9_rb_read_literal(rb, 3); // colorspace + if (cm->color_space != SRGB) { vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range if (cm->version == 1) { cm->subsampling_x = vp9_rb_read_bit(rb); @@ -953,8 +1153,6 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) vp9_setup_scale_factors(cm, i); - - setup_inter_inter(cm); } } @@ -974,13 +1172,17 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, vp9_setup_past_independence(cm); setup_loopfilter(&cm->lf, rb); - setup_quantization(pbi, rb); + setup_quantization(cm, &pbi->mb, rb); setup_segmentation(&cm->seg, rb); setup_tile_info(cm, rb); sz = vp9_rb_read_literal(rb, 16); - return sz > 0 ? 
sz : -1; + if (sz == 0) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid header size"); + + return sz; } static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data, @@ -1023,7 +1225,7 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data, for (j = 0; j < PARTITION_CONTEXTS; ++j) for (i = 0; i < PARTITION_TYPES - 1; ++i) - vp9_diff_update_prob(&r, &fc->partition_prob[INTER_FRAME][j][i]); + vp9_diff_update_prob(&r, &fc->partition_prob[j][i]); read_mv_probs(&r, nmvc, cm->allow_high_precision_mv); } @@ -1087,69 +1289,65 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { MACROBLOCKD *const xd = &pbi->mb; const uint8_t *data = pbi->source; - const uint8_t *data_end = pbi->source + pbi->source_sz; + const uint8_t *const data_end = pbi->source + pbi->source_sz; - struct vp9_read_bit_buffer rb = { data, data_end, 0, - cm, error_handler }; + struct vp9_read_bit_buffer rb = { data, data_end, 0, cm, error_handler }; const size_t first_partition_size = read_uncompressed_header(pbi, &rb); const int keyframe = cm->frame_type == KEY_FRAME; - YV12_BUFFER_CONFIG *new_fb = &cm->yv12_fb[cm->new_fb_idx]; + const int tile_rows = 1 << cm->log2_tile_rows; const int tile_cols = 1 << cm->log2_tile_cols; - int tile_col; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); if (!first_partition_size) { - if (!keyframe) { // showing a frame directly *p_data_end = data + 1; return 0; - } else { - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Invalid key frame"); - return -1; - } } - data += vp9_rb_bytes_read(&rb); - xd->corrupted = 0; - new_fb->corrupted = 0; - pbi->do_loopfilter_inline = - (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level; if (!pbi->decoded_key_frame && !keyframe) return -1; + data += vp9_rb_bytes_read(&rb); if (!read_is_valid(data, first_partition_size, data_end)) vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt header length"); - setup_plane_dequants(cm, &pbi->mb, cm->base_qindex); + pbi->do_loopfilter_inline = + (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level; + if (pbi->do_loopfilter_inline && pbi->lf_worker.data1 == NULL) { + CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, vpx_malloc(sizeof(LFWorkerData))); + pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker; + if (pbi->oxcf.max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Loop filter thread creation failed"); + } + } + + alloc_tile_storage(pbi, tile_cols); xd->mi_8x8 = cm->mi_grid_visible; xd->mode_info_stride = cm->mode_info_stride; + set_prev_mi(cm); - CHECK_MEM_ERROR(cm, pbi->mi_streams, - vpx_realloc(pbi->mi_streams, tile_cols * - sizeof(*pbi->mi_streams))); - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - vp9_get_tile_col_offsets(cm, tile_col); - pbi->mi_streams[tile_col] = - &cm->mi[cm->mi_rows * cm->cur_tile_mi_col_start]; - } + setup_plane_dequants(cm, xd, cm->base_qindex); + setup_block_dptrs(xd, cm->subsampling_x, cm->subsampling_y); cm->fc = cm->frame_contexts[cm->frame_context_idx]; - vp9_zero(cm->counts); - - new_fb->corrupted |= read_compressed_header(pbi, data, first_partition_size); - - setup_block_dptrs(xd, cm->subsampling_x, cm->subsampling_y); - - // clear out the coeff buffer for (i = 0; i < MAX_MB_PLANE; ++i) - vp9_zero(xd->plane[i].qcoeff); + vp9_zero(xd->plane[i].dqcoeff); - set_prev_mi(cm); + xd->corrupted = 0; + new_fb->corrupted = read_compressed_header(pbi, data, 
first_partition_size); - *p_data_end = decode_tiles(pbi, data + first_partition_size); + // TODO(jzern): remove frame_parallel_decoding_mode restriction for + // single-frame tile decoding. + if (pbi->oxcf.max_threads > 1 && tile_rows == 1 && tile_cols > 1 && + cm->frame_parallel_decoding_mode) { + *p_data_end = decode_tiles_mt(pbi, data + first_partition_size); + } else { + *p_data_end = decode_tiles(pbi, data + first_partition_size); + } cm->last_width = cm->width; cm->last_height = cm->height; diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 0d0f0dfe0..010b8fe33 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -70,28 +70,28 @@ static const vp9_prob cat6_prob[15] = { DCT_EOB_MODEL_TOKEN : TWO_TOKEN) : \ token]; \ } \ - token_cache[scan[c]] = vp9_pt_energy_class[token]; \ } while (0) #define WRITE_COEF_CONTINUE(val, token) \ { \ - qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \ + dqcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \ dq[c > 0] / (1 + (tx_size == TX_32X32)); \ INCREMENT_COUNT(token); \ + token_cache[scan[c]] = vp9_pt_energy_class[token]; \ c++; \ continue; \ } -#define ADJUST_COEF(prob, bits_count) \ - do { \ - if (vp9_read(r, prob)) \ - val += 1 << bits_count; \ +#define ADJUST_COEF(prob, bits_count) \ + do { \ + val += (vp9_read(r, prob) << bits_count); \ } while (0); static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, vp9_reader *r, int block_idx, - PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr, - TX_SIZE tx_size, const int16_t *dq, int pt) { + PLANE_TYPE type, int seg_eob, int16_t *dqcoeff_ptr, + TX_SIZE tx_size, const int16_t *dq, int pt, + uint8_t *token_cache) { const FRAME_CONTEXT *const fc = &cm->fc; FRAME_COUNTS *const counts = &cm->counts; const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi); @@ -104,7 +104,6 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, vp9_coeff_count_model *coef_counts = counts->coef[tx_size]; const int16_t *scan, *nb; const uint8_t *const band_translate = get_band_translate(tx_size); - uint8_t token_cache[1024]; get_scan(xd, tx_size, type, block_idx, &scan, &nb); while (1) { @@ -131,6 +130,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) { INCREMENT_COUNT(ZERO_TOKEN); + token_cache[scan[c]] = vp9_pt_energy_class[ZERO_TOKEN]; ++c; goto SKIP_START; } @@ -210,45 +210,26 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, return c; } -struct decode_block_args { - VP9_COMMON *cm; - MACROBLOCKD *xd; - struct segmentation *seg; - vp9_reader *r; - int *eobtotal; -}; - -static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *argv) { - const struct decode_block_args* const arg = argv; - - // find the maximum eob for this transform size, adjusted by segment - MACROBLOCKD *xd = arg->xd; - const struct segmentation *seg = arg->seg; - struct macroblockd_plane* pd = &xd->plane[plane]; - const int segment_id = xd->mi_8x8[0]->mbmi.segment_id; - const int seg_eob = get_tx_eob(seg, segment_id, tx_size); +int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, + int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, vp9_reader *r, + uint8_t *token_cache) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id, + tx_size); int aoff, loff, eob, pt; - txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff); pt = 
get_entropy_context(tx_size, pd->above_context + aoff, pd->left_context + loff); - eob = decode_coefs(arg->cm, xd, arg->r, block, - pd->plane_type, seg_eob, BLOCK_OFFSET(pd->qcoeff, block), - tx_size, pd->dequant, pt); + eob = decode_coefs(cm, xd, r, block, + pd->plane_type, seg_eob, BLOCK_OFFSET(pd->dqcoeff, block), + tx_size, pd->dequant, pt, token_cache); set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff); pd->eobs[block] = eob; - *arg->eobtotal += eob; + return eob; } -int vp9_decode_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, - struct segmentation *seg, - vp9_reader *r, BLOCK_SIZE bsize) { - int eobtotal = 0; - struct decode_block_args args = {cm, xd, seg, r, &eobtotal}; - foreach_transformed_block(xd, bsize, decode_block, &args); - return eobtotal; -} + diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h index 0fb4c3cc9..04939ead3 100644 --- a/vp9/decoder/vp9_detokenize.h +++ b/vp9/decoder/vp9_detokenize.h @@ -15,8 +15,9 @@ #include "vp9/decoder/vp9_onyxd_int.h" #include "vp9/decoder/vp9_dboolhuff.h" -int vp9_decode_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, - struct segmentation *seg, - vp9_reader *r, BLOCK_SIZE bsize); +int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, + int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, vp9_reader *r, + uint8_t *token_cache); #endif // VP9_DECODER_VP9_DETOKENIZE_H_ diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c index 243dbef21..5f970a3d5 100644 --- a/vp9/decoder/vp9_onyxd_if.c +++ b/vp9/decoder/vp9_onyxd_if.c @@ -142,18 +142,12 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) { pbi->decoded_key_frame = 0; vp9_worker_init(&pbi->lf_worker); - pbi->lf_worker.data1 = vpx_malloc(sizeof(LFWorkerData)); - pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker; - if (pbi->lf_worker.data1 == NULL || - (pbi->oxcf.max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker))) { - vp9_remove_decompressor(pbi); - return NULL; - } return pbi; } void vp9_remove_decompressor(VP9D_PTR ptr) { + int i; VP9D_COMP *const pbi = (VP9D_COMP *)ptr; if (!pbi) @@ -162,7 +156,16 @@ void vp9_remove_decompressor(VP9D_PTR ptr) { vp9_remove_common(&pbi->common); vp9_worker_end(&pbi->lf_worker); vpx_free(pbi->lf_worker.data1); + for (i = 0; i < pbi->num_tile_workers; ++i) { + VP9Worker *const worker = &pbi->tile_workers[i]; + vp9_worker_end(worker); + vpx_free(worker->data1); + vpx_free(worker->data2); + } + vpx_free(pbi->tile_workers); vpx_free(pbi->mi_streams); + vpx_free(pbi->above_context[0]); + vpx_free(pbi->above_seg_context); vpx_free(pbi); } @@ -176,7 +179,6 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd) { VP9D_COMP *pbi = (VP9D_COMP *) ptr; VP9_COMMON *cm = &pbi->common; - int ref_fb_idx; /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the * encoder is using the frame buffers for. This is just a stub to keep the @@ -184,18 +186,15 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr, * later commit that adds VP9-specific controls for this functionality. 
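
The matching teardown for the tile workers (added to vp9_remove_decompressor a few hunks above) has one ordering requirement: each thread must be joined via vp9_worker_end before the two heap blocks it owns are freed. Condensed into a sketch with stand-in types of mine (the real declarations live in vp9/decoder/vp9_thread.h):

#include <stdlib.h>

typedef struct { void *data1, *data2; } VP9Worker;   /* stand-in */
static void vp9_worker_end(VP9Worker *w) { (void)w;  /* joins the thread */ }

static void free_tile_workers(VP9Worker *workers, int num_workers) {
  int i;
  for (i = 0; i < num_workers; ++i) {
    vp9_worker_end(&workers[i]);  /* join before freeing its data */
    free(workers[i].data1);       /* TileWorkerData */
    free(workers[i].data2);       /* TileInfo */
  }
  free(workers);
}
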
*/ if (ref_frame_flag == VP9_LAST_FLAG) { - ref_fb_idx = cm->ref_frame_map[0]; + YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->ref_frame_map[0]]; + if (!equal_dimensions(cfg, sd)) + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + vp8_yv12_copy_frame(cfg, sd); } else { vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Invalid reference frame"); - return cm->error.error_code; - } - - if (!equal_dimensions(&cm->yv12_fb[ref_fb_idx], sd)) { - vpx_internal_error(&cm->error, VPX_CODEC_ERROR, - "Incorrect buffer dimensions"); - } else { - vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd); } return cm->error.error_code; @@ -267,7 +266,7 @@ static void swap_frame_buffers(VP9D_COMP *pbi) { ++ref_index; } - cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx]; + cm->frame_to_show = get_frame_new_buffer(cm); cm->fb_idx_ref_cnt[cm->new_fb_idx]--; // Invalidate these references until the next frame starts. @@ -305,7 +304,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, * thing to do here. */ if (cm->active_ref_idx[0] != INT_MAX) - cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1; + get_frame_ref_buffer(cm, 0)->corrupted = 1; } cm->new_fb_idx = get_free_fb(cm); @@ -322,7 +321,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, * thing to do here. */ if (cm->active_ref_idx[0] != INT_MAX) - cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1; + get_frame_ref_buffer(cm, 0)->corrupted = 1; if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0) cm->fb_idx_ref_cnt[cm->new_fb_idx]--; diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h index 68b30347e..7c4c9db36 100644 --- a/vp9/decoder/vp9_onyxd_int.h +++ b/vp9/decoder/vp9_onyxd_int.h @@ -40,9 +40,17 @@ typedef struct VP9Decompressor { int do_loopfilter_inline; // apply loopfilter to available rows immediately VP9Worker lf_worker; + VP9Worker *tile_workers; + int num_tile_workers; + /* Each tile column has its own MODE_INFO stream. This array indexes them by tile column index. 
*/ MODE_INFO **mi_streams; + + ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; + PARTITION_CONTEXT *above_seg_context; + + DECLARE_ALIGNED(16, unsigned char, token_cache[1024]); } VP9D_COMP; #endif // VP9_DECODER_VP9_ONYXD_INT_H_ diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 8378a78e1..87bd36c2b 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -53,8 +53,7 @@ extern unsigned int active_section; int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZES]; int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 1]; int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 2]; -int64_t switchable_interp_stats[SWITCHABLE_FILTERS+1] - [SWITCHABLE_FILTERS]; +int64_t switchable_interp_stats[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; void init_tx_count_stats() { vp9_zero(tx_count_32x32p_stats); @@ -87,10 +86,9 @@ static void update_tx_count_stats(VP9_COMMON *cm) { static void update_switchable_interp_stats(VP9_COMMON *cm) { int i, j; - for (i = 0; i < SWITCHABLE_FILTERS+1; ++i) - for (j = 0; j < SWITCHABLE_FILTERS; ++j) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + for (j = 0; j < SWITCHABLE_FILTERS; ++j) switchable_interp_stats[i][j] += cm->fc.switchable_interp_count[i][j]; - } } void write_tx_count_stats() { @@ -140,9 +138,9 @@ void write_switchable_interp_stats() { fclose(fp); printf( - "vp9_default_switchable_filter_count[SWITCHABLE_FILTERS+1]" + "vp9_default_switchable_filter_count[SWITCHABLE_FILTER_CONTEXTS]" "[SWITCHABLE_FILTERS] = {\n"); - for (i = 0; i < SWITCHABLE_FILTERS+1; i++) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { printf(" { "); for (j = 0; j < SWITCHABLE_FILTERS; j++) { printf("%"PRId64", ", switchable_interp_stats[i][j]); @@ -165,18 +163,13 @@ void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb, vp9_wb_write_literal(wb, data, get_unsigned_bits(max)); } -static void update_mode( - vp9_writer *w, - int n, - vp9_tree tree, - vp9_prob Pnew[/* n-1 */], - vp9_prob Pcur[/* n-1 */], - unsigned int bct[/* n-1 */] [2], - const unsigned int num_events[/* n */] -) { +static void update_mode(vp9_writer *w, int n, vp9_tree tree, + vp9_prob Pcur[/* n-1 */], + unsigned int bct[/* n-1 */][2], + const unsigned int num_events[/* n */]) { int i = 0; - vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0); + vp9_tree_probs_from_distribution(tree, bct, num_events, 0); n--; for (i = 0; i < n; ++i) @@ -187,11 +180,10 @@ static void update_mbintra_mode_probs(VP9_COMP* const cpi, vp9_writer* const bc) { VP9_COMMON *const cm = &cpi->common; int j; - vp9_prob pnew[INTRA_MODES - 1]; unsigned int bct[INTRA_MODES - 1][2]; for (j = 0; j < BLOCK_SIZE_GROUPS; j++) - update_mode(bc, INTRA_MODES, vp9_intra_mode_tree, pnew, + update_mode(bc, INTRA_MODES, vp9_intra_mode_tree, cm->fc.y_mode_prob[j], bct, (unsigned int *)cpi->y_mode_count[j]); } @@ -233,44 +225,35 @@ static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) { write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m); } -static void update_switchable_interp_probs(VP9_COMP *const cpi, - vp9_writer* const bc) { +static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) { VP9_COMMON *const cm = &cpi->common; - unsigned int branch_ct[SWITCHABLE_FILTERS + 1] - [SWITCHABLE_FILTERS - 1][2]; - vp9_prob new_prob[SWITCHABLE_FILTERS + 1][SWITCHABLE_FILTERS - 1]; + unsigned int branch_ct[SWITCHABLE_FILTERS - 1][2]; int i, j; - for (j = 0; j <= SWITCHABLE_FILTERS; ++j) { - vp9_tree_probs_from_distribution( - 
vp9_switchable_interp_tree, - new_prob[j], branch_ct[j], - cm->counts.switchable_interp[j], 0); - } - for (j = 0; j <= SWITCHABLE_FILTERS; ++j) { - for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) { - vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i], - branch_ct[j][i]); - } + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) { + vp9_tree_probs_from_distribution(vp9_switchable_interp_tree, branch_ct, + cm->counts.switchable_interp[j], 0); + + for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) + vp9_cond_prob_diff_update(w, &cm->fc.switchable_interp_prob[j][i], + branch_ct[i]); } + #ifdef MODE_STATS if (!cpi->dummy_packing) update_switchable_interp_stats(cm); #endif } -static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer* const bc) { +static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer *w) { int i, j; for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { unsigned int branch_ct[INTER_MODES - 1][2]; - vp9_prob new_prob[INTER_MODES - 1]; - - vp9_tree_probs_from_distribution(vp9_inter_mode_tree, - new_prob, branch_ct, + vp9_tree_probs_from_distribution(vp9_inter_mode_tree, branch_ct, cm->counts.inter_mode[i], NEARESTMV); for (j = 0; j < INTER_MODES - 1; ++j) - vp9_cond_prob_diff_update(bc, &cm->fc.inter_mode_probs[i][j], + vp9_cond_prob_diff_update(w, &cm->fc.inter_mode_probs[i][j], branch_ct[j]); } } @@ -561,7 +544,8 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, write_intra_mode(bc, m->mbmi.uv_mode, vp9_kf_uv_mode_prob[ym]); } -static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, +static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mi_8x8, vp9_writer *bc, TOKENEXTRA **tok, TOKENEXTRA *tok_end, int mi_row, int mi_col, int index) { VP9_COMMON *const cm = &cpi->common; @@ -574,9 +558,10 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, xd->mi_8x8 = mi_8x8; - set_mi_row_col(&cpi->common, xd, + set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type], - mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type]); + mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type], + cm->mi_rows, cm->mi_cols); if (frame_is_intra_only(cm)) { write_mb_modes_kf(cpi, mi_8x8, bc); #ifdef ENTROPY_STATS @@ -593,7 +578,31 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, pack_mb_tokens(bc, tok, tok_end); } -static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, +static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col, + PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) { + VP9_COMMON *const cm = &cpi->common; + const int ctx = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row, mi_col, bsize); + const vp9_prob *const probs = get_partition_probs(cm, ctx); + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + + if (has_rows && has_cols) { + write_token(w, vp9_partition_tree, probs, &vp9_partition_encodings[p]); + } else if (!has_rows && has_cols) { + assert(p == PARTITION_SPLIT || p == PARTITION_HORZ); + vp9_write(w, p == PARTITION_SPLIT, probs[1]); + } else if (has_rows && !has_cols) { + assert(p == PARTITION_SPLIT || p == PARTITION_VERT); + vp9_write(w, p == PARTITION_SPLIT, probs[2]); + } else { + assert(p == PARTITION_SPLIT); + } +} + +static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mi_8x8, vp9_writer *bc, TOKENEXTRA **tok, TOKENEXTRA *tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize, 
int index) { @@ -615,42 +624,32 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, if (index > 0) return; } else { - int pl; - const int idx = check_bsize_coverage(bs, cm->mi_rows, cm->mi_cols, - mi_row, mi_col); - pl = partition_plane_context(cm, mi_row, mi_col, bsize); - // encode the partition information - if (idx == 0) - write_token(bc, vp9_partition_tree, - cm->fc.partition_prob[cm->frame_type][pl], - vp9_partition_encodings + partition); - else if (idx > 0) - vp9_write(bc, partition == PARTITION_SPLIT, - cm->fc.partition_prob[cm->frame_type][pl][idx]); + write_partition(cpi, bs, mi_row, mi_col, partition, bsize, bc); } subsize = get_subsize(bsize, partition); switch (partition) { case PARTITION_NONE: - write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); + write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); break; case PARTITION_HORZ: - write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); + write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); if ((mi_row + bs) < cm->mi_rows) - write_modes_b(cpi, mi_8x8 + bs * mis, bc, tok, tok_end, mi_row + bs, - mi_col, 1); + write_modes_b(cpi, tile, mi_8x8 + bs * mis, bc, tok, tok_end, + mi_row + bs, mi_col, 1); break; case PARTITION_VERT: - write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); + write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); if ((mi_col + bs) < cm->mi_cols) - write_modes_b(cpi, mi_8x8 + bs, bc, tok, tok_end, mi_row, mi_col + bs, - 1); + write_modes_b(cpi, tile, mi_8x8 + bs, bc, tok, tok_end, + mi_row, mi_col + bs, 1); break; case PARTITION_SPLIT: for (n = 0; n < 4; n++) { const int j = n >> 1, i = n & 1; - write_modes_sb(cpi, mi_8x8 + j * bs * mis + i * bs, bc, tok, tok_end, + write_modes_sb(cpi, tile, mi_8x8 + j * bs * mis + i * bs, bc, + tok, tok_end, mi_row + j * bs, mi_col + i * bs, subsize, n); } break; @@ -661,10 +660,12 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, // update partition context if (bsize >= BLOCK_8X8 && (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) - update_partition_context(cm, mi_row, mi_col, subsize, bsize); + update_partition_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, subsize, bsize); } -static void write_modes(VP9_COMP *cpi, vp9_writer* const bc, +static void write_modes(VP9_COMP *cpi, const TileInfo *const tile, + vp9_writer* const bc, TOKENEXTRA **tok, TOKENEXTRA *tok_end) { VP9_COMMON *const cm = &cpi->common; const int mis = cm->mode_info_stride; @@ -672,15 +673,15 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc, MODE_INFO **mi_8x8 = cm->mi_grid_visible; MODE_INFO **m_8x8; - mi_8x8 += cm->cur_tile_mi_col_start + cm->cur_tile_mi_row_start * mis; + mi_8x8 += tile->mi_col_start + tile->mi_row_start * mis; - for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end; + for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += 8, mi_8x8 += 8 * mis) { m_8x8 = mi_8x8; - vp9_zero(cm->left_seg_context); - for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + vp9_zero(cpi->left_seg_context); + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE, m_8x8 += MI_BLOCK_SIZE) { - write_modes_sb(cpi, m_8x8, bc, tok, tok_end, mi_row, mi_col, + write_modes_sb(cpi, tile, m_8x8, bc, tok, tok_end, mi_row, mi_col, BLOCK_64X64, 0); } } @@ -692,8 +693,7 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) { 
unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = cpi->common.counts.eob_branch[tx_size]; vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[tx_size]; - vp9_prob full_probs[ENTROPY_NODES]; - int i, j, k, l; + int i, j, k, l, m; for (i = 0; i < BLOCK_TYPES; ++i) { for (j = 0; j < REF_TYPES; ++j) { @@ -702,16 +702,14 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) { if (l >= 3 && k == 0) continue; vp9_tree_probs_from_distribution(vp9_coef_tree, - full_probs, coef_branch_ct[i][j][k][l], coef_counts[i][j][k][l], 0); - vpx_memcpy(coef_probs[i][j][k][l], full_probs, - sizeof(vp9_prob) * UNCONSTRAINED_NODES); coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] - coef_branch_ct[i][j][k][l][0][0]; - coef_probs[i][j][k][l][0] = - get_binary_prob(coef_branch_ct[i][j][k][l][0][0], - coef_branch_ct[i][j][k][l][0][1]); + for (m = 0; m < UNCONSTRAINED_NODES; ++m) + coef_probs[i][j][k][l][m] = get_binary_prob( + coef_branch_ct[i][j][k][l][m][0], + coef_branch_ct[i][j][k][l][m][1]); #ifdef ENTROPY_STATS if (!cpi->dummy_packing) { int t; @@ -1103,7 +1101,7 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { } } -static void write_interp_filter_type(INTERPOLATIONFILTERTYPE type, +static void write_interp_filter_type(INTERPOLATION_TYPE type, struct vp9_write_bit_buffer *wb) { const int type_to_literal[] = { 1, 0, 2, 3 }; @@ -1121,7 +1119,7 @@ static void fix_mcomp_filter_type(VP9_COMP *cpi) { int i, j, c = 0; for (i = 0; i < SWITCHABLE_FILTERS; ++i) { count[i] = 0; - for (j = 0; j <= SWITCHABLE_FILTERS; ++j) + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) count[i] += cm->counts.switchable_interp[j][i]; c += (count[i] > 0); } @@ -1201,7 +1199,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; - vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) * + vpx_memset(cpi->above_seg_context, 0, sizeof(*cpi->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols)); tok[0][0] = cpi->tok; @@ -1216,9 +1214,10 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { } for (tile_row = 0; tile_row < tile_rows; tile_row++) { - vp9_get_tile_row_offsets(cm, tile_row); for (tile_col = 0; tile_col < tile_cols; tile_col++) { - vp9_get_tile_col_offsets(cm, tile_col); + TileInfo tile; + + vp9_tile_init(&tile, cm, 0, tile_col); tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col]; if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) @@ -1226,7 +1225,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { else vp9_start_encode(&residual_bc, data_ptr + total_size); - write_modes(cpi, &residual_bc, &tok[tile_row][tile_col], tok_end); + write_modes(cpi, &tile, &residual_bc, &tok[tile_row][tile_col], tok_end); assert(tok[tile_row][tile_col] == tok_end); vp9_stop_encode(&residual_bc); if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) { @@ -1295,17 +1294,16 @@ static void write_frame_size_with_refs(VP9_COMP *cpi, } static void write_sync_code(struct vp9_write_bit_buffer *wb) { - vp9_wb_write_literal(wb, SYNC_CODE_0, 8); - vp9_wb_write_literal(wb, SYNC_CODE_1, 8); - vp9_wb_write_literal(wb, SYNC_CODE_2, 8); + vp9_wb_write_literal(wb, VP9_SYNC_CODE_0, 8); + vp9_wb_write_literal(wb, VP9_SYNC_CODE_1, 8); + vp9_wb_write_literal(wb, VP9_SYNC_CODE_2, 8); } static void write_uncompressed_header(VP9_COMP *cpi, struct vp9_write_bit_buffer *wb) { VP9_COMMON *const cm = &cpi->common; - // frame marker bits 
- vp9_wb_write_literal(wb, 0x2, 2); + vp9_wb_write_literal(wb, VP9_FRAME_MARKER, 2); // bitstream version. // 00 - profile 0. 4:2:0 only @@ -1319,18 +1317,10 @@ static void write_uncompressed_header(VP9_COMP *cpi, vp9_wb_write_bit(wb, cm->error_resilient_mode); if (cm->frame_type == KEY_FRAME) { + const COLOR_SPACE cs = UNKNOWN; write_sync_code(wb); - // colorspaces - // 000 - Unknown - // 001 - BT.601 - // 010 - BT.709 - // 011 - SMPTE-170 - // 100 - SMPTE-240 - // 101 - Reserved - // 110 - Reserved - // 111 - sRGB (RGB) - vp9_wb_write_literal(wb, 0, 3); - if (1 /* colorspace != sRGB */) { + vp9_wb_write_literal(wb, cs, 3); + if (cs != SRGB) { vp9_wb_write_bit(wb, 0); // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] if (cm->version == 1) { vp9_wb_write_bit(wb, cm->subsampling_x); @@ -1457,11 +1447,9 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { update_mbintra_mode_probs(cpi, &header_bc); for (i = 0; i < PARTITION_CONTEXTS; ++i) { - vp9_prob pnew[PARTITION_TYPES - 1]; unsigned int bct[PARTITION_TYPES - 1][2]; - update_mode(&header_bc, PARTITION_TYPES, - vp9_partition_tree, pnew, - fc->partition_prob[cm->frame_type][i], bct, + update_mode(&header_bc, PARTITION_TYPES, vp9_partition_tree, + fc->partition_prob[i], bct, (unsigned int *)cpi->partition_count[i]); } diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 12dad0311..8033a4d15 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -42,7 +42,7 @@ typedef struct { int comp_pred_diff; int single_pred_diff; int64_t tx_rd_diff[TX_MODES]; - int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]; + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; // motion vector cache for adaptive motion search control in partition // search loop @@ -118,8 +118,7 @@ struct macroblock { unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES]; int intra_uv_mode_cost[2][MB_MODE_COUNT]; int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; - int switchable_interp_costs[SWITCHABLE_FILTERS + 1] - [SWITCHABLE_FILTERS]; + int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; // These define limits to motion vector components to prevent them // from extending outside the UMV borders @@ -137,7 +136,7 @@ struct macroblock { // note that token_costs is the cost when eob node is skipped vp9_coeff_cost token_costs[TX_SIZES]; - uint8_t token_cache[1024]; + DECLARE_ALIGNED(16, uint8_t, token_cache[1024]); int optimize; @@ -173,7 +172,7 @@ struct macroblock { BLOCK_SIZE sb_partitioning[4]; BLOCK_SIZE sb64_partitioning; - void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch); + void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride); }; // TODO(jingning): the variables used here are little complicated. need further diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index 550cdee60..065992a25 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -8,14 +8,17 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ - #include <assert.h> #include <math.h> + #include "./vpx_config.h" -#include "vp9/common/vp9_systemdependent.h" +#include "./vp9_rtcd.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_idct.h" +#include "vp9/common/vp9_systemdependent.h" + +#include "vp9/encoder/vp9_dct.h" static void fdct4(const int16_t *input, int16_t *output) { int16_t step[4]; @@ -36,7 +39,7 @@ static void fdct4(const int16_t *input, int16_t *output) { output[3] = dct_const_round_shift(temp2); } -void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int stride) { +void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, @@ -46,7 +49,7 @@ void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int stride) { int pass; // We need an intermediate buffer between passes. int16_t intermediate[4 * 4]; - int16_t *in = input; + const int16_t *in = input; int16_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { @@ -148,8 +151,8 @@ static const transform_2d FHT_4[] = { { fadst4, fadst4 } // ADST_ADST = 3 }; -void vp9_short_fht4x4_c(int16_t *input, int16_t *output, - int pitch, TX_TYPE tx_type) { +void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { int16_t out[4 * 4]; int16_t *outptr = &out[0]; int i, j; @@ -159,7 +162,7 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output, // Columns for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) - temp_in[j] = input[j * pitch + i] * 16; + temp_in[j] = input[j * stride + i] * 16; if (i == 0 && temp_in[0]) temp_in[0] += 1; ht.cols(temp_in, temp_out); @@ -229,7 +232,7 @@ static void fdct8(const int16_t *input, int16_t *output) { output[7] = dct_const_round_shift(t3); } -void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int stride) { +void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { int i, j; int16_t intermediate[64]; @@ -300,7 +303,7 @@ void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int stride) { } } -void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int stride) { +void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, @@ -310,7 +313,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int stride) { int pass; // We need an intermediate buffer between passes. 
int16_t intermediate[256]; - int16_t *in = input; + const int16_t *in = input; int16_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { @@ -556,8 +559,8 @@ static const transform_2d FHT_8[] = { { fadst8, fadst8 } // ADST_ADST = 3 }; -void vp9_short_fht8x8_c(int16_t *input, int16_t *output, - int pitch, TX_TYPE tx_type) { +void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { int16_t out[64]; int16_t *outptr = &out[0]; int i, j; @@ -567,7 +570,7 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output, // Columns for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) - temp_in[j] = input[j * pitch + i] * 4; + temp_in[j] = input[j * stride + i] * 4; ht.cols(temp_in, temp_out); for (j = 0; j < 8; ++j) outptr[j * 8 + i] = temp_out[j]; @@ -585,10 +588,10 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output, /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */ -void vp9_short_walsh4x4_c(int16_t *input, int16_t *output, int stride) { +void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) { int i; int a1, b1, c1, d1, e1; - int16_t *ip = input; + const int16_t *ip = input; int16_t *op = output; for (i = 0; i < 4; i++) { @@ -949,8 +952,8 @@ static const transform_2d FHT_16[] = { { fadst16, fadst16 } // ADST_ADST = 3 }; -void vp9_short_fht16x16_c(int16_t *input, int16_t *output, - int pitch, TX_TYPE tx_type) { +void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { int16_t out[256]; int16_t *outptr = &out[0]; int i, j; @@ -960,7 +963,7 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output, // Columns for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) - temp_in[j] = input[j * pitch + i] * 4; + temp_in[j] = input[j * stride + i] * 4; ht.cols(temp_in, temp_out); for (j = 0; j < 16; ++j) outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; @@ -1311,7 +1314,7 @@ static void dct32_1d(const int *input, int *output, int round) { output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); } -void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int stride) { +void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { int i, j; int output[32 * 32]; @@ -1339,7 +1342,7 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int stride) { // Note that although we use dct_32_round in dct32_1d computation flow, // this 2d fdct32x32 for rate-distortion optimization loop is operating // within 16 bits precision. 
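Aside on the transform renames in this hunk: vp9_short_walsh4x4_c becomes vp9_fwht4x4_c, and later in this change the encoder points mb.fwd_txm4x4 at vp9_fwht4x4 whenever lossless mode is enabled. The Walsh-Hadamard kernel is chosen because it is exactly reversible on integers, which a rounded DCT is not. Below is a standalone toy demonstrating that property; it is not the libvpx kernel (vp9's version carries its own fixed-point scaling), just the underlying butterfly.

#include <stdio.h>

/* 1-D butterfly: out = M * in, where M has +/-1 entries and M * M == 4 * I. */
static void wht4(const int *in, int *out) {
  const int a = in[0] + in[1], b = in[0] - in[1];
  const int c = in[2] + in[3], d = in[2] - in[3];
  out[0] = a + c;
  out[1] = b + d;
  out[2] = a - c;
  out[3] = b - d;
}

/* Apply the butterfly to all rows, then all columns, of a 4x4 block. */
static void wht4x4(int in[4][4], int out[4][4]) {
  int tmp[4][4], col[4], res[4];
  int i, j;
  for (i = 0; i < 4; ++i) wht4(in[i], tmp[i]);
  for (j = 0; j < 4; ++j) {
    for (i = 0; i < 4; ++i) col[i] = tmp[i][j];
    wht4(col, res);
    for (i = 0; i < 4; ++i) out[i][j] = res[i];
  }
}

int main(void) {
  int block[4][4] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 },
                      { 9, 10, 11, 12 }, { 13, 14, 15, 16 } };
  int coeff[4][4], twice[4][4];
  int i, j, ok = 1;
  wht4x4(block, coeff);   /* forward transform */
  wht4x4(coeff, twice);   /* forward again: equals 16 * block exactly */
  for (i = 0; i < 4; ++i)
    for (j = 0; j < 4; ++j)
      ok &= (twice[i][j] / 16 == block[i][j]);
  puts(ok ? "exact round-trip" : "mismatch");
  return 0;
}

Because M * M == 4 * I, two forward passes per dimension scale a 4x4 block by exactly 16, so an integer divide recovers the input bit for bit.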
-void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int stride) { +void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { int i, j; int output[32 * 32]; @@ -1366,3 +1369,27 @@ void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int stride) { out[j + i * 32] = temp_out[j]; } } + +void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride) { + if (tx_type == DCT_DCT) + vp9_fdct4x4(input, output, stride); + else + vp9_short_fht4x4(input, output, stride, tx_type); +} + +void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride) { + if (tx_type == DCT_DCT) + vp9_fdct8x8(input, output, stride); + else + vp9_short_fht8x8(input, output, stride, tx_type); +} + +void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride) { + if (tx_type == DCT_DCT) + vp9_fdct16x16(input, output, stride); + else + vp9_short_fht16x16(input, output, stride, tx_type); +} diff --git a/vp9/encoder/vp9_dct.h b/vp9/encoder/vp9_dct.h new file mode 100644 index 000000000..aaf976d93 --- /dev/null +++ b/vp9/encoder/vp9_dct.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP9_ENCODER_VP9_DCT_H_ +#define VP9_ENCODER_VP9_DCT_H_ + +void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride); + +void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride); + +void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride); + +#endif // VP9_ENCODER_VP9_DCT_H_ diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 98284a690..a45299b59 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -282,7 +282,7 @@ static void build_activity_map(VP9_COMP *cpi) { VP9_COMMON * const cm = &cpi->common; #if ALT_ACT_MEASURE - YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; + YV12_BUFFER_CONFIG *new_yv12 = get_frame_new_buffer(cm); int recon_yoffset; int recon_y_stride = new_yv12->y_stride; #endif @@ -465,7 +465,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff; cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff; - for (i = 0; i <= SWITCHABLE_FILTERS; i++) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) cpi->rd_filter_diff[i] += ctx->best_filter_diff[i]; } } @@ -484,8 +484,8 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, x->e_mbd.plane[i].subsampling_y); } -static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, - BLOCK_SIZE bsize) { +static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col, BLOCK_SIZE bsize) { MACROBLOCK *const x = &cpi->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; @@ -499,7 +499,7 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, const int idx_map = mb_row * cm->mb_cols + mb_col; const struct segmentation *const seg = &cm->seg; - set_skip_context(cm, xd, mi_row, mi_col); + set_skip_context(xd, cpi->above_context, cpi->left_context, mi_row, mi_col); // Activity map pointer 
x->mb_activity_ptr = &cpi->mb_activity_map[idx_map]; @@ -528,7 +528,8 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, // Set up distance of MB to edge of frame in 1/8th pel units assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); - set_mi_row_col(cm, xd, mi_row, mi_height, mi_col, mi_width); + set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, + cm->mi_rows, cm->mi_cols); /* set up source buffers */ vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); @@ -555,9 +556,8 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, const int x = mb_col & ~3; const int p16 = ((mb_row & 1) << 1) + (mb_col & 1); const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1); - const int tile_progress = cm->cur_tile_mi_col_start * cm->mb_rows >> 1; - const int mb_cols = (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) - >> 1; + const int tile_progress = tile->mi_col_start * cm->mb_rows >> 1; + const int mb_cols = (tile->mi_col_end - tile->mi_col_start) >> 1; cpi->seg0_progress = ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs; @@ -570,7 +570,8 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, } } -static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, +static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col, int *totalrate, int64_t *totaldist, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd) { @@ -596,7 +597,7 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, } } - set_offsets(cpi, mi_row, mi_col, bsize); + set_offsets(cpi, tile, mi_row, mi_col, bsize); xd->mi_8x8[0]->mbmi.sb_type = bsize; // Set to zero to make sure we do not use the previous encoded frame stats @@ -632,10 +633,10 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, best_rd); } else { if (bsize >= BLOCK_8X8) - vp9_rd_pick_inter_mode_sb(cpi, x, mi_row, mi_col, totalrate, totaldist, - bsize, ctx, best_rd); + vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col, + totalrate, totaldist, bsize, ctx, best_rd); else - vp9_rd_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, totalrate, + vp9_rd_pick_inter_mode_sub8x8(cpi, x, tile, mi_row, mi_col, totalrate, totaldist, bsize, ctx, best_rd); } @@ -682,10 +683,6 @@ static void update_stats(VP9_COMP *cpi) { [mbmi->ref_frame[0] != GOLDEN_FRAME]++; } } - - // Count of last ref frame 0,0 usage - if (mbmi->mode == ZEROMV && mbmi->ref_frame[0] == LAST_FRAME) - cpi->inter_zz_count++; } } @@ -711,7 +708,6 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], BLOCK_SIZE bsize) { - VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; int p; @@ -721,28 +717,27 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, int mi_height = num_8x8_blocks_high_lookup[bsize]; for (p = 0; p < MAX_MB_PLANE; p++) { vpx_memcpy( - cm->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x), + cpi->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x), a + num_4x4_blocks_wide * p, (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> xd->plane[p].subsampling_x); vpx_memcpy( - cm->left_context[p] + cpi->left_context[p] + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), l + num_4x4_blocks_high * p, (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> xd->plane[p].subsampling_y); } - vpx_memcpy(cm->above_seg_context + mi_col, sa, - sizeof(PARTITION_CONTEXT) * 
mi_width); - vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl, - sizeof(PARTITION_CONTEXT) * mi_height); + vpx_memcpy(cpi->above_seg_context + mi_col, sa, + sizeof(*cpi->above_seg_context) * mi_width); + vpx_memcpy(cpi->left_seg_context + (mi_row & MI_MASK), sl, + sizeof(cpi->left_seg_context[0]) * mi_height); } static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], BLOCK_SIZE bsize) { - const VP9_COMMON *const cm = &cpi->common; const MACROBLOCK *const x = &cpi->mb; const MACROBLOCKD *const xd = &x->e_mbd; int p; @@ -755,23 +750,24 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, for (p = 0; p < MAX_MB_PLANE; ++p) { vpx_memcpy( a + num_4x4_blocks_wide * p, - cm->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x), + cpi->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x), (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> xd->plane[p].subsampling_x); vpx_memcpy( l + num_4x4_blocks_high * p, - cm->left_context[p] + cpi->left_context[p] + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> xd->plane[p].subsampling_y); } - vpx_memcpy(sa, cm->above_seg_context + mi_col, - sizeof(PARTITION_CONTEXT) * mi_width); - vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK), - sizeof(PARTITION_CONTEXT) * mi_height); + vpx_memcpy(sa, cpi->above_seg_context + mi_col, + sizeof(*cpi->above_seg_context) * mi_width); + vpx_memcpy(sl, cpi->left_seg_context + (mi_row & MI_MASK), + sizeof(cpi->left_seg_context[0]) * mi_height); } -static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, +static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, + TOKENEXTRA **tp, int mi_row, int mi_col, int output_enabled, BLOCK_SIZE bsize, int sub_index) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; @@ -789,7 +785,7 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, if (xd->ab_index > 0) return; } - set_offsets(cpi, mi_row, mi_col, bsize); + set_offsets(cpi, tile, mi_row, mi_col, bsize); update_state(cpi, get_block_context(x, bsize), bsize, output_enabled); encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize); @@ -801,7 +797,8 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, } } -static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, +static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile, + TOKENEXTRA **tp, int mi_row, int mi_col, int output_enabled, BLOCK_SIZE bsize) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; @@ -818,7 +815,8 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, c1 = BLOCK_4X4; if (bsize >= BLOCK_8X8) { - pl = partition_plane_context(cm, mi_row, mi_col, bsize); + pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, bsize); c1 = *(get_sb_partitioning(x, bsize)); } partition = partition_lookup[bsl][c1]; @@ -827,19 +825,19 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, case PARTITION_NONE: if (output_enabled && bsize >= BLOCK_8X8) cpi->partition_count[pl][PARTITION_NONE]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1); + encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, -1); break; case PARTITION_VERT: if (output_enabled) cpi->partition_count[pl][PARTITION_VERT]++; 
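Aside on the signature changes running through this file: encode_b, encode_sb and the pick/use-partition helpers now take a const TileInfo *const tile instead of reading cur_tile_mi_* offsets out of VP9_COMMON, so tile bounds stop being hidden mutable state. A compilable sketch of the resulting iteration pattern follows; the field names match the diff, but the struct layout is assumed rather than copied from the real tile header.

#include <stdio.h>

typedef struct {
  int mi_row_start, mi_row_end;  /* bounds in 8x8 mode-info units */
  int mi_col_start, mi_col_end;
} TileInfo;

static void visit_sb(const TileInfo *tile, int mi_row, int mi_col) {
  printf("SB (%2d,%2d), tile cols [%d,%d)\n",
         mi_row, mi_col, tile->mi_col_start, tile->mi_col_end);
}

int main(void) {
  /* Two column tiles of a 128x384-pixel strip; bounds travel with the
   * tile rather than living inside a shared VP9_COMMON. */
  TileInfo tiles[2] = { { 0, 16, 0, 24 }, { 0, 16, 24, 48 } };
  int t, mi_row, mi_col;
  for (t = 0; t < 2; ++t)
    for (mi_row = tiles[t].mi_row_start; mi_row < tiles[t].mi_row_end;
         mi_row += 8)  /* MI_BLOCK_SIZE step, as in encode_sb_row */
      for (mi_col = tiles[t].mi_col_start; mi_col < tiles[t].mi_col_end;
           mi_col += 8)
        visit_sb(&tiles[t], mi_row, mi_col);
  return 0;
}

With bounds carried per tile, two tiles can be walked concurrently without either walker writing into cm, which is what the decode_tiles_mt path added earlier in this change relies on.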
- encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); - encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1); + encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0); + encode_b(cpi, tile, tp, mi_row, mi_col + bs, output_enabled, c1, 1); break; case PARTITION_HORZ: if (output_enabled) cpi->partition_count[pl][PARTITION_HORZ]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); - encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1); + encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0); + encode_b(cpi, tile, tp, mi_row + bs, mi_col, output_enabled, c1, 1); break; case PARTITION_SPLIT: subsize = get_subsize(bsize, PARTITION_SPLIT); @@ -851,7 +849,7 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, const int x_idx = i & 1, y_idx = i >> 1; *get_sb_index(xd, subsize) = i; - encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, + encode_sb(cpi, tile, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, output_enabled, subsize); } break; @@ -861,7 +859,8 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) - update_partition_context(cm, mi_row, mi_col, c1, bsize); + update_partition_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, c1, bsize); } // Check to see if the given partition size is allowed for a specified number @@ -889,13 +888,13 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, // However, at the bottom and right borders of the image the requested size // may not be allowed in which case this code attempts to choose the largest // allowable partition. -static void set_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8, - int mi_row, int mi_col) { +static void set_partitioning(VP9_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mi_8x8, int mi_row, int mi_col) { VP9_COMMON *const cm = &cpi->common; BLOCK_SIZE bsize = cpi->sf.always_this_block_size; const int mis = cm->mode_info_stride; - int row8x8_remaining = cm->cur_tile_mi_row_end - mi_row; - int col8x8_remaining = cm->cur_tile_mi_col_end - mi_col; + int row8x8_remaining = tile->mi_row_end - mi_row; + int col8x8_remaining = tile->mi_col_end - mi_col; int block_row, block_col; MODE_INFO * mi_upper_left = cm->mi + mi_row * mis + mi_col; int bh = num_8x8_blocks_high_lookup[bsize]; @@ -970,7 +969,9 @@ static int sb_has_motion(VP9_COMP *cpi, MODE_INFO **prev_mi_8x8) { return 0; } -static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, +static void rd_use_partition(VP9_COMP *cpi, + const TileInfo *const tile, + MODE_INFO **mi_8x8, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, int do_recon) { @@ -1022,7 +1023,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); if (bsize == BLOCK_16X16) { - set_offsets(cpi, mi_row, mi_col, bsize); + set_offsets(cpi, tile, mi_row, mi_col, bsize); x->mb_energy = vp9_block_energy(cpi, x, bsize); } @@ -1049,10 +1050,12 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, mi_row + (ms >> 1) < cm->mi_rows && mi_col + (ms >> 1) < cm->mi_cols) { *(get_sb_partitioning(x, bsize)) = bsize; - pick_sb_modes(cpi, mi_row, mi_col, &none_rate, &none_dist, bsize, + pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize, get_block_context(x, bsize), INT64_MAX); - pl = partition_plane_context(cm, mi_row, mi_col, bsize); + pl = partition_plane_context(cpi->above_seg_context, + 
cpi->left_seg_context, + mi_row, mi_col, bsize); none_rate += x->partition_cost[pl][PARTITION_NONE]; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); @@ -1063,12 +1066,12 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, switch (partition) { case PARTITION_NONE: - pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, + pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, bsize, get_block_context(x, bsize), INT64_MAX); break; case PARTITION_HORZ: *get_sb_index(xd, subsize) = 0; - pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, + pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, subsize, get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + (mh >> 1) < cm->mi_rows) { @@ -1077,7 +1080,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *get_sb_index(xd, subsize) = 1; - pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize, + pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { last_part_rate = INT_MAX; @@ -1091,7 +1094,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, break; case PARTITION_VERT: *get_sb_index(xd, subsize) = 0; - pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, + pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, subsize, get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + (ms >> 1) < cm->mi_cols) { @@ -1100,7 +1103,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *get_sb_index(xd, subsize) = 1; - pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize, + pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { last_part_rate = INT_MAX; @@ -1127,7 +1130,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, *get_sb_index(xd, subsize) = i; - rd_use_partition(cpi, mi_8x8 + jj * bss * mis + ii * bss, tp, + rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp, mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt, i != 3); if (rt == INT_MAX || dt == INT_MAX) { @@ -1143,7 +1146,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, assert(0); } - pl = partition_plane_context(cm, mi_row, mi_col, bsize); + pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, bsize); if (last_part_rate < INT_MAX) last_part_rate += x->partition_cost[pl][partition]; @@ -1175,7 +1179,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx, &rt, &dt, + pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt, split_subsize, get_block_context(x, split_subsize), INT64_MAX); @@ -1188,15 +1192,18 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, } if (i != 3) - encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0, + encode_sb(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, 0, split_subsize); split_rate += rt; split_dist += dt; - pl = 
partition_plane_context(cm, mi_row + y_idx, mi_col + x_idx, bsize); + pl = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row + y_idx, mi_col + x_idx, bsize); split_rate += x->partition_cost[pl][PARTITION_NONE]; } - pl = partition_plane_context(cm, mi_row, mi_col, bsize); + pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, bsize); if (split_rate < INT_MAX) { split_rate += x->partition_cost[pl][PARTITION_SPLIT]; @@ -1231,7 +1238,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX); if (do_recon) - encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); + encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); *rate = chosen_rate; *dist = chosen_dist; @@ -1279,7 +1286,8 @@ static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8, // Look at neighboring blocks and set a min and max partition size based on // what they chose. -static void rd_auto_partition_range(VP9_COMP *cpi, int row, int col, +static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, + int row, int col, BLOCK_SIZE *min_block_size, BLOCK_SIZE *max_block_size) { VP9_COMMON * const cm = &cpi->common; @@ -1293,8 +1301,8 @@ static void rd_auto_partition_range(VP9_COMP *cpi, int row, int col, MODE_INFO ** above_sb64_mi_8x8; MODE_INFO ** left_sb64_mi_8x8; - int row8x8_remaining = cm->cur_tile_mi_row_end - row; - int col8x8_remaining = cm->cur_tile_mi_col_end - col; + int row8x8_remaining = tile->mi_row_end - row; + int col8x8_remaining = tile->mi_col_end - col; int bh, bw; // Trap case where we do not have a prediction. @@ -1444,7 +1452,8 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. 
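One idiom in rd_pick_partition below deserves a gloss: each recursive call receives best_rd - sum_rd as its budget, and a callee that cannot beat that budget reports this_rate == INT_MAX, which aborts the partition type being accumulated. A minimal standalone model of that contract follows; the costs are toy numbers, and the real comparison goes through the encoder's fixed-point RDCOST, elided here.

#include <limits.h>
#include <stdio.h>

/* Stand-in for pick_sb_modes: reports INT_MAX when its own cost cannot
 * fit in the budget it was handed, exactly the signal rd_pick_partition
 * tests for before adding the result into sum_rd. */
static void pick_mode(long own_cost, long budget, int *rate, long *rd) {
  if (own_cost > budget) {
    *rate = INT_MAX;
    *rd = 0;
    return;
  }
  *rate = (int)own_cost;  /* toy: rate and rd collapse into one number */
  *rd = own_cost;
}

int main(void) {
  const long best_rd = 100;               /* best full-block cost so far */
  const long half_costs[2] = { 60, 70 };  /* two halves of PARTITION_HORZ */
  long sum_rd = 0, rd;
  int i, rate;
  for (i = 0; i < 2; ++i) {
    pick_mode(half_costs[i], best_rd - sum_rd, &rate, &rd);
    if (rate == INT_MAX) {   /* second half exceeds 100 - 60 = 40 */
      puts("PARTITION_HORZ pruned");
      return 0;
    }
    sum_rd += rd;
  }
  puts("PARTITION_HORZ kept");
  return 0;
}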
-static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, +static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, + TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, int do_recon, int64_t best_rd) { VP9_COMMON * const cm = &cpi->common; @@ -1481,10 +1490,11 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, return; } } - assert(mi_height_log2(bsize) == mi_width_log2(bsize)); + assert(num_8x8_blocks_wide_lookup[bsize] == + num_8x8_blocks_high_lookup[bsize]); if (bsize == BLOCK_16X16) { - set_offsets(cpi, mi_row, mi_col, bsize); + set_offsets(cpi, tile, mi_row, mi_col, bsize); x->mb_energy = vp9_block_energy(cpi, x, bsize); } @@ -1521,11 +1531,13 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, // PARTITION_NONE if (partition_none_allowed) { - pick_sb_modes(cpi, mi_row, mi_col, &this_rate, &this_dist, bsize, + pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize, get_block_context(x, bsize), best_rd); if (this_rate != INT_MAX) { if (bsize >= BLOCK_8X8) { - pl = partition_plane_context(cm, mi_row, mi_col, bsize); + pl = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row, mi_col, bsize); this_rate += x->partition_cost[pl][PARTITION_NONE]; } sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist); @@ -1573,7 +1585,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *get_sb_index(xd, subsize) = i; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); - rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, + rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, subsize, &this_rate, &this_dist, i != 3, best_rd - sum_rd); if (this_rate == INT_MAX) { @@ -1585,7 +1597,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } } if (sum_rd < best_rd && i == 4) { - pl = partition_plane_context(cm, mi_row, mi_col, bsize); + pl = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row, mi_col, bsize); sum_rate += x->partition_cost[pl][PARTITION_SPLIT]; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd) { @@ -1618,7 +1632,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *get_sb_index(xd, subsize) = 0; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); - pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize, + pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, get_block_context(x, subsize), best_rd); sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); @@ -1629,7 +1643,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *get_sb_index(xd, subsize) = 1; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); - pick_sb_modes(cpi, mi_row + ms, mi_col, &this_rate, + pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate, &this_dist, subsize, get_block_context(x, subsize), best_rd - sum_rd); if (this_rate == INT_MAX) { @@ -1641,7 +1655,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } } if (sum_rd < best_rd) { - pl = partition_plane_context(cm, mi_row, mi_col, bsize); + pl = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row, mi_col, bsize); sum_rate += x->partition_cost[pl][PARTITION_HORZ]; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd) { @@ 
-1661,7 +1677,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *get_sb_index(xd, subsize) = 0; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); - pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize, + pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, get_block_context(x, subsize), best_rd); sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) { @@ -1671,7 +1687,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *get_sb_index(xd, subsize) = 1; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); - pick_sb_modes(cpi, mi_row, mi_col + ms, &this_rate, + pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate, &this_dist, subsize, get_block_context(x, subsize), best_rd - sum_rd); if (this_rate == INT_MAX) { @@ -1683,7 +1699,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } } if (sum_rd < best_rd) { - pl = partition_plane_context(cm, mi_row, mi_col, bsize); + pl = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row, mi_col, bsize); sum_rate += x->partition_cost[pl][PARTITION_VERT]; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd) { @@ -1701,7 +1719,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *dist = best_dist; if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) - encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); + encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); if (bsize == BLOCK_64X64) { assert(tp_orig < *tp); assert(best_rate < INT_MAX); @@ -1712,7 +1730,8 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } // Examines 64x64 block and chooses a best reference frame -static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) { +static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; int bsl = b_width_log2(BLOCK_64X64), bs = 1 << bsl; @@ -1732,9 +1751,10 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) { if ((mi_row + (ms >> 1) < cm->mi_rows) && (mi_col + (ms >> 1) < cm->mi_cols)) { cpi->set_ref_frame_mask = 1; - pick_sb_modes(cpi, mi_row, mi_col, &r, &d, BLOCK_64X64, + pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64, get_block_context(x, BLOCK_64X64), INT64_MAX); - pl = partition_plane_context(cm, mi_row, mi_col, BLOCK_64X64); + pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, BLOCK_64X64); r += x->partition_cost[pl][PARTITION_NONE]; *(get_sb_partitioning(x, BLOCK_64X64)) = BLOCK_64X64; @@ -1744,17 +1764,17 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) { restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64); } -static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, - int *totalrate) { +static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, TOKENEXTRA **tp, int *totalrate) { VP9_COMMON * const cm = &cpi->common; int mi_col; // Initialize the left context for the new SB row - vpx_memset(&cm->left_context, 0, sizeof(cm->left_context)); - vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context)); + vpx_memset(&cpi->left_context, 0, sizeof(cpi->left_context)); + 
vpx_memset(cpi->left_seg_context, 0, sizeof(cpi->left_seg_context)); // Code each SB in the row - for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) { int dummy_rate; int64_t dummy_dist; @@ -1762,7 +1782,7 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, vp9_zero(cpi->mb.pred_mv); if (cpi->sf.reference_masking) - rd_pick_reference_frame(cpi, mi_row, mi_col); + rd_pick_reference_frame(cpi, tile, mi_row, mi_col); if (cpi->sf.use_lastframe_partitioning || cpi->sf.use_one_partition_size_always ) { @@ -1772,9 +1792,9 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, cpi->mb.source_variance = UINT_MAX; if (cpi->sf.use_one_partition_size_always) { - set_offsets(cpi, mi_row, mi_col, BLOCK_64X64); - set_partitioning(cpi, mi_8x8, mi_row, mi_col); - rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, + set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); + set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col); + rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1); } else { if ((cpi->common.current_video_frame @@ -1788,28 +1808,28 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, sb_has_motion(cpi, prev_mi_8x8))) { // If required set upper and lower partition size limits if (cpi->sf.auto_min_max_partition_size) { - set_offsets(cpi, mi_row, mi_col, BLOCK_64X64); - rd_auto_partition_range(cpi, mi_row, mi_col, + set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); + rd_auto_partition_range(cpi, tile, mi_row, mi_col, &cpi->sf.min_partition_size, &cpi->sf.max_partition_size); } - rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64, + rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, INT64_MAX); } else { copy_partitioning(cpi, mi_8x8, prev_mi_8x8); - rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, + rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1); } } } else { // If required set upper and lower partition size limits if (cpi->sf.auto_min_max_partition_size) { - set_offsets(cpi, mi_row, mi_col, BLOCK_64X64); - rd_auto_partition_range(cpi, mi_row, mi_col, + set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); + rd_auto_partition_range(cpi, tile, mi_row, mi_col, &cpi->sf.min_partition_size, &cpi->sf.max_partition_size); } - rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64, + rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, INT64_MAX); } } @@ -1836,7 +1856,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { // TODO(jkoleszar): are these initializations required? setup_pre_planes(xd, 0, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]], 0, 0, NULL); - setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0); + setup_dst_planes(xd, get_frame_new_buffer(cm), 0, 0); setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); @@ -1856,16 +1876,17 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { // Note: this memset assumes above_context[0], [1] and [2] // are allocated as part of the same buffer. 
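The comment above ("this memset assumes above_context[0], [1] and [2] are allocated as part of the same buffer") is the whole contract: one allocation backs all three plane rows, so above_context[0] doubles as the base pointer for the memset that follows and, on the decoder side of this change, for the single vpx_free(pbi->above_context[0]) in vp9_remove_decompressor. A sketch of that layout, with plain char standing in for ENTROPY_CONTEXT:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_MB_PLANE 3

/* char stands in for libvpx's ENTROPY_CONTEXT (a char-sized type). */
static char *above_context[MAX_MB_PLANE];

static int alloc_above_context(int aligned_mi_cols) {
  /* Two 4x4 context slots per 8x8 mode-info column, matching the
   * memset size used above. */
  const size_t per_plane = 2 * (size_t)aligned_mi_cols;
  char *const base = malloc(per_plane * MAX_MB_PLANE);
  int i;
  if (base == NULL) return -1;
  for (i = 0; i < MAX_MB_PLANE; ++i)
    above_context[i] = base + i * per_plane;
  /* One memset over plane 0 clears all three planes at once. */
  memset(above_context[0], 0, per_plane * MAX_MB_PLANE);
  return 0;
}

int main(void) {
  if (alloc_above_context(64)) return 1;
  printf("plane spacing: %ld\n",
         (long)(above_context[1] - above_context[0]));  /* prints 128 */
  free(above_context[0]);  /* one free releases every plane */
  return 0;
}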
- vpx_memset(cm->above_context[0], 0, - sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * aligned_mi_cols); - vpx_memset(cm->above_seg_context, 0, - sizeof(PARTITION_CONTEXT) * aligned_mi_cols); + vpx_memset(cpi->above_context[0], 0, + sizeof(*cpi->above_context[0]) * + 2 * aligned_mi_cols * MAX_MB_PLANE); + vpx_memset(cpi->above_seg_context, 0, + sizeof(*cpi->above_seg_context) * aligned_mi_cols); } static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { if (lossless) { // printf("Switching to lossless\n"); - cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; + cpi->mb.fwd_txm4x4 = vp9_fwht4x4; cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add; cpi->mb.optimize = 0; cpi->common.lf.filter_level = 0; @@ -1873,7 +1894,7 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { cpi->common.tx_mode = ONLY_4X4; } else { // printf("Not lossless\n"); - cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; + cpi->mb.fwd_txm4x4 = vp9_fdct4x4; cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add; } } @@ -1907,9 +1928,6 @@ static void encode_frame_internal(VP9_COMP *cpi) { totalrate = 0; - // Reset frame count of inter 0,0 motion vector usage. - cpi->inter_zz_count = 0; - vp9_zero(cm->counts.switchable_interp); vp9_zero(cpi->tx_stepdown_count); @@ -1963,16 +1981,15 @@ static void encode_frame_internal(VP9_COMP *cpi) { const int tile_rows = 1 << cm->log2_tile_rows; for (tile_row = 0; tile_row < tile_rows; tile_row++) { - vp9_get_tile_row_offsets(cm, tile_row); - for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileInfo tile; TOKENEXTRA *tp_old = tp; // For each row of SBs in the frame - vp9_get_tile_col_offsets(cm, tile_col); - for (mi_row = cm->cur_tile_mi_row_start; - mi_row < cm->cur_tile_mi_row_end; mi_row += 8) - encode_sb_row(cpi, mi_row, &tp, &totalrate); + vp9_tile_init(&tile, cm, tile_row, tile_col); + for (mi_row = tile.mi_row_start; + mi_row < tile.mi_row_end; mi_row += 8) + encode_sb_row(cpi, &tile, mi_row, &tp, &totalrate); cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old); assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); @@ -2188,7 +2205,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { if (cpi->sf.RD) { int i, pred_type; - INTERPOLATIONFILTERTYPE filter_type; + INTERPOLATION_TYPE filter_type; /* * This code does a single RD pass over the whole frame assuming * either compound, single or hybrid prediction as per whatever has @@ -2256,7 +2273,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { cpi->rd_prediction_type_threshes[frame_type][i] >>= 1; } - for (i = 0; i <= SWITCHABLE_FILTERS; i++) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs; cpi->rd_filter_threshes[frame_type][i] = (cpi->rd_filter_threshes[frame_type][i] + diff) / 2; @@ -2470,7 +2487,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, (mbmi->skip_coeff || vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) { const uint8_t context = vp9_get_pred_context_tx_size(xd); - update_tx_counts(bsize, context, mbmi->tx_size, &cm->counts.tx); + ++get_tx_counts(bsize, context, &cm->counts.tx)[mbmi->tx_size]; } else { int x, y; TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode]; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 3358fbbe9..e52e8ec1e 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -19,6 +19,7 @@ #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_systemdependent.h" +#include "vp9/encoder/vp9_dct.h" #include "vp9/encoder/vp9_encodemb.h" #include 
"vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_rdopt.h" @@ -365,9 +366,9 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, yoff = 32 * (block >> twl); src_diff = p->src_diff + 4 * bw * yoff + xoff; if (x->use_lp32x32fdct) - vp9_short_fdct32x32_rd(src_diff, coeff, bw * 4); + vp9_fdct32x32_rd(src_diff, coeff, bw * 4); else - vp9_short_fdct32x32(src_diff, coeff, bw * 4); + vp9_fdct32x32(src_diff, coeff, bw * 4); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); @@ -379,7 +380,7 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, xoff = 16 * (block & twmask); yoff = 16 * (block >> twl); src_diff = p->src_diff + 4 * bw * yoff + xoff; - vp9_short_fdct16x16(src_diff, coeff, bw * 4); + vp9_fdct16x16(src_diff, coeff, bw * 4); vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); @@ -391,7 +392,7 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, xoff = 8 * (block & twmask); yoff = 8 * (block >> twl); src_diff = p->src_diff + 4 * bw * yoff + xoff; - vp9_short_fdct8x8(src_diff, coeff, bw * 4); + vp9_fdct8x8(src_diff, coeff, bw * 4); vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); @@ -417,6 +418,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, struct encode_b_args *const args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; + struct optimize_ctx *const ctx = args->ctx; struct macroblockd_plane *const pd = &xd->plane[plane]; const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, block); @@ -428,14 +430,18 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, // TODO(jingning): per transformed block zero forcing only enabled for // luma component. will integrate chroma components as well. 
if (x->zcoeff_blk[tx_size][block] && plane == 0) { + int x, y; pd->eobs[block] = 0; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); + ctx->ta[plane][x] = 0; + ctx->tl[plane][y] = 0; return; } vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); if (x->optimize) - vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx); + vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx); if (x->skip_encode || pd->eobs[block] == 0) return; @@ -461,6 +467,27 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, } } +static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct encode_b_args *const args = arg; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, + block); + + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block, + pd->dst.buf, pd->dst.stride); + + vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); + + if (pd->eobs[block] == 0) + return; + + xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); +} + void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; @@ -470,7 +497,7 @@ void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { if (x->optimize) optimize_init_b(0, bsize, &arg); - foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg); + foreach_transformed_block_in_plane(xd, bsize, 0, encode_block_pass1, &arg); } void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { @@ -532,9 +559,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, vp9_subtract_block(32, 32, src_diff, bw * 4, src, p->src.stride, dst, pd->dst.stride); if (x->use_lp32x32fdct) - vp9_short_fdct32x32_rd(src_diff, coeff, bw * 4); + vp9_fdct32x32_rd(src_diff, coeff, bw * 4); else - vp9_short_fdct32x32(src_diff, coeff, bw * 4); + vp9_fdct32x32(src_diff, coeff, bw * 4); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); @@ -556,10 +583,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, dst, pd->dst.stride, dst, pd->dst.stride); vp9_subtract_block(16, 16, src_diff, bw * 4, src, p->src.stride, dst, pd->dst.stride); - if (tx_type != DCT_DCT) - vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type); - else - vp9_short_fdct16x16(src_diff, coeff, bw * 4); + vp9_fht16x16(tx_type, src_diff, coeff, bw * 4); vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); @@ -581,10 +605,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, dst, pd->dst.stride, dst, pd->dst.stride); vp9_subtract_block(8, 8, src_diff, bw * 4, src, p->src.stride, dst, pd->dst.stride); - if (tx_type != DCT_DCT) - vp9_short_fht8x8(src_diff, coeff, bw * 4, tx_type); - else - vp9_short_fdct8x8(src_diff, coeff, bw * 4); + vp9_fht8x8(tx_type, src_diff, coeff, bw * 4); vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c index 9ebcc4983..e2c6c4c0c 100644 --- a/vp9/encoder/vp9_encodemv.c +++ 
b/vp9/encoder/vp9_encodemv.c @@ -124,8 +124,9 @@ static void build_nmv_component_cost_table(int *mvcost, } } -static int update_mv(vp9_writer *w, const unsigned int ct[2], - vp9_prob *cur_p, vp9_prob new_p, vp9_prob upd_p) { +static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p, + vp9_prob upd_p) { + const vp9_prob new_p = get_binary_prob(ct[0], ct[1]); vp9_prob mod_p = new_p | 1; const int cur_b = cost_branch256(ct, *cur_p); const int mod_b = cost_branch256(ct, mod_p); @@ -143,7 +144,6 @@ static int update_mv(vp9_writer *w, const unsigned int ct[2], static void counts_to_nmv_context( nmv_context_counts *nmv_count, - nmv_context *prob, int usehp, unsigned int (*branch_ct_joint)[2], unsigned int (*branch_ct_sign)[2], @@ -156,29 +156,24 @@ static void counts_to_nmv_context( unsigned int (*branch_ct_hp)[2]) { int i, j, k; vp9_tree_probs_from_distribution(vp9_mv_joint_tree, - prob->joints, branch_ct_joint, nmv_count->joints, 0); for (i = 0; i < 2; ++i) { const uint32_t s0 = nmv_count->comps[i].sign[0]; const uint32_t s1 = nmv_count->comps[i].sign[1]; - prob->comps[i].sign = get_binary_prob(s0, s1); branch_ct_sign[i][0] = s0; branch_ct_sign[i][1] = s1; vp9_tree_probs_from_distribution(vp9_mv_class_tree, - prob->comps[i].classes, - branch_ct_classes[i], - nmv_count->comps[i].classes, 0); + branch_ct_classes[i], + nmv_count->comps[i].classes, 0); vp9_tree_probs_from_distribution(vp9_mv_class0_tree, - prob->comps[i].class0, branch_ct_class0[i], nmv_count->comps[i].class0, 0); for (j = 0; j < MV_OFFSET_BITS; ++j) { const uint32_t b0 = nmv_count->comps[i].bits[j][0]; const uint32_t b1 = nmv_count->comps[i].bits[j][1]; - prob->comps[i].bits[j] = get_binary_prob(b0, b1); branch_ct_bits[i][j][0] = b0; branch_ct_bits[i][j][1] = b1; } @@ -186,12 +181,10 @@ static void counts_to_nmv_context( for (i = 0; i < 2; ++i) { for (k = 0; k < CLASS0_SIZE; ++k) { vp9_tree_probs_from_distribution(vp9_mv_fp_tree, - prob->comps[i].class0_fp[k], branch_ct_class0_fp[i][k], nmv_count->comps[i].class0_fp[k], 0); } vp9_tree_probs_from_distribution(vp9_mv_fp_tree, - prob->comps[i].fp, branch_ct_fp[i], nmv_count->comps[i].fp, 0); } @@ -202,11 +195,9 @@ static void counts_to_nmv_context( const uint32_t hp0 = nmv_count->comps[i].hp[0]; const uint32_t hp1 = nmv_count->comps[i].hp[1]; - prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1); branch_ct_class0_hp[i][0] = c0_hp0; branch_ct_class0_hp[i][1] = c0_hp1; - prob->comps[i].hp = get_binary_prob(hp0, hp1); branch_ct_hp[i][0] = hp0; branch_ct_hp[i][1] = hp1; } @@ -215,7 +206,6 @@ static void counts_to_nmv_context( void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { int i, j; - nmv_context prob; unsigned int branch_ct_joint[MV_JOINTS - 1][2]; unsigned int branch_ct_sign[2][2]; unsigned int branch_ct_classes[2][MV_CLASSES - 1][2]; @@ -227,30 +217,28 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { unsigned int branch_ct_hp[2][2]; nmv_context *mvc = &cpi->common.fc.nmvc; - counts_to_nmv_context(&cpi->NMVcount, &prob, usehp, + counts_to_nmv_context(&cpi->NMVcount, usehp, branch_ct_joint, branch_ct_sign, branch_ct_classes, branch_ct_class0, branch_ct_bits, branch_ct_class0_fp, branch_ct_fp, branch_ct_class0_hp, branch_ct_hp); for (j = 0; j < MV_JOINTS - 1; ++j) - update_mv(bc, branch_ct_joint[j], &mvc->joints[j], prob.joints[j], - NMV_UPDATE_PROB); + update_mv(bc, branch_ct_joint[j], &mvc->joints[j], NMV_UPDATE_PROB); for (i = 0; i < 2; ++i) { - update_mv(bc, branch_ct_sign[i], 
&mvc->comps[i].sign, - prob.comps[i].sign, NMV_UPDATE_PROB); + update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign, NMV_UPDATE_PROB); for (j = 0; j < MV_CLASSES - 1; ++j) update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j], - prob.comps[i].classes[j], NMV_UPDATE_PROB); + NMV_UPDATE_PROB); for (j = 0; j < CLASS0_SIZE - 1; ++j) update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j], - prob.comps[i].class0[j], NMV_UPDATE_PROB); + NMV_UPDATE_PROB); for (j = 0; j < MV_OFFSET_BITS; ++j) update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j], - prob.comps[i].bits[j], NMV_UPDATE_PROB); + NMV_UPDATE_PROB); } for (i = 0; i < 2; ++i) { @@ -258,21 +246,19 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { int k; for (k = 0; k < 3; ++k) update_mv(bc, branch_ct_class0_fp[i][j][k], - &mvc->comps[i].class0_fp[j][k], - prob.comps[i].class0_fp[j][k], NMV_UPDATE_PROB); + &mvc->comps[i].class0_fp[j][k], NMV_UPDATE_PROB); } for (j = 0; j < 3; ++j) - update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j], - prob.comps[i].fp[j], NMV_UPDATE_PROB); + update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j], NMV_UPDATE_PROB); } if (usehp) { for (i = 0; i < 2; ++i) { update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp, - prob.comps[i].class0_hp, NMV_UPDATE_PROB); + NMV_UPDATE_PROB); update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp, - prob.comps[i].hp, NMV_UPDATE_PROB); + NMV_UPDATE_PROB); } } } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index caf41625c..6a3555d68 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -481,13 +481,14 @@ void vp9_first_pass(VP9_COMP *cpi) { MACROBLOCK *const x = &cpi->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; + TileInfo tile; int recon_yoffset, recon_uvoffset; const int lst_yv12_idx = cm->ref_frame_map[cpi->lst_fb_idx]; const int gld_yv12_idx = cm->ref_frame_map[cpi->gld_fb_idx]; YV12_BUFFER_CONFIG *const lst_yv12 = &cm->yv12_fb[lst_yv12_idx]; - YV12_BUFFER_CONFIG *const new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; YV12_BUFFER_CONFIG *const gld_yv12 = &cm->yv12_fb[gld_yv12_idx]; + YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); const int recon_y_stride = lst_yv12->y_stride; const int recon_uv_stride = lst_yv12->uv_stride; int64_t intra_error = 0; @@ -532,6 +533,9 @@ void vp9_first_pass(VP9_COMP *cpi) { vp9_initialize_rd_consts(cpi); } + // tiling is ignored in the first pass + vp9_tile_init(&tile, cm, 0, 0); + // for each macroblock row in image for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { int_mv best_ref_mv; @@ -578,11 +582,12 @@ void vp9_first_pass(VP9_COMP *cpi) { } } xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME; - set_mi_row_col(cm, xd, + set_mi_row_col(xd, &tile, mb_row << 1, - 1 << mi_height_log2(xd->mi_8x8[0]->mbmi.sb_type), + num_8x8_blocks_high_lookup[xd->mi_8x8[0]->mbmi.sb_type], mb_col << 1, - 1 << mi_width_log2(xd->mi_8x8[0]->mbmi.sb_type)); + num_8x8_blocks_wide_lookup[xd->mi_8x8[0]->mbmi.sb_type], + cm->mi_rows, cm->mi_cols); if (cpi->sf.variance_adaptive_quantization) { int energy = vp9_block_energy(cpi, x, xd->mi_8x8[0]->mbmi.sb_type); @@ -2164,17 +2169,14 @@ void vp9_second_pass(VP9_COMP *cpi) { cpi->ni_av_qi = tmp_q; cpi->avg_q = vp9_convert_qindex_to_q(tmp_q); -#ifndef ONE_SHOT_Q_ESTIMATE // Limit the maxq value returned subsequently. 
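Returning to the vp9_encodemv.c rework above: update_mv() no longer receives a precomputed probability, it derives new_p from the branch counts on the spot, which is what let counts_to_nmv_context() shed its nmv_context output parameter. The hunk shows the setup but cuts off before the decision; a hedged reconstruction of the complete function, with the 7 * 256 term pricing the 7-bit literal of a signalled update in the usual 1/256-bit cost units:

/* Reconstruction: the signature and first four statements are verbatim
 * from the hunk above, the branch logic is inferred. */
static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p,
                     vp9_prob upd_p) {
  const vp9_prob new_p = get_binary_prob(ct[0], ct[1]);  /* counts -> prob */
  vp9_prob mod_p = new_p | 1;        /* decoder reconstructs odd probs only */
  const int cur_b = cost_branch256(ct, *cur_p);  /* bits if old prob kept */
  const int mod_b = cost_branch256(ct, mod_p);   /* bits with the new prob */
  const int cost = 7 * 256 + vp9_cost_one(upd_p) - vp9_cost_zero(upd_p);
  if (cur_b - mod_b > cost) {        /* update only if it pays for itself */
    *cur_p = mod_p;
    vp9_write(w, 1, upd_p);
    vp9_write_literal(w, mod_p >> 1, 7);  /* low bit implicit, hence 7 bits */
    return 1;
  }
  vp9_write(w, 0, upd_p);
  return 0;
}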
// This increases the risk of overspend or underspend if the initial // estimate for the clip is bad, but helps prevent excessive // variation in Q, especially near the end of a clip // where for example a small overspend may cause Q to crash adjust_maxq_qrange(cpi); -#endif } -#ifndef ONE_SHOT_Q_ESTIMATE // The last few frames of a clip almost always have to few or too many // bits and for the sake of over exact rate control we dont want to make // radical adjustments to the allowed quantizer range just to use up a @@ -2197,7 +2199,6 @@ void vp9_second_pass(VP9_COMP *cpi) { cpi->active_worst_quality = adjust_active_maxq(cpi->active_worst_quality, tmp_q); } -#endif } vp9_zero(this_frame); if (EOF == input_stats(cpi, &this_frame)) diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 644363158..7b605b212 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -194,8 +194,8 @@ static void update_mbgraph_mb_stats x->plane[0].src.buf = buf->y_buffer + mb_y_offset; x->plane[0].src.stride = buf->y_stride; - xd->plane[0].dst.buf = cm->yv12_fb[cm->new_fb_idx].y_buffer + mb_y_offset; - xd->plane[0].dst.stride = cm->yv12_fb[cm->new_fb_idx].y_stride; + xd->plane[0].dst.buf = get_frame_new_buffer(cm)->y_buffer + mb_y_offset; + xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride; // do intra 16x16 prediction intra_error = find_best_16x16_intra(cpi, mb_y_offset, diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c index b867d8b71..7eb659232 100644 --- a/vp9/encoder/vp9_modecosts.c +++ b/vp9/encoder/vp9_modecosts.c @@ -36,7 +36,7 @@ void vp9_init_mode_costs(VP9_COMP *c) { vp9_kf_uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree); - for (i = 0; i <= SWITCHABLE_FILTERS; ++i) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i], cm->fc.switchable_interp_prob[i], vp9_switchable_interp_tree); diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index accc338fb..f922f900a 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -312,6 +312,12 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { cpi->mb_activity_map = 0; vpx_free(cpi->mb_norm_activity_map); cpi->mb_norm_activity_map = 0; + + vpx_free(cpi->above_context[0]); + cpi->above_context[0] = NULL; + + vpx_free(cpi->above_seg_context); + cpi->above_seg_context = NULL; } // Computes a q delta (in "q index" terms) to get from a starting q value @@ -959,9 +965,9 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->optimize_coefficients = 0; } - cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; + cpi->mb.fwd_txm4x4 = vp9_fdct4x4; if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) { - cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; + cpi->mb.fwd_txm4x4 = vp9_fwht4x4; } if (cpi->sf.subpel_search_method == SUBPEL_ITERATIVE) { @@ -1026,11 +1032,6 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) { CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok))); } - // Data used for real time vc mode to see if gf needs refreshing - cpi->inter_zz_count = 0; - cpi->gf_bad_count = 0; - cpi->gf_update_recommended = 0; - vpx_free(cpi->mb_activity_map); CHECK_MEM_ERROR(cm, cpi->mb_activity_map, vpx_calloc(sizeof(unsigned int), @@ -1040,6 +1041,19 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) { CHECK_MEM_ERROR(cm, cpi->mb_norm_activity_map, vpx_calloc(sizeof(unsigned int), cm->mb_rows * cm->mb_cols)); + + // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm + // block where mi unit size is 8x8. 
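One recurring substitution across these files deserves a note before the allocation that the comment above announces: vp9_mbgraph.c earlier in this section, and vp9_onyx_if.c below, replace every open-coded &cm->yv12_fb[cm->new_fb_idx] with get_frame_new_buffer(cm). The accessor's definition is outside this diff, but the one-for-one substitution only holds if it is the obvious inline, presumably in vp9_onyxc_int.h:

/* Presumed definition (hedged): every call site in this patch swaps it
 * in for &cm->yv12_fb[cm->new_fb_idx] with no other change. */
static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
  return &cm->yv12_fb[cm->new_fb_idx];
}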
+ vpx_free(cpi->above_context[0]); + CHECK_MEM_ERROR(cm, cpi->above_context[0], + vpx_calloc(2 * mi_cols_aligned_to_sb(cm->mi_cols) * + MAX_MB_PLANE, + sizeof(*cpi->above_context[0]))); + + vpx_free(cpi->above_seg_context); + CHECK_MEM_ERROR(cm, cpi->above_seg_context, + vpx_calloc(mi_cols_aligned_to_sb(cm->mi_cols), + sizeof(*cpi->above_seg_context))); } @@ -1072,6 +1086,15 @@ static void update_frame_size(VP9_COMP *cpi) { vp9_init_dsmotion_compensation(&cpi->mb, y_stride); } } + + { + int i; + for (i = 1; i < MAX_MB_PLANE; ++i) { + cpi->above_context[i] = cpi->above_context[0] + + i * sizeof(*cpi->above_context[0]) * 2 * + mi_cols_aligned_to_sb(cm->mi_cols); + } + } } @@ -1157,7 +1180,6 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { int i; cpi->oxcf = *oxcf; - cpi->goldfreq = 7; cm->version = oxcf->version; @@ -2669,8 +2691,7 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { vp9_clear_system_state(); // __asm emms; - recon_err = vp9_calc_ss_err(cpi->Source, - &cm->yv12_fb[cm->new_fb_idx]); + recon_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm)); if (cpi->twopass.total_left_stats.coded_error != 0.0) fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d" @@ -2829,19 +2850,11 @@ static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi, if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { cpi->active_best_quality = cpi->cq_target_quality; } else { -#ifdef ONE_SHOT_Q_ESTIMATE -#ifdef STRICT_ONE_SHOT_Q - cpi->active_best_quality = q; -#else - cpi->active_best_quality = inter_minq[q]; -#endif -#else cpi->active_best_quality = inter_minq[q]; // 1-pass: for now, use the average Q for the active_best, if its lower // than active_worst. - if (cpi->pass == 0 && (cpi->avg_frame_qindex < cpi->active_worst_quality)) + if (cpi->pass == 0 && (cpi->avg_frame_qindex < q)) cpi->active_best_quality = inter_minq[cpi->avg_frame_qindex]; -#endif // For the constrained quality mode we don't want // q to fall below the cq level. @@ -2875,7 +2888,14 @@ static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi, if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) { *top_index = (cpi->active_worst_quality + cpi->active_best_quality * 3) / 4; + // If this is the first (key) frame in 1-pass, active best is the user + // best-allowed, and leave the top_index to active_worst. + if (cpi->pass == 0 && cpi->common.current_video_frame == 0) { + cpi->active_best_quality = cpi->oxcf.best_allowed_q; + *top_index = cpi->oxcf.worst_allowed_q; + } } else if (!cpi->is_src_frame_alt_ref && + (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { *top_index = (cpi->active_worst_quality + cpi->active_best_quality) / 2; @@ -3169,8 +3189,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Special case handling for forced key frames if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) { int last_q = q; - int kf_err = vp9_calc_ss_err(cpi->Source, - &cm->yv12_fb[cm->new_fb_idx]); + int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm)); int high_err_target = cpi->ambient_err; int low_err_target = cpi->ambient_err >> 1; @@ -3306,14 +3325,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // fixed interval. 
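Taken together, the calloc above and the pointer fix-up in update_frame_size() define the new per-encoder context layout: one allocation holding MAX_MB_PLANE planes, each plane getting two ENTROPY_CONTEXT entries per mi column, one for each 4x4 transform column inside an 8x8 mi unit. ENTROPY_CONTEXT is a char in this tree, so the extra sizeof(*cpi->above_context[0]) factor in the update_frame_size arithmetic evaluates to 1 and is harmless; stripped of it, the layout reduces to this hedged restatement:

/* Equivalent of the two hunks above (CHECK_MEM_ERROR elided):
 * plane i begins 2 * aligned_mi_cols entries after plane i - 1. */
static void alloc_above_contexts(VP9_COMP *cpi, VP9_COMMON *cm) {
  const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
  int i;
  cpi->above_context[0] = vpx_calloc(2 * aligned_mi_cols * MAX_MB_PLANE,
                                     sizeof(*cpi->above_context[0]));
  for (i = 1; i < MAX_MB_PLANE; ++i)
    cpi->above_context[i] = cpi->above_context[0] + i * 2 * aligned_mi_cols;
}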
Note the reconstruction error if it is the frame before // the force key frame if (cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0)) { - cpi->ambient_err = vp9_calc_ss_err(cpi->Source, - &cm->yv12_fb[cm->new_fb_idx]); + cpi->ambient_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm)); } if (cm->frame_type == KEY_FRAME) cpi->refresh_last_frame = 1; - cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx]; + cm->frame_to_show = get_frame_new_buffer(cm); #if WRITE_RECON_BUFFER if (cm->show_frame) @@ -3912,7 +3930,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, cm->frame_flags = *frame_flags; // Reset the frame pointers to the current frame size - vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx], + vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, VP9BORDERINPIXELS); diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index b1dfcbb9c..9429c7fed 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -29,11 +29,6 @@ #include "vp9/common/vp9_findnearmv.h" #include "vp9/encoder/vp9_lookahead.h" -// Experimental rate control switches -#if CONFIG_ONESHOTQ -#define ONE_SHOT_Q_ESTIMATE 0 -#define STRICT_ONE_SHOT_Q 0 -#endif #define DISABLE_RC_LONG_TERM_MEM 0 // #define MODE_TEST_HIT_STATS @@ -396,9 +391,9 @@ typedef struct VP9_COMP { // FIXME(rbultje) can this overflow? int rd_tx_select_threshes[4][TX_MODES]; - int64_t rd_filter_diff[SWITCHABLE_FILTERS + 1]; - int64_t rd_filter_threshes[4][SWITCHABLE_FILTERS + 1]; - int64_t rd_filter_cache[SWITCHABLE_FILTERS + 1]; + int64_t rd_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; + int64_t rd_filter_threshes[4][SWITCHABLE_FILTER_CONTEXTS]; + int64_t rd_filter_cache[SWITCHABLE_FILTER_CONTEXTS]; int RDMULT; int RDDIV; @@ -506,14 +501,9 @@ typedef struct VP9_COMP { int decimation_count; // for real time encoding - int avg_encode_time; // microsecond - int avg_pick_mode_time; // microsecond int speed; - unsigned int cpu_freq; // Mhz int compressor_speed; - int interquantizer; - int goldfreq; int auto_worst_q; int cpu_used; int pass; @@ -529,12 +519,6 @@ typedef struct VP9_COMP { unsigned int max_mv_magnitude; int mv_step_param; - // Data used for real time conferencing mode to help determine if it - // would be good to update the gf - int inter_zz_count; - int gf_bad_count; - int gf_update_recommended; - unsigned char *segmentation_map; // segment threashold for encode breakout @@ -641,7 +625,7 @@ typedef struct VP9_COMP { int dummy_packing; /* flag to indicate if packing is dummy */ - unsigned int switchable_interp_count[SWITCHABLE_FILTERS + 1] + unsigned int switchable_interp_count[SWITCHABLE_FILTER_CONTEXTS] [SWITCHABLE_FILTERS]; unsigned int tx_stepdown_count[TX_SIZES]; @@ -675,6 +659,13 @@ typedef struct VP9_COMP { // Debug / test stats int64_t mode_test_hits[BLOCK_SIZES]; #endif + + /* Y,U,V,(A) */ + ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; + ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; + + PARTITION_CONTEXT *above_seg_context; + PARTITION_CONTEXT left_seg_context[8]; } VP9_COMP; static int get_ref_frame_idx(VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 7ad8d1fb2..fca752524 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -22,12 +22,14 @@ extern int enc_debug; #endif -void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, - 
int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, int16_t *dequant_ptr, - int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { +void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { int i, rc, eob; int zbins[2], nzbins[2], zbin; int x, y, z, sz; @@ -86,14 +88,15 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, *eob_ptr = eob + 1; } -void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, +void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - int16_t *zbin_ptr, int16_t *round_ptr, - int16_t *quant_ptr, int16_t *quant_shift_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, - int16_t *dequant_ptr, int zbin_oq_value, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { int i, rc, eob; int zbins[2], nzbins[2]; int x, y, z, sz; @@ -174,25 +177,19 @@ static INLINE struct plane_block_idx plane_block_idx(int y_blocks, return res; } -void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, - int y_blocks) { - MACROBLOCKD *const xd = &mb->e_mbd; +void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx, + const int16_t *scan, const int16_t *iscan) { + MACROBLOCKD *const xd = &x->e_mbd; const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx); - const int16_t *scan = get_scan_4x4(tx_type); - const int16_t *iscan = get_iscan_4x4(tx_type); - - vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block), - 16, mb->skip_block, - mb->plane[pb_idx.plane].zbin, - mb->plane[pb_idx.plane].round, - mb->plane[pb_idx.plane].quant, - mb->plane[pb_idx.plane].quant_shift, - BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block), - BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block), - xd->plane[pb_idx.plane].dequant, - mb->plane[pb_idx.plane].zbin_extra, - &xd->plane[pb_idx.plane].eobs[pb_idx.block], - scan, iscan); + struct macroblock_plane* p = &x->plane[pb_idx.plane]; + struct macroblockd_plane* pd = &xd->plane[pb_idx.plane]; + + vp9_quantize_b(BLOCK_OFFSET(p->coeff, pb_idx.block), + 16, x->skip_block, + p->zbin, p->round, p->quant, p->quant_shift, + BLOCK_OFFSET(pd->qcoeff, pb_idx.block), + BLOCK_OFFSET(pd->dqcoeff, pb_idx.block), + pd->dequant, p->zbin_extra, &pd->eobs[pb_idx.block], scan, iscan); } static void invert_quant(int16_t *quant, int16_t *shift, int d) { diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index 459aa3359..c078e1d41 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -13,8 +13,9 @@ #include "vp9/encoder/vp9_block.h" -void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, - int y_blocks); +void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx, + const int16_t *scan, const int16_t *iscan); + struct VP9_COMP; void vp9_set_quantizer(struct VP9_COMP *cpi, int q); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index f166b10a1..993919e5b 100644 --- 
a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -251,8 +251,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs); for (i = 0; i < PARTITION_CONTEXTS; i++) - vp9_cost_tokens(cpi->mb.partition_cost[i], - cm->fc.partition_prob[cm->frame_type][i], + vp9_cost_tokens(cpi->mb.partition_cost[i], get_partition_probs(cm, i), vp9_partition_tree); /*rough estimate for costing*/ @@ -611,7 +610,7 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, // TODO(jingning): temporarily enabled only for luma component rd = MIN(rd1, rd2); if (plane == 0) - x->zcoeff_blk[tx_size][block] = rd1 > rd2; + x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block]; args->this_rate += args->rate; args->this_dist += args->dist; @@ -933,14 +932,15 @@ static void super_block_yrd(VP9_COMP *cpi, MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack; + const int b_inter_mode = is_inter_block(mbmi); assert(bs == mbmi->sb_type); - if (mbmi->ref_frame[0] > INTRA_FRAME) + if (b_inter_mode) vp9_subtract_sby(x, bs); if (cpi->sf.tx_size_search_method == USE_LARGESTALL || (cpi->sf.tx_size_search_method != USE_FULL_RD && - mbmi->ref_frame[0] == INTRA_FRAME)) { + !b_inter_mode)) { vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, ref_best_rd, bs); @@ -950,7 +950,7 @@ static void super_block_yrd(VP9_COMP *cpi, } if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER && - mbmi->ref_frame[0] > INTRA_FRAME) { + b_inter_mode) { if (bs >= BLOCK_32X32) model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]); @@ -1031,10 +1031,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, ENTROPY_CONTEXT ta[2], tempa[2]; ENTROPY_CONTEXT tl[2], templ[2]; - TX_TYPE tx_type = DCT_DCT; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; - int idx, idy, block; + int idx, idy; uint8_t best_dst[8 * 8]; assert(ib < 4); @@ -1070,8 +1070,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, const int16_t *nb; uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride; uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride; - - block = ib + idy * 2 + idx; + const int block = ib + idy * 2 + idx; + TX_TYPE tx_type; xd->mi_8x8[0]->bmi[block].as_mode = mode; src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); coeff = BLOCK_OFFSET(x->plane[0].coeff, block); @@ -1085,13 +1085,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, dst, dst_stride); tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block); + get_scan_nb_4x4(tx_type, &scan, &nb); + if (tx_type != DCT_DCT) vp9_short_fht4x4(src_diff, coeff, 8, tx_type); else x->fwd_txm4x4(src_diff, coeff, 8); - vp9_regular_quantize_b_4x4(x, block, tx_type, 16); - get_scan_nb_4x4(tx_type, &scan, &nb); + vp9_regular_quantize_b_4x4(x, 4, block, scan, get_iscan_4x4(tx_type)); + ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4, scan, nb); distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block), @@ -1431,10 +1433,6 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], int *rate_mv); -static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - 
BLOCK_SIZE bsize, - int mi_row, int mi_col, - int_mv *tmp_mv, int *rate_mv); static int labels2mode(MACROBLOCK *x, int i, MB_PREDICTION_MODE this_mode, @@ -1561,7 +1559,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, coeff = BLOCK_OFFSET(p->coeff, k); x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), coeff, 8); - vp9_regular_quantize_b_4x4(x, k, DCT_DCT, 16); + vp9_regular_quantize_b_4x4(x, 4, k, get_scan_4x4(DCT_DCT), + get_iscan_4x4(DCT_DCT)); thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz); thissse += ssz; @@ -1645,6 +1644,7 @@ static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src, } static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, BEST_SEG_INFO *bsi_buf, int filter_idx, int_mv seg_mvs[4][MAX_REF_FRAMES], int mi_row, int mi_col) { @@ -1653,6 +1653,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, MB_PREDICTION_MODE this_mode; MODE_INFO *mi = x->e_mbd.mi_8x8[0]; MB_MODE_INFO *const mbmi = &mi->mbmi; + struct macroblockd_plane *const pd = &x->e_mbd.plane[0]; const int label_count = 4; int64_t this_segment_rd = 0; int label_mv_thresh; @@ -1667,8 +1668,8 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, int subpelmv = 1, have_ref = 0; const int has_second_rf = has_second_ref(mbmi); - vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above)); - vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left)); + vpx_memcpy(t_above, pd->above_context, sizeof(t_above)); + vpx_memcpy(t_left, pd->left_context, sizeof(t_left)); v_fn_ptr = &cpi->fn_ptr[bsize]; @@ -1690,13 +1691,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, i = idy * 2 + idx; frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0; - vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, + vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile, &frame_mv[NEARESTMV][mbmi->ref_frame[0]], &frame_mv[NEARMV][mbmi->ref_frame[0]], i, 0, mi_row, mi_col); if (has_second_rf) { frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0; - vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, + vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile, &frame_mv[NEARESTMV][mbmi->ref_frame[1]], &frame_mv[NEARMV][mbmi->ref_frame[1]], i, 1, mi_row, mi_col); @@ -1746,7 +1747,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, } } - vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre)); + vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre)); vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above, sizeof(bsi->rdstat[i][mode_idx].ta)); vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left, @@ -1870,12 +1871,14 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, mi_buf_restore(x, orig_src, orig_pre); } - if (has_second_rf && this_mode == NEWMV && - mbmi->interp_filter == EIGHTTAP) { + if (has_second_rf) { if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV || seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) continue; + } + if (has_second_rf && this_mode == NEWMV && + mbmi->interp_filter == EIGHTTAP) { // adjust src pointers mi_buf_shift(x, i); if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { @@ -1950,6 +1953,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx], sizeof(SEG_RDSTAT)); + if (num_4x4_blocks_wide > 1) + bsi->rdstat[i + 1][mode_idx].eobs = + ref_bsi->rdstat[i + 
1][mode_idx].eobs; + if (num_4x4_blocks_high > 1) + bsi->rdstat[i + 2][mode_idx].eobs = + ref_bsi->rdstat[i + 2][mode_idx].eobs; + if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { mode_selected = this_mode; best_rd = bsi->rdstat[i][mode_idx].brdcost; @@ -1970,7 +1980,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv, bsi->rdstat[i][mode_idx].brate, 0); bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate; - bsi->rdstat[i][mode_idx].eobs = x->e_mbd.plane[0].eobs[i]; + bsi->rdstat[i][mode_idx].eobs = pd->eobs[i]; + if (num_4x4_blocks_wide > 1) + bsi->rdstat[i + 1][mode_idx].eobs = pd->eobs[i + 1]; + if (num_4x4_blocks_high > 1) + bsi->rdstat[i + 2][mode_idx].eobs = pd->eobs[i + 2]; } if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { @@ -2026,6 +2040,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, } static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, int_mv *best_ref_mv, int_mv *second_best_ref_mv, int64_t best_rd, @@ -2056,7 +2071,8 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < 4; i++) bsi->modes[i] = ZEROMV; - rd_check_segment_txsize(cpi, x, bsi_buf, filter_idx, seg_mvs, mi_row, mi_col); + rd_check_segment_txsize(cpi, x, tile, bsi_buf, filter_idx, seg_mvs, + mi_row, mi_col); if (bsi->segment_rd > best_rd) return INT64_MAX; @@ -2204,7 +2220,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int_mv *second_ref_mv, int64_t comp_pred_diff[NB_PREDICTION_TYPES], int64_t tx_size_diff[TX_MODES], - int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]) { + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) { MACROBLOCKD *const xd = &x->e_mbd; // Take a snapshot of the coding context so it can be @@ -2222,7 +2238,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff)); vpx_memcpy(ctx->best_filter_diff, best_filter_diff, - sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1)); + sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS); } static void setup_pred_block(const MACROBLOCKD *xd, @@ -2252,6 +2268,7 @@ static void setup_pred_block(const MACROBLOCKD *xd, } static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, int idx, MV_REFERENCE_FRAME frame_type, BLOCK_SIZE block_size, int mi_row, int mi_col, @@ -2267,12 +2284,8 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, // set up scaling factors scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1]; - scale[frame_type].x_offset_q4 = - ROUND_POWER_OF_TWO(mi_col * MI_SIZE * scale[frame_type].x_scale_fp, - REF_SCALE_SHIFT) & 0xf; - scale[frame_type].y_offset_q4 = - ROUND_POWER_OF_TWO(mi_row * MI_SIZE * scale[frame_type].y_scale_fp, - REF_SCALE_SHIFT) & 0xf; + scale[frame_type].sfc->set_scaled_offsets(&scale[frame_type], + mi_row * MI_SIZE, mi_col * MI_SIZE); // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this // use the UV scaling factors. 
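The setup_buffer_inter() hunk above deletes the open-coded sub-pel offsets and calls the sfc->set_scaled_offsets hook instead, now passing pixel coordinates (mi_row * MI_SIZE, mi_col * MI_SIZE). For the substitution to preserve behaviour, the hook must reduce to the deleted arithmetic; restated as a function (a hedged equivalence, with x_scale_fp and y_scale_fp assumed to have moved into the shared scale_factors_common):

/* What the deleted lines computed: the 1/16-pel phase of the scaled
 * reference at this block position.  row and col are pixel coordinates. */
static void set_offsets_with_scaling(struct scale_factors *scale,
                                     int row, int col) {
  scale->x_offset_q4 = ROUND_POWER_OF_TWO(col * scale->sfc->x_scale_fp,
                                          REF_SCALE_SHIFT) & 0xf;
  scale->y_offset_q4 = ROUND_POWER_OF_TWO(row * scale->sfc->y_scale_fp,
                                          REF_SCALE_SHIFT) & 0xf;
}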
@@ -2280,7 +2293,7 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, &scale[frame_type], &scale[frame_type]); // Gets an initial list of candidate vectors from neighbours and orders them - vp9_find_mv_refs(&cpi->common, xd, xd->mi_8x8[0], + vp9_find_mv_refs(cm, xd, tile, xd->mi_8x8[0], xd->last_mi, frame_type, mbmi->ref_mvs[frame_type], mi_row, mi_col); @@ -2294,7 +2307,7 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, // Further refinement that is encode side only to test the top few candidates // in full and choose the best as the centre point for subsequent searches. // The current implementation doesn't support scaling. - if (!vp9_is_scaled(&scale[frame_type]) && block_size >= BLOCK_8X8) + if (!vp9_is_scaled(scale[frame_type].sfc) && block_size >= BLOCK_8X8) mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride, frame_type, block_size); } @@ -2317,6 +2330,7 @@ static INLINE int get_switchable_rate(const MACROBLOCK *x) { } static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, BLOCK_SIZE bsize, int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv) { @@ -2501,9 +2515,9 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, setup_pre_planes(xd, 1, scaled_ref_frame[1], mi_row, mi_col, NULL); } - xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0], + xd->scale_factor[0].sfc->set_scaled_offsets(&xd->scale_factor[0], mi_row, mi_col); - xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1], + xd->scale_factor[1].sfc->set_scaled_offsets(&xd->scale_factor[1], mi_row, mi_col); scaled_first_yv12 = xd->plane[0].pre[0]; @@ -2613,6 +2627,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, BLOCK_SIZE bsize, int64_t txfm_cache[], int *rate2, int64_t *distortion, @@ -2620,7 +2635,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate_y, int64_t *distortion_y, int *rate_uv, int64_t *distortion_uv, int *mode_excluded, int *disable_skip, - INTERPOLATIONFILTERTYPE *best_filter, + INTERPOLATION_TYPE *best_filter, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], @@ -2647,6 +2662,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int orig_dst_stride[MAX_MB_PLANE]; int rs = 0; + if (is_comp_pred) { + if (frame_mv[refs[0]].as_int == INVALID_MV || + frame_mv[refs[1]].as_int == INVALID_MV) + return INT64_MAX; + } + if (this_mode == NEWMV) { int rate_mv; if (is_comp_pred) { @@ -2665,13 +2686,11 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, &mbmi->ref_mvs[refs[1]][0].as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } - if (frame_mv[refs[0]].as_int == INVALID_MV || - frame_mv[refs[1]].as_int == INVALID_MV) - return INT64_MAX; *rate2 += rate_mv; } else { int_mv tmp_mv; - single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv); + single_motion_search(cpi, x, tile, bsize, mi_row, mi_col, + &tmp_mv, &rate_mv); *rate2 += rate_mv; frame_mv[refs[0]].as_int = xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int; @@ -3082,6 +3101,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, int mi_row, int mi_col, int *returnrate, int64_t *returndistortion, @@ -3111,8 +3131,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t best_tx_diff[TX_MODES]; int64_t 
best_pred_diff[NB_PREDICTION_TYPES]; int64_t best_pred_rd[NB_PREDICTION_TYPES]; - int64_t best_filter_rd[SWITCHABLE_FILTERS + 1]; - int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]; + int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS]; + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; MB_MODE_INFO best_mbmode = { 0 }; int j; int mode_index, best_mode_index = 0; @@ -3122,7 +3142,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t best_inter_rd = INT64_MAX; MB_PREDICTION_MODE best_intra_mode = DC_PRED; MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME; - INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE; + INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE; int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES]; int64_t dist_uv[TX_SIZES]; int skip_uv[TX_SIZES]; @@ -3150,7 +3170,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_pred_rd[i] = INT64_MAX; for (i = 0; i < TX_MODES; i++) best_tx_rd[i] = INT64_MAX; - for (i = 0; i <= SWITCHABLE_FILTERS; i++) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) best_filter_rd[i] = INT64_MAX; for (i = 0; i < TX_SIZES; i++) rate_uv_intra[i] = INT_MAX; @@ -3192,8 +3212,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { if (cpi->ref_frame_flags & flag_list[ref_frame]) { - setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size, - mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], + setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame, + block_size, mi_row, mi_col, + frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb, scale_factor); } frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; @@ -3437,7 +3458,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } else { mbmi->mode = this_mode; compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME); - this_rd = handle_inter_mode(cpi, x, bsize, + this_rd = handle_inter_mode(cpi, x, tile, bsize, tx_cache, &rate2, &distortion2, &skippable, &rate_y, &distortion_y, @@ -3520,17 +3541,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } // Keep record of best intra rd - if (xd->mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME && - is_intra_mode(xd->mi_8x8[0]->mbmi.mode) && + if (!is_inter_block(&xd->mi_8x8[0]->mbmi) && this_rd < best_intra_rd) { best_intra_rd = this_rd; best_intra_mode = xd->mi_8x8[0]->mbmi.mode; } + // Keep record of best inter rd with single reference - if (xd->mi_8x8[0]->mbmi.ref_frame[0] > INTRA_FRAME && - xd->mi_8x8[0]->mbmi.ref_frame[1] == NONE && - !mode_excluded && - this_rd < best_inter_rd) { + if (is_inter_block(&xd->mi_8x8[0]->mbmi) && + !has_second_ref(&xd->mi_8x8[0]->mbmi) && + !mode_excluded && this_rd < best_inter_rd) { best_inter_rd = this_rd; best_inter_ref_frame = ref_frame; } @@ -3538,7 +3558,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (!disable_skip && ref_frame == INTRA_FRAME) { for (i = 0; i < NB_PREDICTION_TYPES; ++i) best_pred_rd[i] = MIN(best_pred_rd[i], this_rd); - for (i = 0; i <= SWITCHABLE_FILTERS; i++) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) best_filter_rd[i] = MIN(best_filter_rd[i], this_rd); } @@ -3621,7 +3641,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, cm->mcomp_filter_type != BILINEAR) { int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ? 
SWITCHABLE_FILTERS : cm->mcomp_filter_type]; - for (i = 0; i <= SWITCHABLE_FILTERS; i++) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { int64_t adj_rd; // In cases of poor prediction, filter_cache[] can contain really big // values, which actually are bigger than this_rd itself. This can @@ -3743,7 +3763,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } if (!x->skip) { - for (i = 0; i <= SWITCHABLE_FILTERS; i++) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { if (best_filter_rd[i] == INT64_MAX) best_filter_diff[i] = 0; else @@ -3779,6 +3799,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, int mi_row, int mi_col, int *returnrate, int64_t *returndistortion, @@ -3807,15 +3828,15 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int64_t best_tx_diff[TX_MODES]; int64_t best_pred_diff[NB_PREDICTION_TYPES]; int64_t best_pred_rd[NB_PREDICTION_TYPES]; - int64_t best_filter_rd[SWITCHABLE_FILTERS + 1]; - int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]; + int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS]; + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; MB_MODE_INFO best_mbmode = { 0 }; int mode_index, best_mode_index = 0; unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES]; vp9_prob comp_mode_p; int64_t best_inter_rd = INT64_MAX; MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME; - INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE; + INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE; int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES]; int64_t dist_uv[TX_SIZES]; int skip_uv[TX_SIZES]; @@ -3830,7 +3851,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int best_skip2 = 0; x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH; - vp9_zero(x->zcoeff_blk); + vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4); for (i = 0; i < 4; i++) { int j; @@ -3845,7 +3866,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, best_pred_rd[i] = INT64_MAX; for (i = 0; i < TX_MODES; i++) best_tx_rd[i] = INT64_MAX; - for (i = 0; i <= SWITCHABLE_FILTERS; i++) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) best_filter_rd[i] = INT64_MAX; for (i = 0; i < TX_SIZES; i++) rate_uv_intra[i] = INT_MAX; @@ -3863,8 +3884,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { if (cpi->ref_frame_flags & flag_list[ref_frame]) { - setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size, - mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], + setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame, + block_size, mi_row, mi_col, + frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb, scale_factor); } frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; @@ -3962,11 +3984,11 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // TODO(jingning, jkoleszar): scaling reference frame not supported for // sub8x8 blocks. 
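The ref-frame checks that open the next hunk now hand vp9_is_scaled() the shared scale_factors_common rather than the per-instance wrapper. The predicate's body is outside this diff; assuming the usual fixed-point convention, where REF_NO_SCALE (1 << REF_SCALE_SHIFT) encodes a 1:1 ratio, it is presumably:

/* Presumed predicate (hedged; the real definition lives in vp9_scale.h).
 * A reference is "scaled" when either axis ratio differs from 1:1. */
static INLINE int vp9_is_scaled(const struct scale_factors_common *sfc) {
  return sfc->x_scale_fp != REF_NO_SCALE || sfc->y_scale_fp != REF_NO_SCALE;
}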
if (ref_frame > 0 && - vp9_is_scaled(&scale_factor[ref_frame])) + vp9_is_scaled(scale_factor[ref_frame].sfc)) continue; if (second_ref_frame > 0 && - vp9_is_scaled(&scale_factor[second_ref_frame])) + vp9_is_scaled(scale_factor[second_ref_frame].sfc)) continue; set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor); @@ -4094,7 +4116,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, mbmi->interp_filter = switchable_filter_index; vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); - tmp_rd = rd_pick_best_mbsegmentation(cpi, x, + tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile, &mbmi->ref_mvs[ref_frame][0], second_ref, best_yrd, @@ -4130,8 +4152,10 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, tmp_best_sse = total_sse; tmp_best_skippable = skippable; tmp_best_mbmode = *mbmi; - for (i = 0; i < 4; i++) + for (i = 0; i < 4; i++) { tmp_best_bmodes[i] = xd->mi_8x8[0]->bmi[i]; + x->zcoeff_blk[TX_4X4][i] = !xd->plane[0].eobs[i]; + } pred_exists = 1; if (switchable_filter_index == 0 && cpi->sf.use_rd_breakout && @@ -4158,7 +4182,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, if (!pred_exists) { // Handles the special case when a filter that is not in the // switchable list (bilinear, 6-tap) is indicated at the frame level - tmp_rd = rd_pick_best_mbsegmentation(cpi, x, + tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile, &mbmi->ref_mvs[ref_frame][0], second_ref, best_yrd, @@ -4286,7 +4310,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, if (!disable_skip && ref_frame == INTRA_FRAME) { for (i = 0; i < NB_PREDICTION_TYPES; ++i) best_pred_rd[i] = MIN(best_pred_rd[i], this_rd); - for (i = 0; i <= SWITCHABLE_FILTERS; i++) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) best_filter_rd[i] = MIN(best_filter_rd[i], this_rd); } @@ -4364,7 +4388,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, cm->mcomp_filter_type != BILINEAR) { int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ? SWITCHABLE_FILTERS : cm->mcomp_filter_type]; - for (i = 0; i <= SWITCHABLE_FILTERS; i++) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { int64_t adj_rd; // In cases of poor prediction, filter_cache[] can contain really big // values, which actually are bigger than this_rd itself. 
This can @@ -4480,7 +4504,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, } if (!x->skip) { - for (i = 0; i <= SWITCHABLE_FILTERS; i++) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { if (best_filter_rd[i] == INT64_MAX) best_filter_diff[i] = 0; else diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index 0b0bb18d7..92fb23548 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -18,6 +18,8 @@ (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM)) #define QIDX_SKIP_THRESH 115 +struct TileInfo; + int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex); void vp9_initialize_rd_consts(VP9_COMP *cpi); @@ -29,14 +31,22 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int64_t best_rd); int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, + const struct TileInfo *const tile, int mi_row, int mi_col, - int *r, int64_t *d, BLOCK_SIZE bsize, - PICK_MODE_CONTEXT *ctx, int64_t best_rd); + int *returnrate, + int64_t *returndistortion, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far); int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, + const struct TileInfo *const tile, int mi_row, int mi_col, - int *r, int64_t *d, BLOCK_SIZE bsize, - PICK_MODE_CONTEXT *ctx, int64_t best_rd); + int *returnrate, + int64_t *returndistortion, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far); void vp9_init_me_luts(); diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c index 72e6be1e8..24f011f83 100644 --- a/vp9/encoder/vp9_segmentation.c +++ b/vp9/encoder/vp9_segmentation.c @@ -117,7 +117,8 @@ static int cost_segmap(int *segcounts, vp9_prob *probs) { return cost; } -static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8, +static void count_segs(VP9_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mi_8x8, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, @@ -132,7 +133,7 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8, xd->mi_8x8 = mi_8x8; segment_id = xd->mi_8x8[0]->mbmi.segment_id; - set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw); + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); // Count the number of hits on each segment with no prediction no_pred_segcounts[segment_id]++; @@ -157,7 +158,8 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8, } } -static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, +static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mi_8x8, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, @@ -175,19 +177,20 @@ static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type]; if (bw == bs && bh == bs) { - count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count, + count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, bs, mi_row, mi_col); } else if (bw == bs && bh < bs) { - count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count, + count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, hbs, mi_row, mi_col); - count_segs(cpi, mi_8x8 + hbs * mis, no_pred_segcounts, + count_segs(cpi, tile, mi_8x8 + hbs * mis, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, hbs, mi_row + hbs, mi_col); } else if (bw < bs && bh == bs) { - count_segs(cpi, mi_8x8, no_pred_segcounts, 
temporal_predictor_count, + count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row, mi_col); - count_segs(cpi, mi_8x8 + hbs, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs); + count_segs(cpi, tile, mi_8x8 + hbs, + no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, + hbs, bs, mi_row, mi_col + hbs); } else { const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize]; int n; @@ -198,7 +201,7 @@ static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, const int mi_dc = hbs * (n & 1); const int mi_dr = hbs * (n >> 1); - count_segs_sb(cpi, &mi_8x8[mi_dr * mis + mi_dc], + count_segs_sb(cpi, tile, &mi_8x8[mi_dr * mis + mi_dc], no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, mi_row + mi_dr, mi_col + mi_dc, subsize); @@ -234,15 +237,18 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { // First of all generate stats regarding how well the last segment map // predicts this one for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) { - vp9_get_tile_col_offsets(cm, tile_col); - mi_ptr = cm->mi_grid_visible + cm->cur_tile_mi_col_start; + TileInfo tile; + + vp9_tile_init(&tile, cm, 0, tile_col); + mi_ptr = cm->mi_grid_visible + tile.mi_col_start; for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) { mi = mi_ptr; - for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; mi_col += 8, mi += 8) - count_segs_sb(cpi, mi, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, mi_row, mi_col, BLOCK_64X64); + count_segs_sb(cpi, &tile, mi, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, + mi_row, mi_col, BLOCK_64X64); } } diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c index eb864d96c..387fc9056 100644 --- a/vp9/encoder/vp9_subexp.c +++ b/vp9/encoder/vp9_subexp.c @@ -221,7 +221,7 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, } void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp, - unsigned int *ct) { + const unsigned int ct[2]) { const vp9_prob upd = DIFF_UPDATE_PROB; vp9_prob newp = get_binary_prob(ct[0], ct[1]); const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp, diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 6ea05793d..2cace0378 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -38,14 +38,15 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, int stride, int mv_row, int mv_col, - uint8_t *pred) { + uint8_t *pred, + struct scale_factors *scale) { const int which_mv = 0; MV mv = { mv_row, mv_col }; vp9_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, - &xd->scale_factor[which_mv], + scale, 16, 16, which_mv, &xd->subpix, MV_PRECISION_Q3); @@ -55,7 +56,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, vp9_build_inter_predictor(u_mb_ptr, stride, &pred[256], 8, &mv, - &xd->scale_factor[which_mv], + scale, 8, 8, which_mv, &xd->subpix, MV_PRECISION_Q4); @@ -63,7 +64,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, vp9_build_inter_predictor(v_mb_ptr, stride, &pred[320], 8, &mv, - &xd->scale_factor[which_mv], + scale, 8, 8, which_mv, &xd->subpix, MV_PRECISION_Q4); @@ -186,7 +187,8 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, static void temporal_filter_iterate_c(VP9_COMP *cpi, int frame_count, int 
alt_ref_index, - int strength) { + int strength, + struct scale_factors *scale) { int byte; int frame; int mb_col, mb_row; @@ -280,7 +282,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, cpi->frames[frame]->y_stride, mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row, mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col, - predictor); + predictor, scale); // Apply the filter (YUV) vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, @@ -374,6 +376,9 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead) - (num_frames_backward + 1); + struct scale_factors scale; + struct scale_factors_common scale_comm; + switch (blur_type) { case 1: // Backward Blur @@ -432,9 +437,9 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { #endif // Setup scaling factors. Scaling on each of the arnr frames is not supported - vp9_setup_scale_factors_for_frame(&cpi->mb.e_mbd.scale_factor[0], - cm->yv12_fb[cm->new_fb_idx].y_crop_width, - cm->yv12_fb[cm->new_fb_idx].y_crop_height, + vp9_setup_scale_factors_for_frame(&scale, &scale_comm, + get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height, cm->width, cm->height); // Setup frame pointers, NULL indicates frame not included in filter @@ -447,7 +452,7 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { } temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward, - strength); + strength, &scale); } void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame, diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index 550263aa8..7d4676e97 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -21,14 +21,6 @@ #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_entropy.h" -/* Global event counters used for accumulating statistics across several - compressions, then generating vp9_context.c = initial stats. 
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 6ea05793d..2cace0378 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -38,14 +38,15 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                                             int stride,
                                             int mv_row,
                                             int mv_col,
-                                            uint8_t *pred) {
+                                            uint8_t *pred,
+                                            struct scale_factors *scale) {
   const int which_mv = 0;
   MV mv = { mv_row, mv_col };
 
   vp9_build_inter_predictor(y_mb_ptr, stride,
                             &pred[0], 16,
                             &mv,
-                            &xd->scale_factor[which_mv],
+                            scale,
                             16, 16,
                             which_mv,
                             &xd->subpix, MV_PRECISION_Q3);
@@ -55,7 +56,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
   vp9_build_inter_predictor(u_mb_ptr, stride,
                             &pred[256], 8,
                             &mv,
-                            &xd->scale_factor[which_mv],
+                            scale,
                             8, 8,
                             which_mv,
                             &xd->subpix, MV_PRECISION_Q4);
@@ -63,7 +64,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
   vp9_build_inter_predictor(v_mb_ptr, stride,
                             &pred[320], 8,
                             &mv,
-                            &xd->scale_factor[which_mv],
+                            scale,
                             8, 8,
                             which_mv,
                             &xd->subpix, MV_PRECISION_Q4);
@@ -186,7 +187,8 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
 static void temporal_filter_iterate_c(VP9_COMP *cpi,
                                       int frame_count,
                                       int alt_ref_index,
-                                      int strength) {
+                                      int strength,
+                                      struct scale_factors *scale) {
   int byte;
   int frame;
   int mb_col, mb_row;
@@ -280,7 +282,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
                                             cpi->frames[frame]->y_stride,
                                             mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row,
                                             mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col,
-                                            predictor);
+                                            predictor, scale);
 
         // Apply the filter (YUV)
         vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
@@ -374,6 +376,9 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
   const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
                                - (num_frames_backward + 1);
 
+  struct scale_factors scale;
+  struct scale_factors_common scale_comm;
+
   switch (blur_type) {
     case 1:
       // Backward Blur
@@ -432,9 +437,9 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
 #endif
 
   // Setup scaling factors. Scaling on each of the arnr frames is not supported
-  vp9_setup_scale_factors_for_frame(&cpi->mb.e_mbd.scale_factor[0],
-                                    cm->yv12_fb[cm->new_fb_idx].y_crop_width,
-                                    cm->yv12_fb[cm->new_fb_idx].y_crop_height,
+  vp9_setup_scale_factors_for_frame(&scale, &scale_comm,
+                                    get_frame_new_buffer(cm)->y_crop_width,
+                                    get_frame_new_buffer(cm)->y_crop_height,
                                     cm->width, cm->height);
 
   // Setup frame pointers, NULL indicates frame not included in filter
@@ -447,7 +452,7 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
   }
 
   temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
-                            strength);
+                            strength, &scale);
 }
 
 void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
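Note: the temporal filter no longer reads scale factors out of xd->scale_factor[]; vp9_temporal_filter_prepare() builds one scale_factors/scale_factors_common pair on the stack and threads it down explicitly. A sketch of the plumbing, using only the calls visible in the hunks above:

    /* Caller-owned scaling state, set up once per filter invocation. */
    struct scale_factors scale;
    struct scale_factors_common scale_comm;

    /* Map the new frame buffer's crop size onto the coded frame size. */
    vp9_setup_scale_factors_for_frame(&scale, &scale_comm,
                                      get_frame_new_buffer(cm)->y_crop_width,
                                      get_frame_new_buffer(cm)->y_crop_height,
                                      cm->width, cm->height);

    /* The same object is handed through every level of the filter ... */
    temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
                              strength, &scale);
    /* ... which forwards it to temporal_filter_predictors_mb_c() and on
     * to vp9_build_inter_predictor(..., scale, ...). */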
"Inter" : "Intra"); - band = 0; - do { - fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band); - pt = 0; - do { - fprintf(f, "%s\n {", Comma(pt)); - - t = 0; - do { - const int64_t x = context_counters[type][ref][band][pt][t]; - const int y = (int) x; - - assert(x == (int64_t) y); /* no overflow handling yet */ - fprintf(f, "%s %d", Comma(t), y); - } while (++t < 1 + MAX_ENTROPY_TOKENS); - fprintf(f, "}"); - } while (++pt < PREV_COEF_CONTEXTS); - fprintf(f, "\n }"); - } while (++band < COEF_BANDS); - fprintf(f, "\n }"); - } while (++ref < REF_TYPES); - fprintf(f, "\n }"); - } while (++type < block_types); - fprintf(f, "\n};\n"); -} - -static void print_probs(FILE *f, vp9_coeff_accum *context_counters, - int block_types, const char *header) { - int type, ref, band, pt, t; - - fprintf(f, "static const vp9_coeff_probs %s = {", header); - - type = 0; -#define Newline(x, spaces) (x ? " " : "\n" spaces) - do { - fprintf(f, "%s%s{ /* block Type %d */", - Comma(type), Newline(type, " "), type); - ref = 0; - do { - fprintf(f, "%s%s{ /* %s */", - Comma(band), Newline(band, " "), ref ? "Inter" : "Intra"); - band = 0; - do { - fprintf(f, "%s%s{ /* Coeff Band %d */", - Comma(band), Newline(band, " "), band); - pt = 0; - do { - unsigned int branch_ct[ENTROPY_NODES][2]; - unsigned int coef_counts[MAX_ENTROPY_TOKENS + 1]; - vp9_prob coef_probs[ENTROPY_NODES]; - - if (pt >= 3 && band == 0) - break; - for (t = 0; t < MAX_ENTROPY_TOKENS + 1; ++t) - coef_counts[t] = context_counters[type][ref][band][pt][t]; - vp9_tree_probs_from_distribution(vp9_coef_tree, coef_probs, - branch_ct, coef_counts, 0); - branch_ct[0][1] = coef_counts[MAX_ENTROPY_TOKENS] - branch_ct[0][0]; - coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]); - fprintf(f, "%s\n {", Comma(pt)); - - t = 0; - do { - fprintf(f, "%s %3d", Comma(t), coef_probs[t]); - } while (++t < ENTROPY_NODES); - - fprintf(f, " }"); - } while (++pt < PREV_COEF_CONTEXTS); - fprintf(f, "\n }"); - } while (++band < COEF_BANDS); - fprintf(f, "\n }"); - } while (++ref < REF_TYPES); - fprintf(f, "\n }"); - } while (++type < block_types); - fprintf(f, "\n};\n"); -} - -void print_context_counters() { - FILE *f = fopen("vp9_context.c", "w"); - - fprintf(f, "#include \"vp9_entropy.h\"\n"); - fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n"); - - /* print counts */ - print_counter(f, context_counters[TX_4X4], BLOCK_TYPES, - "vp9_default_coef_counts_4x4[BLOCK_TYPES]"); - print_counter(f, context_counters[TX_8X8], BLOCK_TYPES, - "vp9_default_coef_counts_8x8[BLOCK_TYPES]"); - print_counter(f, context_counters[TX_16X16], BLOCK_TYPES, - "vp9_default_coef_counts_16x16[BLOCK_TYPES]"); - print_counter(f, context_counters[TX_32X32], BLOCK_TYPES, - "vp9_default_coef_counts_32x32[BLOCK_TYPES]"); - - /* print coefficient probabilities */ - print_probs(f, context_counters[TX_4X4], BLOCK_TYPES, - "default_coef_probs_4x4[BLOCK_TYPES]"); - print_probs(f, context_counters[TX_8X8], BLOCK_TYPES, - "default_coef_probs_8x8[BLOCK_TYPES]"); - print_probs(f, context_counters[TX_16X16], BLOCK_TYPES, - "default_coef_probs_16x16[BLOCK_TYPES]"); - print_probs(f, context_counters[TX_32X32], BLOCK_TYPES, - "default_coef_probs_32x32[BLOCK_TYPES]"); - - fclose(f); - - f = fopen("context.bin", "wb"); - fwrite(context_counters, sizeof(context_counters), 1, f); - fclose(f); -} -#endif - void vp9_tokenize_initialize() { fill_value_tokens(); } diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h index b78e100ec..e24e31b80 100644 --- a/vp9/encoder/vp9_tokenize.h +++ 
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index b78e100ec..e24e31b80 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -28,9 +28,6 @@ typedef struct {
   uint8_t skip_eob_node;
 } TOKENEXTRA;
 
-typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
-                               [MAX_ENTROPY_TOKENS + 1];
-
 int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize);
 int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
                               int plane);
@@ -39,13 +36,6 @@ struct VP9_COMP;
 void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
                      BLOCK_SIZE bsize);
 
-#ifdef ENTROPY_STATS
-void init_context_counters();
-void print_context_counters();
-
-extern vp9_coeff_accum context_counters[TX_SIZES][BLOCK_TYPES];
-#endif
-
 extern const int *vp9_dct_value_cost_ptr;
 /* TODO: The Token field should be broken out into a separate char array to
  *  improve cache locality, since it's needed for costing when the rest of the
diff --git a/vp9/encoder/vp9_vaq.c b/vp9/encoder/vp9_vaq.c
index 3179ae301..1f9cb8709 100644
--- a/vp9/encoder/vp9_vaq.c
+++ b/vp9/encoder/vp9_vaq.c
@@ -118,8 +118,8 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
                       ((-xd->mb_to_bottom_edge) >> 3) : 0;
 
   if (right_overflow || bottom_overflow) {
-    int bw = (1 << (mi_width_log2(bs) + 3)) - right_overflow;
-    int bh = (1 << (mi_height_log2(bs) + 3)) - bottom_overflow;
+    const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow;
+    const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow;
     int avg;
     variance(x->plane[0].src.buf, x->plane[0].src.stride,
             vp9_64_zeros, 0, bw, bh, &sse, &avg);
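Note: in the vp9_vaq.c hunk, `8 * num_8x8_blocks_wide_lookup[bs]` and `1 << (mi_width_log2(bs) + 3)` compute the same pixel width; the lookup form avoids the function call and lets bw/bh be const. A standalone check of the equivalence for the square block sizes (the table here mirrors the lookup values for this test only; it is not the library's array):

    #include <assert.h>

    int main(void) {
      /* BLOCK_8X8, 16X16, 32X32, 64X64 */
      static const int num_8x8_wide[4] = { 1, 2, 4, 8 };
      static const int mi_width_log2[4] = { 0, 1, 2, 3 };
      int bs;
      for (bs = 0; bs < 4; ++bs)
        assert(8 * num_8x8_wide[bs] == (1 << (mi_width_log2[bs] + 3)));
      return 0;
    }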
diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2.c b/vp9/encoder/x86/vp9_dct32x32_sse2.c
index de47a5bf1..2d59775ce 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -29,7 +29,7 @@ static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
 }
 #endif
 
-void FDCT32x32_2D(int16_t *input,
+void FDCT32x32_2D(const int16_t *input,
                   int16_t *output_org, int stride) {
   // Calculate pre-multiplied strides
   const int str1 = stride;
@@ -93,13 +93,13 @@ void FDCT32x32_2D(int16_t *input,
    // Note: even though all the loads below are aligned, using the aligned
    //       intrinsic make the code slightly slower.
    if (0 == pass) {
-      int16_t *in = &input[column_start];
+      const int16_t *in = &input[column_start];
      // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
      // Note: the next four blocks could be in a loop. That would help the
      //       instruction cache but is actually slower.
      {
-        int16_t *ina = in + 0 * str1;
-        int16_t *inb = in + 31 * str1;
+        const int16_t *ina = in + 0 * str1;
+        const int16_t *inb = in + 31 * str1;
        __m128i *step1a = &step1[ 0];
        __m128i *step1b = &step1[31];
        const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
@@ -128,8 +128,8 @@ void FDCT32x32_2D(int16_t *input,
        step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
      }
      {
-        int16_t *ina = in + 4 * str1;
-        int16_t *inb = in + 27 * str1;
+        const int16_t *ina = in + 4 * str1;
+        const int16_t *inb = in + 27 * str1;
        __m128i *step1a = &step1[ 4];
        __m128i *step1b = &step1[27];
        const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
@@ -158,8 +158,8 @@ void FDCT32x32_2D(int16_t *input,
        step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
      }
      {
-        int16_t *ina = in + 8 * str1;
-        int16_t *inb = in + 23 * str1;
+        const int16_t *ina = in + 8 * str1;
+        const int16_t *inb = in + 23 * str1;
        __m128i *step1a = &step1[ 8];
        __m128i *step1b = &step1[23];
        const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
@@ -188,8 +188,8 @@ void FDCT32x32_2D(int16_t *input,
        step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
      }
      {
-        int16_t *ina = in + 12 * str1;
-        int16_t *inb = in + 19 * str1;
+        const int16_t *ina = in + 12 * str1;
+        const int16_t *inb = in + 19 * str1;
        __m128i *step1a = &step1[12];
        __m128i *step1b = &step1[19];
        const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index fa60e80eb..dc115018e 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -12,7 +12,7 @@
 #include "vp9/common/vp9_idct.h"  // for cospi constants
 #include "vpx_ports/mem.h"
 
-void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int stride) {
+void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
@@ -111,7 +111,8 @@ void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int stride) {
   }
 }
 
-static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) {
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+                                   int stride) {
   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
   const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
   __m128i mask;
@@ -242,7 +243,7 @@ void fadst4_1d_sse2(__m128i *in) {
   transpose_4x4(in);
 }
 
-void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
                            int stride, int tx_type) {
   __m128i in[4];
   load_buffer_4x4(input, in, stride);
@@ -270,7 +271,7 @@ void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
   write_buffer_4x4(output, in);
 }
 
-void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int stride) {
+void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
   int pass;
   // Constants
   //    When we use them, in one case, they are all the same. In all others
@@ -527,15 +528,16 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int stride) {
 }
 
 // load 8x8 array
-static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) {
-  in[0] = _mm_load_si128((__m128i *)(input + 0 * stride));
-  in[1] = _mm_load_si128((__m128i *)(input + 1 * stride));
-  in[2] = _mm_load_si128((__m128i *)(input + 2 * stride));
-  in[3] = _mm_load_si128((__m128i *)(input + 3 * stride));
-  in[4] = _mm_load_si128((__m128i *)(input + 4 * stride));
-  in[5] = _mm_load_si128((__m128i *)(input + 5 * stride));
-  in[6] = _mm_load_si128((__m128i *)(input + 6 * stride));
-  in[7] = _mm_load_si128((__m128i *)(input + 7 * stride));
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+                                   int stride) {
+  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
 
   in[0] = _mm_slli_epi16(in[0], 2);
   in[1] = _mm_slli_epi16(in[1], 2);
@@ -1025,7 +1027,7 @@ void fadst8_1d_sse2(__m128i *in) {
   array_transpose_8x8(in, in);
 }
 
-void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
                            int stride, int tx_type) {
   __m128i in[8];
   load_buffer_8x8(input, in, stride);
@@ -1054,7 +1056,7 @@ void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
   write_buffer_8x8(output, in, 8);
 }
 
-void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) {
+void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
@@ -1064,7 +1066,7 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) {
   int pass;
   // We need an intermediate buffer between passes.
   DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
-  int16_t *in = input;
+  const int16_t *in = input;
   int16_t *out = intermediate;
   // Constants
   //    When we use them, in one case, they are all the same. In all others
@@ -1679,7 +1681,7 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) {
   }
 }
 
-static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0,
+static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
                                      __m128i *in1, int stride) {
   // load first 8 columns
   load_buffer_8x8(input, in0, stride);
@@ -2531,7 +2533,7 @@ void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16(in0, in1);
 }
 
-void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
                              int stride, int tx_type) {
   __m128i in0[16], in1[16];
   load_buffer_16x16(input, in0, in1, stride);
@@ -2563,13 +2565,13 @@ void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
   write_buffer_16x16(output, in0, in1, 16);
 }
 
-#define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2
+#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
 #define FDCT32x32_HIGH_PRECISION 0
 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
 #undef FDCT32x32_2D
 #undef FDCT32x32_HIGH_PRECISION
 
-#define FDCT32x32_2D vp9_short_fdct32x32_sse2
+#define FDCT32x32_2D vp9_fdct32x32_sse2
 #define FDCT32x32_HIGH_PRECISION 1
 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"  // NOLINT
 #undef FDCT32x32_2D
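Note: the #define/#include/#undef block closing vp9_dct_sse2.c compiles vp9_dct32x32_sse2.c twice, once per precision setting, so one source file yields both vp9_fdct32x32_sse2 and vp9_fdct32x32_rd_sse2. A generic illustration of that macro-templating idiom (hypothetical file and function names):

    /* kernel.inc -- compiled once per inclusion; the includer defines
     * KERNEL_NAME and KERNEL_HIGH_PRECISION before each #include. */
    void KERNEL_NAME(const short *in, short *out, int n) {
      int i;
      for (i = 0; i < n; ++i)
    #if KERNEL_HIGH_PRECISION
        out[i] = in[i];       /* full-precision variant */
    #else
        out[i] = in[i] / 2;   /* reduced-precision (rd) variant */
    #endif
    }

    /* kernels.c -- instantiate both variants from the one template. */
    #define KERNEL_NAME fdct_rd
    #define KERNEL_HIGH_PRECISION 0
    #include "kernel.inc"
    #undef KERNEL_NAME
    #undef KERNEL_HIGH_PRECISION

    #define KERNEL_NAME fdct
    #define KERNEL_HIGH_PRECISION 1
    #include "kernel.inc"
    #undef KERNEL_NAME
    #undef KERNEL_HIGH_PRECISION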
+ return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t vp9e_set_activemap(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { - vpx_active_map_t *data = va_arg(args, vpx_active_map_t *); - - if (data) { - vpx_active_map_t *map = (vpx_active_map_t *)data; - - if (!vp9_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols)) - return VPX_CODEC_OK; - else - return VPX_CODEC_INVALID_PARAM; - } else { - return VPX_CODEC_INVALID_PARAM; - } + // TODO(yaowu): Need to re-implement and test for VP9. + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx, @@ -1014,8 +993,9 @@ static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx, if (data) { int res; vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data; - res = vp9_set_internal_size(ctx->cpi, scalemode.h_scaling_mode, - scalemode.v_scaling_mode); + res = vp9_set_internal_size(ctx->cpi, + (VPX_SCALING)scalemode.h_scaling_mode, + (VPX_SCALING)scalemode.v_scaling_mode); if (!res) { return VPX_CODEC_OK; diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 6b923162f..5dacab454 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -172,9 +172,9 @@ static vpx_codec_err_t vp9_peek_si(const uint8_t *data, rb.bit_offset += 1; // show frame rb.bit_offset += 1; // error resilient - if (vp9_rb_read_literal(&rb, 8) != SYNC_CODE_0 || - vp9_rb_read_literal(&rb, 8) != SYNC_CODE_1 || - vp9_rb_read_literal(&rb, 8) != SYNC_CODE_2) { + if (vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_0 || + vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_1 || + vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_2) { return VPX_CODEC_UNSUP_BITSTREAM; } diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index b454eee02..0993c6ce6 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -20,6 +20,7 @@ VP9_CX_SRCS-yes += vp9_cx_iface.c VP9_CX_SRCS-yes += encoder/vp9_bitstream.c VP9_CX_SRCS-yes += encoder/vp9_boolhuff.c VP9_CX_SRCS-yes += encoder/vp9_dct.c +VP9_CX_SRCS-yes += encoder/vp9_dct.h VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h VP9_CX_SRCS-yes += encoder/vp9_encodeintra.c |