Diffstat (limited to 'vp9/common')
-rw-r--r-- | vp9/common/mips/dspr2/vp9_common_dspr2.h | 3
-rw-r--r-- | vp9/common/mips/dspr2/vp9_itrans16_dspr2.c | 1315
-rw-r--r-- | vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c | 1073
-rw-r--r-- | vp9/common/mips/dspr2/vp9_itrans32_dspr2.c | 1013
-rw-r--r-- | vp9/common/mips/dspr2/vp9_itrans4_dspr2.c | 438
-rw-r--r-- | vp9/common/mips/dspr2/vp9_itrans8_dspr2.c | 745
-rw-r--r-- | vp9/common/vp9_enums.h | 11
-rw-r--r-- | vp9/common/vp9_idct.c | 28
-rw-r--r-- | vp9/common/vp9_onyxc_int.h | 9
-rw-r--r-- | vp9/common/vp9_rtcd_defs.sh | 29
-rw-r--r-- | vp9/common/x86/vp9_idct_intrin_sse2.c | 1037
11 files changed, 5351 insertions, 350 deletions
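
For orientation before the patch body: every DSPr2 kernel below uses the same fixed-point idiom — preload the accumulator lo half with 8192 (const_2_power_13), multiply-accumulate Q14 cosine constants with madd/msub, then extract with "extp ..., 31", which with the "wrdsp pos = 45" setup pulls accumulator bits [45:14], effectively a rounded arithmetic shift right by 14. The plain-C sketch that follows models that idiom and the even-half butterfly computed by the first asm block of idct16_1d_rows_dspr2 (inputs 0/8/4/12 producing step1_0..step1_3). dct_const_round_shift, ROUND_POWER_OF_TWO, and the cospi constants mirror vp9/common/vp9_idct.h; idct4_1d_ref is an illustrative reference name, not code from this patch.

#include <stdint.h>

#define DCT_CONST_BITS     14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))  /* 8192 */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Q14 cosine constants from vp9/common/vp9_idct.h */
static const int cospi_8_64  = 15137;   /* 16384 * cos( 8*pi/64) */
static const int cospi_16_64 = 11585;   /* 16384 * cos(16*pi/64) */
static const int cospi_24_64 = 6270;    /* 16384 * cos(24*pi/64) */

/* The "mtlo 8192 / madd / extp ..., 31" sequence: multiply by a Q14
 * constant, add the rounding bias preloaded into the accumulator, and
 * shift the product back down by DCT_CONST_BITS. */
static int32_t dct_const_round_shift(int64_t input) {
  return (int32_t)((input + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}

/* Even-half butterfly of idct16_1d_rows_dspr2's first asm block; the same
 * arithmetic forms the 4-point IDCT in vp9_itrans4_dspr2.c.  Hypothetical
 * reference, kept in plain C for readability. */
static void idct4_1d_ref(const int16_t *input, int16_t *output) {
  int32_t step0, step1, step2, step3;

  step0 = dct_const_round_shift((input[0] + input[2]) * cospi_16_64);
  step1 = dct_const_round_shift((input[0] - input[2]) * cospi_16_64);
  step2 = dct_const_round_shift(input[1] * cospi_24_64 -
                                input[3] * cospi_8_64);
  step3 = dct_const_round_shift(input[1] * cospi_8_64 +
                                input[3] * cospi_24_64);

  output[0] = (int16_t)(step0 + step3);   /* mirrors step1_0 = step2_0 + step2_3 */
  output[1] = (int16_t)(step1 + step2);
  output[2] = (int16_t)(step1 - step2);
  output[3] = (int16_t)(step0 - step3);
}

The column-pass kernels then finish each output with the add-and-clip visible in the asm: "addi 32; sra 6" is ROUND_POWER_OF_TWO(x, 6), and the lbux lookup through vp9_ff_cropTbl clamps the sum with the reconstructed pixel to [0, 255], matching the clip_pixel() calls in the C fallback paths.
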
diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h index dc88f1603..644264f65 100644 --- a/vp9/common/mips/dspr2/vp9_common_dspr2.h +++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h @@ -81,6 +81,9 @@ static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) { ); } +void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); + void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, diff --git a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c new file mode 100644 index 000000000..1b2f5506a --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c @@ -0,0 +1,1315 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { + int i; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int step1_10, step1_11, step1_12, step1_13; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + + for (i = no_rows; i--; ) { + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 16)); + + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), + [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" 
(step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], %[result4], %[result3] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd $ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), + [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, 
%[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add %[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) \n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_28_64] \n\t" + "msub $ac1, %[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add %[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd 
$ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp %[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), + [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) + : [const_2_power_13] "r" (const_2_power_13), + [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), + [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), + [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), + [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "add %[load5], %[step1_0], %[step1_7] \n\t" + "add %[load5], %[load5], %[step2_12] \n\t" + "add %[load5], %[load5], %[step2_15] \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step2_13] \n\t" + "add %[load6], %[load6], %[step2_14] \n\t" + "sh %[load5], 0(%[output]) \n\t" + "sh %[load6], 32(%[output]) \n\t" + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "add %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + "add %[load6], %[load6], %[step2_11] \n\t" + "sh %[load5], 192(%[output]) \n\t" + "sh %[load6], 224(%[output]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "sub %[load5], %[load5], %[step2_11] \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], %[load6], %[step2_9] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "sh %[load5], 256(%[output]) \n\t" + "sh %[load6], 288(%[output]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub %[load5], %[load5], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_14] \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_15] \n\t" + "sh %[load5], 448(%[output]) \n\t" + "sh %[load6], 480(%[output]) \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6) + : [output] "r" (output), + [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), + [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), + [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), + [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), + [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), + [step2_14] "r" (step2_14), [step2_15] "r" (step2_15) + ); + + __asm__ __volatile__ ( + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sh %[load5], 64(%[output]) \n\t" + "sh %[load6], 96(%[output]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sh %[load5], 128(%[output]) \n\t" + "sh %[load6], 160(%[output]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" 
+ "sub %[load5], %[load5], %[step1_10] \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sh %[load5], 320(%[output]) \n\t" + "sh %[load6], 352(%[output]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sh %[load5], 384(%[output]) \n\t" + "sh %[load6], 416(%[output]) \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6) + : [output] "r" (output), + [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), + [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), + [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), + [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) + ); + + input += 16; + output += 1; + } +} + +static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int i; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int step1_8, step1_9, step1_10, step1_11; + int step1_12, step1_13, step1_14, step1_15; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + uint8_t *dest_pix; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 16; ++i) { + dest_pix = (dest + i); + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), + [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 
2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], %[result4], %[result3] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd $ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), + [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add 
%[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) \n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_28_64] \n\t" + "msub $ac1, %[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add %[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd $ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], 
%[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp %[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), + [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) + : [const_2_power_13] "r" (const_2_power_13), + [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), + [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), + [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), + [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), + [cospi_16_64] "r" (cospi_16_64) + ); + + step1_8 = step2_8 + step2_11; + step1_9 = step2_9 + step2_10; + step1_14 = step2_13 + step2_14; + step1_15 = step2_12 + step2_15; + + __asm__ __volatile__ ( + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_0], %[step1_7] \n\t" + "add %[load5], %[load5], %[step1_15] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step1_14] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[load5], %[step1_9] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], 
%[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step1_8] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step1_8] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], %[load6], %[step1_9] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" + "sub %[load5], %[load5], %[step1_10] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub %[load5], %[load5], %[step1_14] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step1_15] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" 
(load7), + [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), + [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), + [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), + [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), + [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), + [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), + [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), + [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) + ); + + input += 16; + } +} + +void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // First transform rows + idct16_1d_rows_dspr2(input, out, 16); + + // Then transform columns and add to dest + idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); +} + +static void iadst16_1d(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; + + int x0 = input[15]; + int x1 = input[0]; + int x2 = input[13]; + int x3 = input[2]; + int x4 = input[11]; + int x5 = input[4]; + int x6 = input[9]; + int x7 = input[6]; + int x8 = input[7]; + int x9 = input[8]; + int x10 = input[5]; + int x11 = input[10]; + int x12 = input[3]; + int x13 = input[12]; + int x14 = input[1]; + int x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 + | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] + = output[5] = output[6] = output[7] = output[8] + = output[9] = output[10] = output[11] = output[12] + = output[13] = output[14] = output[15] = 0; + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = dct_const_round_shift(s0 + s8); + x1 = dct_const_round_shift(s1 + s9); + x2 = dct_const_round_shift(s2 + s10); + x3 = dct_const_round_shift(s3 + s11); + x4 = dct_const_round_shift(s4 + s12); + x5 = dct_const_round_shift(s5 + s13); + x6 = dct_const_round_shift(s6 + s14); + x7 = dct_const_round_shift(s7 + s15); + x8 = dct_const_round_shift(s0 - s8); + x9 = dct_const_round_shift(s1 - s9); + x10 = dct_const_round_shift(s2 - s10); + x11 = dct_const_round_shift(s3 - s11); + x12 = dct_const_round_shift(s4 - s12); + x13 = dct_const_round_shift(s5 - s13); + x14 = dct_const_round_shift(s6 - s14); + x15 = dct_const_round_shift(s7 - s15); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 
* cospi_4_64 + x13 * cospi_28_64; + s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = s0 + s4; + x1 = s1 + s5; + x2 = s2 + s6; + x3 = s3 + s7; + x4 = s0 - s4; + x5 = s1 - s5; + x6 = s2 - s6; + x7 = s3 - s7; + x8 = dct_const_round_shift(s8 + s12); + x9 = dct_const_round_shift(s9 + s13); + x10 = dct_const_round_shift(s10 + s14); + x11 = dct_const_round_shift(s11 + s15); + x12 = dct_const_round_shift(s8 - s12); + x13 = dct_const_round_shift(s9 - s13); + x14 = dct_const_round_shift(s10 - s14); + x15 = dct_const_round_shift(s11 - s15); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = dct_const_round_shift(s4 - s6); + x7 = dct_const_round_shift(s5 - s7); + x8 = s8 + s10; + x9 = s9 + s11; + x10 = s8 - s10; + x11 = s9 - s11; + x12 = dct_const_round_shift(s12 + s14); + x13 = dct_const_round_shift(s13 + s15); + x14 = dct_const_round_shift(s12 - s14); + x15 = dct_const_round_shift(s13 - s15); + + // stage 4 + s2 = (- cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (- x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (- x10 + x11); + s14 = (- cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = dct_const_round_shift(s2); + x3 = dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + x10 = dct_const_round_shift(s10); + x11 = dct_const_round_shift(s11); + x14 = dct_const_round_shift(s14); + x15 = dct_const_round_shift(s15); + + output[0] = x0; + output[1] = -x8; + output[2] = x12; + output[3] = -x4; + output[4] = x6; + output[5] = x14; + output[6] = x10; + output[7] = x2; + output[8] = x3; + output[9] = x11; + output[10] = x15; + output[11] = x7; + output[12] = x5; + output[13] = -x13; + output[14] = x9; + output[15] = -x1; +} + +void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, + int pitch, int tx_type) { + int i, j; + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + int16_t *outptr = out; + int16_t temp_out[16]; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + switch (tx_type) { + case DCT_DCT: // DCT in both horizontal and vertical + idct16_1d_rows_dspr2(input, outptr, 16); + idct16_1d_cols_add_blk_dspr2(out, dest, pitch); + break; + case ADST_DCT: // ADST in vertical, DCT in horizontal + idct16_1d_rows_dspr2(input, outptr, 16); + + outptr = out; + + for (i = 0; i < 16; ++i) { + iadst16_1d(outptr, temp_out); + + for (j = 0; j < 16; ++j) + dest[j * pitch + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * pitch + i]); + outptr += 16; + } + break; + case DCT_ADST: // DCT in vertical, ADST in horizontal + { + int16_t temp_in[16 * 16]; + + for (i = 0; i < 16; ++i) { + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 16)); + + iadst16_1d(input, outptr); + input += 16; + outptr += 16; + } + + for (i = 0; i < 16; 
++i) + for (j = 0; j < 16; ++j) + temp_in[j * 16 + i] = out[i * 16 + j]; + + idct16_1d_cols_add_blk_dspr2(temp_in, dest, pitch); + } + break; + case ADST_ADST: // ADST in both directions + { + int16_t temp_in[16]; + + for (i = 0; i < 16; ++i) { + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 16)); + + iadst16_1d(input, outptr); + input += 16; + outptr += 16; + } + + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + iadst16_1d(temp_in, temp_out); + for (j = 0; j < 16; ++j) + dest[j * pitch + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * pitch + i]); + } + } + break; + default: + printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); + break; + } +} + +void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + int16_t *outptr = out; + uint32_t i; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. + idct16_1d_rows_dspr2(input, outptr, 4); + + outptr += 4; + for (i = 0; i < 6; ++i) { + __asm__ __volatile__ ( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 64(%[outptr]) \n\t" + "sw $zero, 96(%[outptr]) \n\t" + "sw $zero, 128(%[outptr]) \n\t" + "sw $zero, 160(%[outptr]) \n\t" + "sw $zero, 192(%[outptr]) \n\t" + "sw $zero, 224(%[outptr]) \n\t" + "sw $zero, 256(%[outptr]) \n\t" + "sw $zero, 288(%[outptr]) \n\t" + "sw $zero, 320(%[outptr]) \n\t" + "sw $zero, 352(%[outptr]) \n\t" + "sw $zero, 384(%[outptr]) \n\t" + "sw $zero, 416(%[outptr]) \n\t" + "sw $zero, 448(%[outptr]) \n\t" + "sw $zero, 480(%[outptr]) \n\t" + + : + : [outptr] "r" (outptr) + ); + + outptr += 2; + } + + // Then transform columns + idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + uint32_t pos = 45; + int32_t out; + int32_t r; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + + : + : [pos] "r" (pos) + ); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__ ( + "addi %[out], %[out], 32 \n\t" + "sra %[a1], %[out], 6 \n\t" + + : [out] "+r" (out), [a1] "=r" (a1) + : + ); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 16; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + 
[vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 16; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c new file mode 100644 index 000000000..5e92db3d2 --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c @@ -0,0 +1,1073 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; + int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26; + int16_t step1_27, step1_28, step1_29, step1_30, step1_31; + int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int16_t step2_28, step2_29, step2_30, step2_31; + int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27; + int16_t step3_28, step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int i, temp21; + uint8_t *dest_pix, *dest_pix1; + const int const_2_power_13 = 8192; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 32; ++i) { + dest_pix = dest + i; + dest_pix1 = dest + i + 31 * dest_stride; + + __asm__ __volatile__ ( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) \n\t" + "lh %[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), 
[load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), + [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), + [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, %[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd $ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), + [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), + [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 
\n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), + [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), + [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], %[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), + [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), + [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], 
%[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), + [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : 
[const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), + [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), + [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), + [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) + : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8), + [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), + [step2_11] "r" (step2_11), [step2_12] "r" (step2_12), + [step2_13] "r" (step2_13), [step2_14] "r" (step2_14), + [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64) + ); + + step2_18 = step1_17 - step1_18; + step2_29 = step1_30 - step1_29; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + + : [step3_18] "=r" (step3_18) + : [const_2_power_13] "r" (const_2_power_13), + [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; + step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_19 = step1_16 - step1_19; + step2_28 = step1_31 - step1_28; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + + : [step3_19] "=r" (step3_19) + : [const_2_power_13] "r" (const_2_power_13), + [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; + step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_16 = step1_16 + step1_19; + step3_17 = step1_17 + step1_18; + step3_30 = step1_29 
+ step1_30; + step3_31 = step1_28 + step1_31; + + step2_20 = step1_23 - step1_20; + step2_27 = step1_24 - step1_27; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" + "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac0, 31 \n\t" + + : [step3_20] "=r" (step3_20) + : [const_2_power_13] "r" (const_2_power_13), + [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; + step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_21 = step1_22 - step1_21; + step2_26 = step1_25 - step1_26; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" + "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac1, 31 \n\t" + + : [step3_21] "=r" (step3_21) + : [const_2_power_13] "r" (const_2_power_13), + [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; + step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_22 = step1_21 + step1_22; + step3_23 = step1_20 + step1_23; + step3_24 = step1_24 + step1_27; + step3_25 = step1_25 + step1_26; + + step2_16 = step3_16 + step3_23; + step2_17 = step3_17 + step3_22; + step2_18 = step3_18 + step3_21; + step2_19 = step3_19 + step3_20; + step2_20 = step3_19 - step3_20; + step2_21 = step3_18 - step3_21; + step2_22 = step3_17 - step3_22; + step2_23 = step3_16 - step3_23; + + step2_24 = step3_31 - step3_24; + step2_25 = step3_30 - step3_25; + step2_26 = step3_29 - step3_26; + step2_27 = step3_28 - step3_27; + step2_28 = step3_28 + step3_27; + step2_29 = step3_29 + step3_26; + step2_30 = step3_30 + step3_25; + step2_31 = step3_31 + step3_24; + + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_24_64] "r" (cospi_24_64), 
[cospi_8_64] "r" (cospi_8_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + step2_0 = step1_0 + step1_7; + step2_1 = step1_1 + step1_6; + step2_2 = step1_2 + step1_5; + step2_3 = step1_3 + step1_4; + step2_4 = step1_3 - step1_4; + step2_5 = step1_2 - step1_5; + step2_6 = step1_1 - step1_6; + step2_7 = step1_0 - step1_7; + + // stage 7 + step1_0 = step2_0 + step3_15; + step1_1 = step2_1 + step3_14; + step1_2 = step2_2 + step3_13; + step1_3 = step2_3 + step3_12; + step1_4 = step2_4 + step3_11; + step1_5 = step2_5 + step3_10; + step1_6 = step2_6 + step3_9; + step1_7 = step2_7 + step3_8; + step1_8 = step2_7 - step3_8; + step1_9 = step2_6 - step3_9; + step1_10 = step2_5 - step3_10; + step1_11 = step2_4 - step3_11; + step1_12 = step2_3 - step3_12; + step1_13 = step2_2 - step3_13; + step1_14 = step2_1 - step3_14; + step1_15 = step2_0 - step3_15; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) + : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20), + [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_20 + step2_27) * cospi_16_64; + step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_26], 
%[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_21], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) + : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26), + [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_21 + step2_26) * cospi_16_64; + step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) + : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25), + [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_22 + step2_25) * cospi_16_64; + step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_23], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) + : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24), + [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_23 + step2_24) * cospi_16_64; + step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_0], %[step2_31] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_1], %[step2_30] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_2], %[step2_29] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_3], %[step2_28] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), + [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), + [step2_28] "r" (step2_28), [step2_29] "r" (step2_29), + [step2_30] "r" (step2_30), [step2_31] "r" (step2_31) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6); + + __asm__ __volatile__ ( + "lbu 
%[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_4], %[step1_27] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_5], %[step1_26] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_6], %[step1_25] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_7], %[step1_24] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), + [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), + [step1_24] "r" (step1_24), [step1_25] "r" (step1_25), + [step1_26] "r" (step1_26), [step1_27] "r" (step1_27) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 
0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_8], %[step1_23] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_9], %[step1_22] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_10], %[step1_21] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_11], %[step1_20] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), + [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), + [step1_20] "r" (step1_20), [step1_21] "r" (step1_21), + [step1_22] "r" (step1_22), [step1_23] "r" (step1_23) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], 
%[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_12], %[step2_19] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_13], %[step2_18] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_14], %[step2_17] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_15], %[step2_16] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), + [step1_14] "r" (step1_14), [step1_15] "r" (step1_15), + [step2_16] "r" (step2_16), [step2_17] "r" (step2_17), + [step2_18] "r" (step2_18), [step2_19] "r" (step2_19) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + 
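/* Scalar form of the store blocks above: each lbu/addi/sra/add/lbux/sb
   sequence reconstructs one output pixel as

     sum = step1_x + step2_y;
     dest[0] = clip(dest[0] + ROUND_POWER_OF_TWO(sum, 6));

   where the addi 32 / sra 6 pair computes ROUND_POWER_OF_TWO(sum, 6)
   (the final output shift of the 32x32 inverse transform) and clip()
   is the [0, 255] clamp done by the lbux lookup into vp9_ff_cropTbl.
   The mirrored walk back up the column (dest_pix1) adds the
   precomputed ROUND_POWER_OF_TWO(step1_x - step2_y, 6) values the
   same way. */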
+ input += 32; + } +} +#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c new file mode 100644 index 000000000..d3aee73cb --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c @@ -0,0 +1,1013 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) { + int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int16_t step1_28, step1_29, step1_30, step1_31; + int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int16_t step2_28, step2_29, step2_30, step2_31; + int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int16_t step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int temp21; + int i; + const int const_2_power_13 = 8192; + const int32_t *input_int; + + for (i = 32; i--; ) { + input_int = (const int32_t *)input; + + if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | + input_int[4] | input_int[5] | input_int[6] | input_int[7] | + input_int[8] | input_int[9] | input_int[10] | input_int[11] | + input_int[12] | input_int[13] | input_int[14] | input_int[15])) { + input += 32; + + __asm__ __volatile__ ( + "sh $zero, 0(%[output]) \n\t" + "sh $zero, 64(%[output]) \n\t" + "sh $zero, 128(%[output]) \n\t" + "sh $zero, 192(%[output]) \n\t" + "sh $zero, 256(%[output]) \n\t" + "sh $zero, 320(%[output]) \n\t" + "sh $zero, 384(%[output]) \n\t" + "sh $zero, 448(%[output]) \n\t" + "sh $zero, 512(%[output]) \n\t" + "sh $zero, 576(%[output]) \n\t" + "sh $zero, 640(%[output]) \n\t" + "sh $zero, 704(%[output]) \n\t" + "sh $zero, 768(%[output]) \n\t" + "sh $zero, 832(%[output]) \n\t" + "sh $zero, 896(%[output]) \n\t" + "sh $zero, 960(%[output]) \n\t" + "sh $zero, 1024(%[output]) \n\t" + "sh $zero, 1088(%[output]) \n\t" + "sh $zero, 1152(%[output]) \n\t" + "sh $zero, 1216(%[output]) \n\t" + "sh $zero, 1280(%[output]) \n\t" + "sh $zero, 1344(%[output]) \n\t" + "sh $zero, 1408(%[output]) \n\t" + "sh $zero, 1472(%[output]) \n\t" + "sh $zero, 1536(%[output]) \n\t" + "sh $zero, 1600(%[output]) \n\t" + "sh $zero, 1664(%[output]) \n\t" + "sh $zero, 1728(%[output]) \n\t" + 
"sh $zero, 1792(%[output]) \n\t" + "sh $zero, 1856(%[output]) \n\t" + "sh $zero, 1920(%[output]) \n\t" + "sh $zero, 1984(%[output]) \n\t" + + : + : [output] "r" (output) + ); + + output += 1; + + continue; + } + + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 32)); + vp9_prefetch_load((const uint8_t *)(input + 48)); + + __asm__ __volatile__ ( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) \n\t" + "lh %[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), + [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), + [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, %[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd $ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], 
%[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), + [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), + [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), + [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), + [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp 
%[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], %[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), + [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), + [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), + [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" 
(cospi_18_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, %[temp0], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] 
"=&r" (temp0), [temp1] "=&r" (temp1), + [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), + [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), + [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), + [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) + : [const_2_power_13] "r" (const_2_power_13), + [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), + [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), + [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), + [step2_14] "r" (step2_14), [step2_15] "r" (step2_15), + [cospi_16_64] "r" (cospi_16_64) + ); + + step2_18 = step1_17 - step1_18; + step2_29 = step1_30 - step1_29; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + + : [step3_18] "=r" (step3_18) + : [const_2_power_13] "r" (const_2_power_13), + [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; + step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_19 = step1_16 - step1_19; + step2_28 = step1_31 - step1_28; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + + : [step3_19] "=r" (step3_19) + : [const_2_power_13] "r" (const_2_power_13), + [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; + step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_16 = step1_16 + step1_19; + step3_17 = step1_17 + step1_18; + step3_30 = step1_29 + step1_30; + step3_31 = step1_28 + step1_31; + + step2_20 = step1_23 - step1_20; + step2_27 = step1_24 - step1_27; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" + "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac0, 31 \n\t" + + : [step3_20] "=r" (step3_20) + : [const_2_power_13] "r" (const_2_power_13), + [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; + step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_21 = step1_22 - step1_21; + step2_26 = step1_25 - step1_26; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" + "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac1, 31 \n\t" + + : [step3_21] "=r" (step3_21) + : [const_2_power_13] "r" (const_2_power_13), + [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; + step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_22 = step1_21 + step1_22; + step3_23 = step1_20 + step1_23; + step3_24 = step1_24 + step1_27; + step3_25 = step1_25 + step1_26; + + step2_16 = step3_16 + step3_23; + step2_17 = step3_17 + step3_22; + step2_18 = step3_18 + step3_21; + step2_19 = step3_19 + step3_20; + step2_20 = step3_19 - step3_20; + step2_21 = step3_18 - step3_21; + 
step2_22 = step3_17 - step3_22; + step2_23 = step3_16 - step3_23; + + step2_24 = step3_31 - step3_24; + step2_25 = step3_30 - step3_25; + step2_26 = step3_29 - step3_26; + step2_27 = step3_28 - step3_27; + step2_28 = step3_28 + step3_27; + step2_29 = step3_29 + step3_26; + step2_30 = step3_30 + step3_25; + step2_31 = step3_31 + step3_24; + + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_16_64] "r" (cospi_16_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + + ); + + __asm__ __volatile__ ( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" 
+ + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + step2_0 = step1_0 + step1_7; + step2_1 = step1_1 + step1_6; + step2_2 = step1_2 + step1_5; + step2_3 = step1_3 + step1_4; + step2_4 = step1_3 - step1_4; + step2_5 = step1_2 - step1_5; + step2_6 = step1_1 - step1_6; + step2_7 = step1_0 - step1_7; + + step1_0 = step2_0 + step3_15; + step1_1 = step2_1 + step3_14; + step1_2 = step2_2 + step3_13; + step1_3 = step2_3 + step3_12; + step1_4 = step2_4 + step3_11; + step1_5 = step2_5 + step3_10; + step1_6 = step2_6 + step3_9; + step1_7 = step2_7 + step3_8; + step1_8 = step2_7 - step3_8; + step1_9 = step2_6 - step3_9; + step1_10 = step2_5 - step3_10; + step1_11 = step2_4 - step3_11; + step1_12 = step2_3 - step3_12; + step1_13 = step2_2 - step3_13; + step1_14 = step2_1 - step3_14; + step1_15 = step2_0 - step3_15; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) + : [const_2_power_13] "r" (const_2_power_13), + [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_20 + step2_27) * cospi_16_64; + step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_21], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) + : [const_2_power_13] "r" (const_2_power_13), + [step2_26] "r" (step2_26), [step2_21] "r" (step2_21), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_21 + step2_26) * cospi_16_64; + step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) + : [const_2_power_13] "r" (const_2_power_13), + [step2_25] "r" (step2_25), [step2_22] "r" (step2_22), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_22 + step2_25) * cospi_16_64; + step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_23], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) + : [const_2_power_13] "r" (const_2_power_13), + [step2_24] "r" (step2_24), [step2_23] "r" (step2_23), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_23 + step2_24) * cospi_16_64; + step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + // final stage + output[0 * 32] = step1_0 + step2_31; + output[1 * 32] = step1_1 + step2_30; + output[2 * 32] = 
step1_2 + step2_29;
+ output[3 * 32] = step1_3 + step2_28;
+ output[4 * 32] = step1_4 + step1_27;
+ output[5 * 32] = step1_5 + step1_26;
+ output[6 * 32] = step1_6 + step1_25;
+ output[7 * 32] = step1_7 + step1_24;
+ output[8 * 32] = step1_8 + step1_23;
+ output[9 * 32] = step1_9 + step1_22;
+ output[10 * 32] = step1_10 + step1_21;
+ output[11 * 32] = step1_11 + step1_20;
+ output[12 * 32] = step1_12 + step2_19;
+ output[13 * 32] = step1_13 + step2_18;
+ output[14 * 32] = step1_14 + step2_17;
+ output[15 * 32] = step1_15 + step2_16;
+ output[16 * 32] = step1_15 - step2_16;
+ output[17 * 32] = step1_14 - step2_17;
+ output[18 * 32] = step1_13 - step2_18;
+ output[19 * 32] = step1_12 - step2_19;
+ output[20 * 32] = step1_11 - step1_20;
+ output[21 * 32] = step1_10 - step1_21;
+ output[22 * 32] = step1_9 - step1_22;
+ output[23 * 32] = step1_8 - step1_23;
+ output[24 * 32] = step1_7 - step1_24;
+ output[25 * 32] = step1_6 - step1_25;
+ output[26 * 32] = step1_5 - step1_26;
+ output[27 * 32] = step1_4 - step1_27;
+ output[28 * 32] = step1_3 - step2_28;
+ output[29 * 32] = step1_2 - step2_29;
+ output[30 * 32] = step1_1 - step2_30;
+ output[31 * 32] = step1_0 - step2_31;
+
+ input += 32;
+ output += 1;
+ }
+}
+
+void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // Rows
+ idct32_1d_rows_dspr2(input, outptr);
+
+ // Columns
+ vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
+void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ int r, out;
+ int32_t a1, absa1;
+ int32_t vector_a1;
+ int32_t t1, t2, t3, t4;
+ int32_t vector_1, vector_2, vector_3, vector_4;
+ uint32_t pos = 45;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r" (pos)
+ );
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__ (
+ "addi %[out], %[out], 32 \n\t"
+ "sra %[a1], %[out], 6 \n\t"
+
+ : [out] "+r" (out), [a1] "=r" (a1)
+ :
+ );
+
+ if (a1 < 0) {
+ /* use quad-byte
+ * input and output memory are four byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 32; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+
+ "lw %[t1], 16(%[dest]) \n\t"
+ "lw %[t2], 20(%[dest]) \n\t"
+ "lw %[t3], 24(%[dest]) \n\t"
+ "lw %[t4], 28(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 16(%[dest]) \n\t"
+ "sw %[vector_2], 20(%[dest]) \n\t"
+ "sw %[vector_3], 24(%[dest]) \n\t"
+ "sw %[vector_4], 28(%[dest]) \n\t"
+
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r"
(t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [stride] "r" (stride), [vector_a1] "r" (vector_a1) + ); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 32; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [stride] "r" (stride), [vector_a1] "r" (vector_a1) + ); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c new file mode 100644 index 000000000..5b7aa5e71 --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) { + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; + const int const_2_power_13 = 8192; + int i; + + for (i = 4; i--; ) { + __asm__ __volatile__ ( + /* + temp_1 = (input[0] + input[2]) * cospi_16_64; + step_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[2]) * cospi_16_64; + step_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 4(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "extp %[step_0], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "extp %[step_1], $ac1, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + /* + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + step_2 = dct_const_round_shift(temp1); + */ + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "extp %[step_2], $ac0, 31 \n\t" + + /* + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step_3 = dct_const_round_shift(temp2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[step_3], $ac1, 31 \n\t" + + /* + output[0] = step_0 + step_3; + output[4] = step_1 + step_2; + output[8] = step_1 - step_2; + output[12] = step_0 - step_3; + */ + "add %[Temp0], %[step_0], %[step_3] \n\t" + "sh %[Temp0], 0(%[output]) \n\t" + + "add %[Temp1], %[step_1], %[step_2] \n\t" + "sh %[Temp1], 8(%[output]) \n\t" + + "sub %[Temp2], %[step_1], %[step_2] \n\t" + "sh %[Temp2], 16(%[output]) \n\t" + + "sub %[Temp3], %[step_0], %[step_3] \n\t" + "sh %[Temp3], 24(%[output]) \n\t" + + : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [step_0] "=&r" (step_0), [step_1] "=&r" (step_1), + [step_2] "=&r" (step_2), [step_3] "=&r" (step_3), + [output] "+r" (output) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), + [cospi_24_64] "r" (cospi_24_64), + [input] "r" (input) + ); + + input += 4; + output += 1; + } +} + +static void vp9_idct4_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; + const int const_2_power_13 = 8192; + int i; + uint8_t *dest_pix; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 4; ++i) { + dest_pix = (dest + i); + + __asm__ __volatile__ ( + /* + temp_1 = (input[0] + input[2]) * cospi_16_64; + step_0 = 
dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[2]) * cospi_16_64; + step_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 4(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "extp %[step_0], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "extp %[step_1], $ac1, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + /* + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + step_2 = dct_const_round_shift(temp1); + */ + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "extp %[step_2], $ac0, 31 \n\t" + + /* + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step_3 = dct_const_round_shift(temp2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[step_3], $ac1, 31 \n\t" + + /* + output[0] = step_0 + step_3; + output[4] = step_1 + step_2; + output[8] = step_1 - step_2; + output[12] = step_0 - step_3; + */ + "add %[Temp0], %[step_0], %[step_3] \n\t" + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step_1], %[step_2] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step_1], %[step_2] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step_0], %[step_3] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + + : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [step_0] "=&r" (step_0), [step_1] "=&r" (step_1), + [step_2] "=&r" (step_2), [step_3] "=&r" (step_3), + [dest_pix] "+r" (dest_pix) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), + [cospi_24_64] "r" (cospi_24_64), + [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) + ); + + input += 4; + } +} + +void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[4 * 4]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // Rows + vp9_idct4_1d_rows_dspr2(input, outptr); + + // Columns + vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); 
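+
+  /* Layout note: the row pass stores its results transposed -- each row's
+   * four outputs go to out[0], out[4], out[8], out[12] and the write
+   * pointer then advances by one element -- so every column of the 4x4
+   * intermediate ends up contiguous in out[], and the column pass can read
+   * four consecutive int16_t values per iteration instead of gathering
+   * with a stride. */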
+} + +void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + int a1, absa1; + int r; + int32_t out; + int t2, vector_a1, vector_a; + uint32_t pos = 45; + int16_t input_dc = input[0]; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + + : + : [pos] "r" (pos) + ); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc); + __asm__ __volatile__ ( + "addi %[out], %[out], 8 \n\t" + "sra %[a1], %[out], 4 \n\t" + + : [out] "+r" (out), [a1] "=r" (a1) + : + ); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 4; r--;) { + __asm__ __volatile__ ( + "lw %[t2], 0(%[dest]) \n\t" + "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "replv.qb %[vector_a1], %[a1] \n\t" + : [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 4; r--;) { + __asm__ __volatile__ ( + "lw %[t2], 0(%[dest]) \n\t" + "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } +} + +static void iadst4_1d_dspr2(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + int x0, x1, x2, x3; + + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = x0 - x2 + x3; + + x0 = s0 + s3 + s5; + x1 = s1 - s4 - s6; + x2 = sinpi_3_9 * s7; + x3 = s2; + + s0 = x0 + x3; + s1 = x1 + x3; + s2 = x2; + s3 = x0 + x1 - x3; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. 
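+  // (dct_const_round_shift() is ROUND_POWER_OF_TWO(x, DCT_CONST_BITS) with
+  // DCT_CONST_BITS == 14, so each 29b sum below comes back as
+  // 29 - 14 = 15 bits, which fits the int16_t output array.)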
+  output[0] = dct_const_round_shift(s0);
+  output[1] = dct_const_round_shift(s1);
+  output[2] = dct_const_round_shift(s2);
+  output[3] = dct_const_round_shift(s3);
+}
+
+void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride, int tx_type) {
+  int i, j;
+  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+  int16_t *outptr = out;
+  int16_t temp_in[4 * 4], temp_out[4];
+  uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  switch (tx_type) {
+    case DCT_DCT:   // DCT in both horizontal and vertical
+      vp9_idct4_1d_rows_dspr2(input, outptr);
+      vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+      break;
+    case ADST_DCT:  // ADST in vertical, DCT in horizontal
+      vp9_idct4_1d_rows_dspr2(input, outptr);
+
+      outptr = out;
+
+      for (i = 0; i < 4; ++i) {
+        iadst4_1d_dspr2(outptr, temp_out);
+
+        for (j = 0; j < 4; ++j)
+          dest[j * dest_stride + i] =
+                    clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                      + dest[j * dest_stride + i]);
+
+        outptr += 4;
+      }
+      break;
+    case DCT_ADST:  // DCT in vertical, ADST in horizontal
+      for (i = 0; i < 4; ++i) {
+        iadst4_1d_dspr2(input, outptr);
+        input  += 4;
+        outptr += 4;
+      }
+
+      for (i = 0; i < 4; ++i) {
+        for (j = 0; j < 4; ++j) {
+          temp_in[i * 4 + j] = out[j * 4 + i];
+        }
+      }
+      vp9_idct4_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+      break;
+    case ADST_ADST:  // ADST in both directions
+      for (i = 0; i < 4; ++i) {
+        iadst4_1d_dspr2(input, outptr);
+        input  += 4;
+        outptr += 4;
+      }
+
+      for (i = 0; i < 4; ++i) {
+        for (j = 0; j < 4; ++j)
+          temp_in[j] = out[j * 4 + i];
+        iadst4_1d_dspr2(temp_in, temp_out);
+
+        for (j = 0; j < 4; ++j)
+          dest[j * dest_stride + i] =
+                    clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                      + dest[j * dest_stride + i]);
+      }
+      break;
+    default:
+      printf("vp9_iht4x4_16_add_dspr2 : Invalid tx_type\n");
+      break;
+  }
+}
+#endif  // #if HAVE_DSPR2
diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
new file mode 100644
index 000000000..93a08401d
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -0,0 +1,745 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + const int const_2_power_13 = 8192; + int Temp0, Temp1, Temp2, Temp3, Temp4; + int i; + + for (i = no_rows; i--; ) { + __asm__ __volatile__ ( + /* + temp_1 = (input[0] + input[4]) * cospi_16_64; + step2_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[4]) * cospi_16_64; + step2_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 8(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "extp %[Temp4], $ac0, 31 \n\t" + + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + /* + temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; + step2_2 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 4(%[input]) \n\t" + "lh %[Temp1], 12(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "extp %[Temp3], $ac0, 31 \n\t" + + /* + step1_1 = step2_1 + step2_2; + step1_2 = step2_1 - step2_2; + */ + "add %[step1_1], %[Temp2], %[Temp3] \n\t" + "sub %[step1_2], %[Temp2], %[Temp3] \n\t" + + /* + temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; + step2_3 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + /* + step1_0 = step2_0 + step2_3; + step1_3 = step2_0 - step2_3; + */ + "add %[step1_0], %[Temp4], %[Temp1] \n\t" + "sub %[step1_3], %[Temp4], %[Temp1] \n\t" + + /* + temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + step1_4 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 2(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp1], 14(%[input]) \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" + "extp %[step1_4], $ac0, 31 \n\t" + + /* + temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1_7 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" + "extp %[step1_7], $ac1, 31 \n\t" + + /* + temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + step1_5 = dct_const_round_shift(temp_1); + */ + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" + "extp %[step1_5], $ac0, 31 \n\t" + + /* + temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1_6 = dct_const_round_shift(temp_2); + */ + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac1, %[Temp0], 
%[cospi_20_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add %[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "sh %[Temp0], 0(%[output]) \n\t" + "add %[Temp1], %[step1_1], %[step1_6] \n\t" + "sh %[Temp1], 16(%[output]) \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "sh %[Temp0], 32(%[output]) \n\t" + "add %[Temp1], %[step1_3], %[step1_4] \n\t" + "sh %[Temp1], 48(%[output]) \n\t" + + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "sh %[Temp0], 64(%[output]) \n\t" + "sub %[Temp1], %[step1_2], %[step1_5] \n\t" + "sh %[Temp1], 80(%[output]) \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "sh %[Temp0], 96(%[output]) \n\t" + "sub %[Temp1], %[step1_0], %[step1_7] \n\t" + "sh %[Temp1], 112(%[output]) \n\t" + + : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), + [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), + [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), + [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), + [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [Temp4] "=&r" (Temp4) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_24_64] "r" (cospi_24_64), + [output] "r" (output), [input] "r" (input) + ); + + input += 8; + output += 1; + } +} + +static void idct8_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int Temp0, Temp1, Temp2, Temp3; + int i; + const int const_2_power_13 = 8192; + uint8_t *dest_pix; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 8; ++i) { + dest_pix = (dest + i); + + __asm__ __volatile__ ( + /* + temp_1 = (input[0] + input[4]) * cospi_16_64; + step2_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[4]) * cospi_16_64; + step2_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 8(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add 
%[Temp2], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "extp %[step1_6], $ac0, 31 \n\t" + + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + /* + temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; + step2_2 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 4(%[input]) \n\t" + "lh %[Temp1], 12(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "extp %[Temp3], $ac0, 31 \n\t" + + /* + step1_1 = step2_1 + step2_2; + step1_2 = step2_1 - step2_2; + */ + "add %[step1_1], %[Temp2], %[Temp3] \n\t" + "sub %[step1_2], %[Temp2], %[Temp3] \n\t" + + /* + temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; + step2_3 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + /* + step1_0 = step2_0 + step2_3; + step1_3 = step2_0 - step2_3; + */ + "add %[step1_0], %[step1_6], %[Temp1] \n\t" + "sub %[step1_3], %[step1_6], %[Temp1] \n\t" + + /* + temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + step1_4 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 2(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp1], 14(%[input]) \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" + "extp %[step1_4], $ac0, 31 \n\t" + + /* + temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1_7 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" + "extp %[step1_7], $ac1, 31 \n\t" + + /* + temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + step1_5 = dct_const_round_shift(temp_1); + */ + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" + "extp %[step1_5], $ac0, 31 \n\t" + + /* + temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1_6 = dct_const_round_shift(temp_2); + */ + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add %[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp 
%[step1_6], $ac1, 31 \n\t" + + /* add block */ + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_1], %[step1_6] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_3], %[step1_4] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_2], %[step1_5] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_0], %[step1_7] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + + : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), + [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), + [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), + [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), + [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [dest_pix] "+r" (dest_pix) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_24_64] "r" (cospi_24_64), + [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) + ); + + input += 8; + } +} + +void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + uint32_t pos 
= 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // First transform rows + idct8_1d_rows_dspr2(input, outptr, 8); + + // Then transform columns and add to dest + idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); +} + +static void iadst8_1d_dspr2(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + int x0, x1, x2, x3, x4, x5, x6, x7; + + x0 = input[7]; + x1 = input[0]; + x2 = input[5]; + x3 = input[2]; + x4 = input[3]; + x5 = input[4]; + x6 = input[1]; + x7 = input[6]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + output[0] = output[1] = output[2] = output[3] = output[4] + = output[5] = output[6] = output[7] = 0; + return; + } + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); + x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); + x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS); + x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS); + x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS); + x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS); + x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS); + x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); + + output[0] = x0; + output[1] = -x4; + output[2] = x6; + output[3] = -x2; + output[4] = x3; + output[5] = -x7; + output[6] = x5; + output[7] = -x1; +} + +void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride, int tx_type) { + int i, j; + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + int16_t temp_in[8 * 8], temp_out[8]; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + switch (tx_type) { + case DCT_DCT: // DCT in both horizontal and vertical + idct8_1d_rows_dspr2(input, outptr, 8); + idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + break; + case ADST_DCT: // ADST in vertical, DCT in horizontal + idct8_1d_rows_dspr2(input, outptr, 8); + + for (i = 0; i < 8; ++i) { + iadst8_1d_dspr2(&out[i * 8], temp_out); + + for (j = 0; j < 8; ++j) + dest[j * dest_stride + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * dest_stride + i]); + } + break; + case DCT_ADST: // DCT in vertical, ADST in horizontal + for (i = 0; i < 8; ++i) { + iadst8_1d_dspr2(input, 
outptr); + input += 8; + outptr += 8; + } + + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) { + temp_in[i * 8 + j] = out[j * 8 + i]; + } + } + idct8_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + break; + case ADST_ADST: // ADST in both directions + for (i = 0; i < 8; ++i) { + iadst8_1d_dspr2(input, outptr); + input += 8; + outptr += 8; + } + + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + + iadst8_1d_dspr2(temp_in, temp_out); + + for (j = 0; j < 8; ++j) + dest[j * dest_stride + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * dest_stride + i]); + } + break; + default: + printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n"); + break; + } +} + +void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // First transform rows + idct8_1d_rows_dspr2(input, outptr, 4); + + outptr += 4; + + __asm__ __volatile__ ( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 48(%[outptr]) \n\t" + "sw $zero, 52(%[outptr]) \n\t" + "sw $zero, 64(%[outptr]) \n\t" + "sw $zero, 68(%[outptr]) \n\t" + "sw $zero, 80(%[outptr]) \n\t" + "sw $zero, 84(%[outptr]) \n\t" + "sw $zero, 96(%[outptr]) \n\t" + "sw $zero, 100(%[outptr]) \n\t" + "sw $zero, 112(%[outptr]) \n\t" + "sw $zero, 116(%[outptr]) \n\t" + + : + : [outptr] "r" (outptr) + ); + + + // Then transform columns and add to dest + idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); +} + +void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + uint32_t pos = 45; + int32_t out; + int32_t r; + int32_t a1, absa1; + int32_t t1, t2, vector_a1, vector_1, vector_2; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + + : + : [pos] "r" (pos) + ); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__ ( + "addi %[out], %[out], 16 \n\t" + "sra %[a1], %[out], 5 \n\t" + + : [out] "+r" (out), [a1] "=r" (a1) + : + ); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 8; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 8; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] 
\n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [dest] "+r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h index 768ff2c94..1651b9050 100644 --- a/vp9/common/vp9_enums.h +++ b/vp9/common/vp9_enums.h @@ -76,4 +76,15 @@ typedef enum { ADST_ADST = 3 // ADST in both directions } TX_TYPE; +typedef enum { + UNKNOWN = 0, + BT_601 = 1, // YUV + BT_709 = 2, // YUV + SMPTE_170 = 3, // YUV + SMPTE_240 = 4, // YUV + RESERVED_1 = 5, + RESERVED_2 = 6, + SRGB = 7 // RGB +} COLOR_SPACE; + #endif // VP9_COMMON_VP9_ENUMS_H_ diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 52b039d99..ea8683ea1 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -1280,6 +1280,31 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { } } +void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) { + int16_t out[32 * 32] = {0}; + int16_t *outptr = out; + int i, j; + int16_t temp_in[32], temp_out[32]; + + // Rows + // only upper-left 8x8 has non-zero coeff + for (i = 0; i < 8; ++i) { + idct32_1d(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + idct32_1d(temp_in, temp_out); + for (j = 0; j < 32; ++j) + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * stride + i]); + } +} + void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) { int i, j; int a1; @@ -1350,6 +1375,9 @@ void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride, if (eob) { if (eob == 1) vp9_idct32x32_1_add(input, dest, stride); + else if (eob <= 34) + // non-zero coeff only in upper-left 8x8 + vp9_idct32x32_34_add(input, dest, stride); else vp9_idct32x32_1024_add(input, dest, stride); } diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index 289210ecb..704469e29 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -90,6 +90,8 @@ typedef struct VP9Common { DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][8]); #endif + COLOR_SPACE color_space; + int width; int height; int display_width; @@ -217,6 +219,13 @@ typedef struct VP9Common { int cur_tile_mi_row_start, cur_tile_mi_row_end; } VP9_COMMON; +// ref == 0 => LAST_FRAME +// ref == 1 => GOLDEN_FRAME +// ref == 2 => ALTREF_FRAME +static YV12_BUFFER_CONFIG *get_frame_ref_buffer(VP9_COMMON *cm, int ref) { + return &cm->yv12_fb[cm->active_ref_idx[ref]]; +} + static int get_free_fb(VP9_COMMON *cm) { int i; for (i = 0; i < NUM_YV12_BUFFERS; i++) diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index ba96e5ad6..2d9fbff97 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -268,43 +268,46 @@ specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2 # dct # prototype void vp9_idct4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct4x4_1_add sse2 neon +specialize vp9_idct4x4_1_add sse2 neon dspr2 prototype void vp9_idct4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct4x4_16_add sse2 neon +specialize vp9_idct4x4_16_add sse2 neon dspr2 prototype void vp9_idct8x8_1_add "const int16_t *input, uint8_t *dest, int dest_stride" 
-specialize vp9_idct8x8_1_add sse2 neon
+specialize vp9_idct8x8_1_add sse2 neon dspr2
 
 prototype void vp9_idct8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct8x8_64_add sse2 neon
+specialize vp9_idct8x8_64_add sse2 neon dspr2
 
 prototype void vp9_idct8x8_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct8x8_10_add sse2 neon
+specialize vp9_idct8x8_10_add sse2 neon dspr2
 
 prototype void vp9_idct16x16_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct16x16_1_add sse2 neon
+specialize vp9_idct16x16_1_add sse2 neon dspr2
 
 prototype void vp9_idct16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct16x16_256_add sse2 neon
+specialize vp9_idct16x16_256_add sse2 neon dspr2
 
 prototype void vp9_idct16x16_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct16x16_10_add sse2 neon
+specialize vp9_idct16x16_10_add sse2 neon dspr2
 
 prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct32x32_1024_add sse2 neon
+specialize vp9_idct32x32_1024_add sse2 neon dspr2
+
+prototype void vp9_idct32x32_34_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct32x32_34_add sse2
 
 prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct32x32_1_add sse2
+specialize vp9_idct32x32_1_add sse2 dspr2
 
 prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_iht4x4_16_add sse2 neon
+specialize vp9_iht4x4_16_add sse2 neon dspr2
 
 prototype void vp9_iht8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_iht8x8_64_add sse2 neon
+specialize vp9_iht8x8_64_add sse2 neon dspr2
 
 prototype void vp9_iht16x16_256_add "const int16_t *input, uint8_t *output, int pitch, int tx_type"
-specialize vp9_iht16x16_256_add sse2
+specialize vp9_iht16x16_256_add sse2 dspr2
 
 # dct and add
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 74de6c670..ccf5aac17 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -415,7 +415,7 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
     res3 = _mm_packs_epi32(tmp6, tmp7); \
   }
 
-#define IDCT8x8_1D \
+#define IDCT8_1D \
   /* Stage1 */ \
   { \
     const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
@@ -530,7 +530,7 @@ void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
                 in4, in5, in6, in7);
 
   // 4-stage 1D idct8x8
-  IDCT8x8_1D
+  IDCT8_1D
   }
 
   // Final rounding and shift
@@ -643,7 +643,7 @@ static void idct8_1d_sse2(__m128i *in) {
                 in4, in5, in6, in7);
 
   // 4-stage 1D idct8x8
-  IDCT8x8_1D
+  IDCT8_1D
 
   in[0] = in0;
   in[1] = in1;
   in[2] = in2;
@@ -1068,7 +1068,7 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
                 in4, in5, in6, in7)
 
   // 1D idct8x8
-  IDCT8x8_1D
+  IDCT8_1D
 
   // Final rounding and shift
   in0 = _mm_adds_epi16(in0, final_rounding);
@@ -1099,7 +1099,7 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   RECON_AND_STORE(dest, in7);
 }
 
-#define IDCT16x16_1D \
+#define IDCT16_1D \
   /* Stage2 */ \
   { \
     const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
@@ -1321,7 +1321,7 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
                     in12, in13, in14, in15);
     }
 
-    IDCT16x16_1D
+    IDCT16_1D
 
     // Stage7
     if (i == 0) {
@@ -2703,7 +2703,7 @@
void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; - IDCT16x16_1D + IDCT16_1D // Stage7 in0 = _mm_add_epi16(stp2_0, stp1_15); @@ -2785,6 +2785,698 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, input += 8; \ } \ +#define IDCT32_1D \ +/* Stage1 */ \ +{ \ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \ + const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \ + const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \ + \ + const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \ + const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \ + const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \ + const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \ + const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \ + \ + const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \ + const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \ + \ + MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ + stp1_17, stp1_30) \ + MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ + stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ + stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ + stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ +} \ +\ +/* Stage2 */ \ +{ \ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \ + const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \ + const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \ + \ + const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \ + const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ + stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ + stp2_14) \ + MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ + stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ + stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ + \ + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ + stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ + \ + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ + stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ + stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ +} \ +\ +/* Stage3 */ \ +{ \ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \ + const 
__m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \ + const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \ + const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + \ + MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ + stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ + stp1_6) \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ + stp1_18, stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ + stp1_22, stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ +} \ +\ +/* Stage4 */ \ +{ \ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \ + const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \ + const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ + stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ + stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ + stp2_10, stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, 
stp1_31); \ +} \ +\ +/* Stage5 */ \ +{ \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} \ +\ +/* Stage6 */ \ +{ \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ + stp2_13, stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, 
stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ +} \ +\ +/* Stage7 */ \ +{ \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} + +// Only upper-left 8x8 has non-zero coeff +void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + + // idct constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i 
stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, + in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, + in24, in25, in26, in27, in28, in29, in30, in31; + __m128i col[128]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, + stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, + stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, + stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, + stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i, j, i32; + + // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. + for (i = 0; i < 8; i++) { + i32 = (i << 5); + if (i == 0) { + // First 1-D idct: first 8 rows + // Load input data. 
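+      // Each LOAD_DQCOEFF below reads eight coefficients and advances
+      // `input`, so registers in{k}, in{k+8}, in{k+16} and in{k+24} pick up
+      // the four 8-wide segments of row k; the TRANSPOSE_8X8 calls that
+      // follow then turn those row segments into the per-column vectors the
+      // first 1-D pass consumes.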
+      LOAD_DQCOEFF(in0, input);
+      LOAD_DQCOEFF(in8, input);
+      LOAD_DQCOEFF(in16, input);
+      LOAD_DQCOEFF(in24, input);
+      LOAD_DQCOEFF(in1, input);
+      LOAD_DQCOEFF(in9, input);
+      LOAD_DQCOEFF(in17, input);
+      LOAD_DQCOEFF(in25, input);
+      LOAD_DQCOEFF(in2, input);
+      LOAD_DQCOEFF(in10, input);
+      LOAD_DQCOEFF(in18, input);
+      LOAD_DQCOEFF(in26, input);
+      LOAD_DQCOEFF(in3, input);
+      LOAD_DQCOEFF(in11, input);
+      LOAD_DQCOEFF(in19, input);
+      LOAD_DQCOEFF(in27, input);
+
+      LOAD_DQCOEFF(in4, input);
+      LOAD_DQCOEFF(in12, input);
+      LOAD_DQCOEFF(in20, input);
+      LOAD_DQCOEFF(in28, input);
+      LOAD_DQCOEFF(in5, input);
+      LOAD_DQCOEFF(in13, input);
+      LOAD_DQCOEFF(in21, input);
+      LOAD_DQCOEFF(in29, input);
+      LOAD_DQCOEFF(in6, input);
+      LOAD_DQCOEFF(in14, input);
+      LOAD_DQCOEFF(in22, input);
+      LOAD_DQCOEFF(in30, input);
+      LOAD_DQCOEFF(in7, input);
+      LOAD_DQCOEFF(in15, input);
+      LOAD_DQCOEFF(in23, input);
+      LOAD_DQCOEFF(in31, input);
+
+      // Transpose 32x8 block to 8x32 block
+      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                    in4, in5, in6, in7);
+      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+                    in10, in11, in12, in13, in14, in15);
+      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
+                    in18, in19, in20, in21, in22, in23);
+      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
+                    in26, in27, in28, in29, in30, in31);
+    } else if (i < 4) {
+      // First 1-D idct: next 24 zero-coeff rows
+      col[i32 + 0] = _mm_setzero_si128();
+      col[i32 + 1] = _mm_setzero_si128();
+      col[i32 + 2] = _mm_setzero_si128();
+      col[i32 + 3] = _mm_setzero_si128();
+      col[i32 + 4] = _mm_setzero_si128();
+      col[i32 + 5] = _mm_setzero_si128();
+      col[i32 + 6] = _mm_setzero_si128();
+      col[i32 + 7] = _mm_setzero_si128();
+      col[i32 + 8] = _mm_setzero_si128();
+      col[i32 + 9] = _mm_setzero_si128();
+      col[i32 + 10] = _mm_setzero_si128();
+      col[i32 + 11] = _mm_setzero_si128();
+      col[i32 + 12] = _mm_setzero_si128();
+      col[i32 + 13] = _mm_setzero_si128();
+      col[i32 + 14] = _mm_setzero_si128();
+      col[i32 + 15] = _mm_setzero_si128();
+      col[i32 + 16] = _mm_setzero_si128();
+      col[i32 + 17] = _mm_setzero_si128();
+      col[i32 + 18] = _mm_setzero_si128();
+      col[i32 + 19] = _mm_setzero_si128();
+      col[i32 + 20] = _mm_setzero_si128();
+      col[i32 + 21] = _mm_setzero_si128();
+      col[i32 + 22] = _mm_setzero_si128();
+      col[i32 + 23] = _mm_setzero_si128();
+      col[i32 + 24] = _mm_setzero_si128();
+      col[i32 + 25] = _mm_setzero_si128();
+      col[i32 + 26] = _mm_setzero_si128();
+      col[i32 + 27] = _mm_setzero_si128();
+      col[i32 + 28] = _mm_setzero_si128();
+      col[i32 + 29] = _mm_setzero_si128();
+      col[i32 + 30] = _mm_setzero_si128();
+      col[i32 + 31] = _mm_setzero_si128();
+      continue;
+    } else {
+      // Second 1-D idct
+      j = i - 4;
+
+      // Transpose 32x8 block to 8x32 block
+      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
+                    in5, in6, in7);
+      j += 4;
+      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
+                    in11, in12, in13, in14, in15);
+      j += 4;
+      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
+                    in19, in20, in21, in22, in23);
+      j += 4;
+      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
+                    in28, in29, in30, in31);
+    }
+
+    IDCT32_1D
+
+    // final stage
+    if (i < 4) {
+      // 1_D: Store 32 intermediate results for each 8x32 block.
+      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+    } else {
+      const __m128i zero = _mm_setzero_si128();
+
+      // 2_D: Calculate the results and store them to destination.
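+      // The final butterfly below is symmetric: output row k is
+      // stp1_k + stp1_(31-k) and output row (31-k) is the matching
+      // difference, the same pattern used for the column stores above.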
+      in0 = _mm_add_epi16(stp1_0, stp1_31);
+      in1 = _mm_add_epi16(stp1_1, stp1_30);
+      in2 = _mm_add_epi16(stp1_2, stp1_29);
+      in3 = _mm_add_epi16(stp1_3, stp1_28);
+      in4 = _mm_add_epi16(stp1_4, stp1_27);
+      in5 = _mm_add_epi16(stp1_5, stp1_26);
+      in6 = _mm_add_epi16(stp1_6, stp1_25);
+      in7 = _mm_add_epi16(stp1_7, stp1_24);
+      in8 = _mm_add_epi16(stp1_8, stp1_23);
+      in9 = _mm_add_epi16(stp1_9, stp1_22);
+      in10 = _mm_add_epi16(stp1_10, stp1_21);
+      in11 = _mm_add_epi16(stp1_11, stp1_20);
+      in12 = _mm_add_epi16(stp1_12, stp1_19);
+      in13 = _mm_add_epi16(stp1_13, stp1_18);
+      in14 = _mm_add_epi16(stp1_14, stp1_17);
+      in15 = _mm_add_epi16(stp1_15, stp1_16);
+      in16 = _mm_sub_epi16(stp1_15, stp1_16);
+      in17 = _mm_sub_epi16(stp1_14, stp1_17);
+      in18 = _mm_sub_epi16(stp1_13, stp1_18);
+      in19 = _mm_sub_epi16(stp1_12, stp1_19);
+      in20 = _mm_sub_epi16(stp1_11, stp1_20);
+      in21 = _mm_sub_epi16(stp1_10, stp1_21);
+      in22 = _mm_sub_epi16(stp1_9, stp1_22);
+      in23 = _mm_sub_epi16(stp1_8, stp1_23);
+      in24 = _mm_sub_epi16(stp1_7, stp1_24);
+      in25 = _mm_sub_epi16(stp1_6, stp1_25);
+      in26 = _mm_sub_epi16(stp1_5, stp1_26);
+      in27 = _mm_sub_epi16(stp1_4, stp1_27);
+      in28 = _mm_sub_epi16(stp1_3, stp1_28);
+      in29 = _mm_sub_epi16(stp1_2, stp1_29);
+      in30 = _mm_sub_epi16(stp1_1, stp1_30);
+      in31 = _mm_sub_epi16(stp1_0, stp1_31);
+
+      // Final rounding and shift
+      in0 = _mm_adds_epi16(in0, final_rounding);
+      in1 = _mm_adds_epi16(in1, final_rounding);
+      in2 = _mm_adds_epi16(in2, final_rounding);
+      in3 = _mm_adds_epi16(in3, final_rounding);
+      in4 = _mm_adds_epi16(in4, final_rounding);
+      in5 = _mm_adds_epi16(in5, final_rounding);
+      in6 = _mm_adds_epi16(in6, final_rounding);
+      in7 = _mm_adds_epi16(in7, final_rounding);
+      in8 = _mm_adds_epi16(in8, final_rounding);
+      in9 = _mm_adds_epi16(in9, final_rounding);
+      in10 = _mm_adds_epi16(in10, final_rounding);
+      in11 = _mm_adds_epi16(in11, final_rounding);
+      in12 = _mm_adds_epi16(in12, final_rounding);
+      in13 = _mm_adds_epi16(in13, final_rounding);
+      in14 = _mm_adds_epi16(in14, final_rounding);
+      in15 = _mm_adds_epi16(in15, final_rounding);
+      in16 = _mm_adds_epi16(in16, final_rounding);
+      in17 = _mm_adds_epi16(in17, final_rounding);
+      in18 = _mm_adds_epi16(in18, final_rounding);
+      in19 = _mm_adds_epi16(in19, final_rounding);
+      in20 = _mm_adds_epi16(in20, final_rounding);
+      in21 = _mm_adds_epi16(in21, final_rounding);
+      in22 = _mm_adds_epi16(in22, final_rounding);
+      in23 = _mm_adds_epi16(in23, final_rounding);
+      in24 = _mm_adds_epi16(in24, final_rounding);
+      in25 = _mm_adds_epi16(in25, final_rounding);
+      in26 = _mm_adds_epi16(in26, final_rounding);
+      in27 = _mm_adds_epi16(in27, final_rounding);
+      in28 = _mm_adds_epi16(in28, final_rounding);
+      in29 = _mm_adds_epi16(in29, final_rounding);
+      in30 = _mm_adds_epi16(in30, final_rounding);
+      in31 = _mm_adds_epi16(in31, final_rounding);
+
+      in0 = _mm_srai_epi16(in0, 6);
+      in1 = _mm_srai_epi16(in1, 6);
+      in2 = _mm_srai_epi16(in2, 6);
+      in3 = _mm_srai_epi16(in3, 6);
+      in4 = _mm_srai_epi16(in4, 6);
+      in5 = _mm_srai_epi16(in5, 6);
+      in6 = _mm_srai_epi16(in6, 6);
+      in7 = _mm_srai_epi16(in7, 6);
+      in8 = _mm_srai_epi16(in8, 6);
+      in9 = _mm_srai_epi16(in9, 6);
+      in10 = _mm_srai_epi16(in10, 6);
+      in11 = _mm_srai_epi16(in11, 6);
+      in12 = _mm_srai_epi16(in12, 6);
+      in13 = _mm_srai_epi16(in13, 6);
+      in14 = _mm_srai_epi16(in14, 6);
+      in15 = _mm_srai_epi16(in15, 6);
+      in16 = _mm_srai_epi16(in16, 6);
+      in17 = _mm_srai_epi16(in17, 6);
+      in18 = _mm_srai_epi16(in18, 6);
+      in19 = _mm_srai_epi16(in19, 6);
+      in20 = _mm_srai_epi16(in20, 6);
+      in21 = _mm_srai_epi16(in21, 6);
+      in22 = _mm_srai_epi16(in22, 6);
+      in23 = _mm_srai_epi16(in23, 6);
+      in24 = _mm_srai_epi16(in24, 6);
+      in25 = _mm_srai_epi16(in25, 6);
+      in26 = _mm_srai_epi16(in26, 6);
+      in27 = _mm_srai_epi16(in27, 6);
+      in28 = _mm_srai_epi16(in28, 6);
+      in29 = _mm_srai_epi16(in29, 6);
+      in30 = _mm_srai_epi16(in30, 6);
+      in31 = _mm_srai_epi16(in31, 6);
+
+      RECON_AND_STORE(dest, in0);
+      RECON_AND_STORE(dest, in1);
+      RECON_AND_STORE(dest, in2);
+      RECON_AND_STORE(dest, in3);
+      RECON_AND_STORE(dest, in4);
+      RECON_AND_STORE(dest, in5);
+      RECON_AND_STORE(dest, in6);
+      RECON_AND_STORE(dest, in7);
+      RECON_AND_STORE(dest, in8);
+      RECON_AND_STORE(dest, in9);
+      RECON_AND_STORE(dest, in10);
+      RECON_AND_STORE(dest, in11);
+      RECON_AND_STORE(dest, in12);
+      RECON_AND_STORE(dest, in13);
+      RECON_AND_STORE(dest, in14);
+      RECON_AND_STORE(dest, in15);
+      RECON_AND_STORE(dest, in16);
+      RECON_AND_STORE(dest, in17);
+      RECON_AND_STORE(dest, in18);
+      RECON_AND_STORE(dest, in19);
+      RECON_AND_STORE(dest, in20);
+      RECON_AND_STORE(dest, in21);
+      RECON_AND_STORE(dest, in22);
+      RECON_AND_STORE(dest, in23);
+      RECON_AND_STORE(dest, in24);
+      RECON_AND_STORE(dest, in25);
+      RECON_AND_STORE(dest, in26);
+      RECON_AND_STORE(dest, in27);
+      RECON_AND_STORE(dest, in28);
+      RECON_AND_STORE(dest, in29);
+      RECON_AND_STORE(dest, in30);
+      RECON_AND_STORE(dest, in31);
+
+      dest += 8 - (stride * 32);
+    }
+  }
+}
+
 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
                                  int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -3009,336 +3701,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
                     in28, in29, in30, in31);
     }
-
-    // Stage1
-    {
-      const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31);
-      const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31);
-      const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15);
-      const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15);
-
-      const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23);
-      const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23);
-      const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7);
-      const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7);
-
-      const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27);
-      const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27);
-      const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11);
-      const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11);
-
-      const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19);
-      const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19);
-      const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3);
-      const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3);
-
-      MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,
-                             stg1_1, stg1_2, stg1_3, stp1_16, stp1_31,
-                             stp1_17, stp1_30)
-      MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4,
-                             stg1_5, stg1_6, stg1_7, stp1_18, stp1_29,
-                             stp1_19, stp1_28)
-      MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,
-                             stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,
-                             stp1_21, stp1_26)
-      MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,
-                             stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,
-                             stp1_23, stp1_24)
-    }
-
-    // Stage2
-    {
-      const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30);
-      const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30);
-      const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14);
-      const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14);
-
-      const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22);
-      const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22);
-      const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6);
-      const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6);
-
-      MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,
-                             stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,
-                             stp2_14)
-      MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,
-                             stg2_5, stg2_6, stg2_7, stp2_10, stp2_13,
-                             stp2_11, stp2_12)
-
-      stp2_16 = _mm_add_epi16(stp1_16, stp1_17);
-      stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);
-      stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);
-      stp2_19 = _mm_add_epi16(stp1_19, stp1_18);
-
-      stp2_20 = _mm_add_epi16(stp1_20, stp1_21);
-      stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);
-      stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);
-      stp2_23 = _mm_add_epi16(stp1_23, stp1_22);
-
-      stp2_24 = _mm_add_epi16(stp1_24, stp1_25);
-      stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);
-      stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);
-      stp2_27 = _mm_add_epi16(stp1_27, stp1_26);
-
-      stp2_28 = _mm_add_epi16(stp1_28, stp1_29);
-      stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);
-      stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);
-      stp2_31 = _mm_add_epi16(stp1_31, stp1_30);
-    }
-
-    // Stage3
-    {
-      const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28);
-      const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28);
-      const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12);
-      const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12);
-
-      const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);
-      const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);
-      const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
-      const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
-
-      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
-      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
-      const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
-      const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
-
-      MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,
-                             stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,
-                             stp1_6)
-
-      stp1_8 = _mm_add_epi16(stp2_8, stp2_9);
-      stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
-      stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
-      stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
-      stp1_12 = _mm_add_epi16(stp2_12, stp2_13);
-      stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
-      stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
-      stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
-
-      MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,
-                             stg3_5, stg3_6, stg3_4, stp1_17, stp1_30,
-                             stp1_18, stp1_29)
-      MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,
-                             stg3_9, stg3_10, stg3_8, stp1_21, stp1_26,
-                             stp1_22, stp1_25)
-
-      stp1_16 = stp2_16;
-      stp1_31 = stp2_31;
-      stp1_19 = stp2_19;
-      stp1_20 = stp2_20;
-      stp1_23 = stp2_23;
-      stp1_24 = stp2_24;
-      stp1_27 = stp2_27;
-      stp1_28 = stp2_28;
-    }
-
-    // Stage4
-    {
-      const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16);
-      const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16);
-      const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24);
-      const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24);
-
-      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
-      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
-      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
-      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
-
-      MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0,
-                             stg4_1, stg4_2, stg4_3, stp2_0, stp2_1,
-                             stp2_2, stp2_3)
-
-      stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
-      stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
-      stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
-      stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-
-      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,
-                             stg4_5, stg4_6, stg4_4, stp2_9, stp2_14,
-                             stp2_10, stp2_13)
-
-      stp2_8 = stp1_8;
-      stp2_15 = stp1_15;
-      stp2_11 = stp1_11;
-      stp2_12 = stp1_12;
-
-      stp2_16 = _mm_add_epi16(stp1_16, stp1_19);
-      stp2_17 = _mm_add_epi16(stp1_17, stp1_18);
-      stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);
-      stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);
-      stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);
-      stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);
-      stp2_22 = _mm_add_epi16(stp1_22, stp1_21);
-      stp2_23 = _mm_add_epi16(stp1_23, stp1_20);
-
-      stp2_24 = _mm_add_epi16(stp1_24, stp1_27);
-      stp2_25 = _mm_add_epi16(stp1_25, stp1_26);
-      stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);
-      stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);
-      stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);
-      stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);
-      stp2_30 = _mm_add_epi16(stp1_29, stp1_30);
-      stp2_31 = _mm_add_epi16(stp1_28, stp1_31);
-    }
-
-    // Stage5
-    {
-      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
-      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);
-      const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
-      const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
-
-      const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);
-      const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);
-      const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
-      const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
-
-      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
-      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
-
-      stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
-      stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
-      stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
-      stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
-      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);
-      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);
-      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);
-      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);
-
-      tmp0 = _mm_add_epi32(tmp0, rounding);
-      tmp1 = _mm_add_epi32(tmp1, rounding);
-      tmp2 = _mm_add_epi32(tmp2, rounding);
-      tmp3 = _mm_add_epi32(tmp3, rounding);
-
-      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-
-      stp1_5 = _mm_packs_epi32(tmp0, tmp1);
-      stp1_6 = _mm_packs_epi32(tmp2, tmp3);
-
-      stp1_4 = stp2_4;
-      stp1_7 = stp2_7;
-
-      stp1_8 = _mm_add_epi16(stp2_8, stp2_11);
-      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
-      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
-      stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);
-      stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);
-      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
-      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
-      stp1_15 = _mm_add_epi16(stp2_15, stp2_12);
-
-      stp1_16 = stp2_16;
-      stp1_17 = stp2_17;
-
-      MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,
-                             stg4_5, stg4_4, stg4_5, stp1_18, stp1_29,
-                             stp1_19, stp1_28)
-      MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,
-                             stg4_4, stg4_6, stg4_4, stp1_20, stp1_27,
-                             stp1_21, stp1_26)
-
-      stp1_22 = stp2_22;
-      stp1_23 = stp2_23;
-      stp1_24 = stp2_24;
-      stp1_25 = stp2_25;
-      stp1_30 = stp2_30;
-      stp1_31 = stp2_31;
-    }
-
-    // Stage6
-    {
-      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
-      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
-      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
-      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);
-
-      stp2_0 = _mm_add_epi16(stp1_0, stp1_7);
-      stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
-      stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
-      stp2_3 = _mm_add_epi16(stp1_3, stp1_4);
-      stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);
-      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
-      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
-      stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);
-
-      stp2_8 = stp1_8;
-      stp2_9 = stp1_9;
-      stp2_14 = stp1_14;
-      stp2_15 = stp1_15;
-
-      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12,
-                             stg6_0, stg4_0, stg6_0, stg4_0, stp2_10,
-                             stp2_13, stp2_11, stp2_12)
-
-      stp2_16 = _mm_add_epi16(stp1_16, stp1_23);
-      stp2_17 = _mm_add_epi16(stp1_17, stp1_22);
-      stp2_18 = _mm_add_epi16(stp1_18, stp1_21);
-      stp2_19 = _mm_add_epi16(stp1_19, stp1_20);
-      stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);
-      stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);
-      stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);
-      stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);
-
-      stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);
-      stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);
-      stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);
-      stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);
-      stp2_28 = _mm_add_epi16(stp1_27, stp1_28);
-      stp2_29 = _mm_add_epi16(stp1_26, stp1_29);
-      stp2_30 = _mm_add_epi16(stp1_25, stp1_30);
-      stp2_31 = _mm_add_epi16(stp1_24, stp1_31);
-    }
-
-    // Stage7
-    {
-      const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
-      const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
-      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
-      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
-
-      const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
-      const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
-      const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);
-      const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);
-
-      stp1_0 = _mm_add_epi16(stp2_0, stp2_15);
-      stp1_1 = _mm_add_epi16(stp2_1, stp2_14);
-      stp1_2 = _mm_add_epi16(stp2_2, stp2_13);
-      stp1_3 = _mm_add_epi16(stp2_3, stp2_12);
-      stp1_4 = _mm_add_epi16(stp2_4, stp2_11);
-      stp1_5 = _mm_add_epi16(stp2_5, stp2_10);
-      stp1_6 = _mm_add_epi16(stp2_6, stp2_9);
-      stp1_7 = _mm_add_epi16(stp2_7, stp2_8);
-      stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);
-      stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);
-      stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);
-      stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);
-      stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);
-      stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);
-      stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);
-      stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);
-
-      stp1_16 = stp2_16;
-      stp1_17 = stp2_17;
-      stp1_18 = stp2_18;
-      stp1_19 = stp2_19;
-
-      MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,
-                             stg4_0, stg6_0, stg4_0, stp1_20, stp1_27,
-                             stp1_21, stp1_26)
-      MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,
-                             stg4_0, stg6_0, stg4_0, stp1_22, stp1_25,
-                             stp1_23, stp1_24)
-
-      stp1_28 = stp2_28;
-      stp1_29 = stp2_29;
-      stp1_30 = stp2_30;
-      stp1_31 = stp2_31;
-    }
+    IDCT32_1D
 
     // final stage
     if (i < 4) {
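For reference, the per-pixel arithmetic behind the final rounding, shift, and reconstruction in both functions reduces to the scalar loop below. This is a minimal sketch, assuming RECON_AND_STORE adds the shifted residual to one eight-pixel destination row with unsigned saturation and then advances dest by stride; clip_pixel and recon_row are illustrative names, not part of this patch.

#include <stdint.h>

/* Scalar model of the 32x32 idct final stage: add the rounding constant
 * (1 << 5), arithmetic-shift right by 6, then add the residual to the
 * prediction, clamping to [0, 255].  The SSE2 path does the same with
 * _mm_adds_epi16/_mm_srai_epi16 and a packed add with unsigned
 * saturation. */
static uint8_t clip_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void recon_row(const int16_t *residual, uint8_t *dest, int width) {
  int c;
  for (c = 0; c < width; ++c)
    dest[c] = clip_pixel(dest[c] + ((residual[c] + (1 << 5)) >> 6));
}

Calling recon_row once per output line with width 8 and stepping dest by stride mirrors the 32 RECON_AND_STORE invocations in each 8-column pass.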