Diffstat (limited to 'vp9/common')
-rw-r--r--  vp9/common/mips/dspr2/vp9_common_dspr2.h          3
-rw-r--r--  vp9/common/mips/dspr2/vp9_itrans16_dspr2.c      1315
-rw-r--r--  vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c 1073
-rw-r--r--  vp9/common/mips/dspr2/vp9_itrans32_dspr2.c      1013
-rw-r--r--  vp9/common/mips/dspr2/vp9_itrans4_dspr2.c        438
-rw-r--r--  vp9/common/mips/dspr2/vp9_itrans8_dspr2.c        745
-rw-r--r--  vp9/common/vp9_enums.h                            11
-rw-r--r--  vp9/common/vp9_idct.c                             28
-rw-r--r--  vp9/common/vp9_onyxc_int.h                         9
-rw-r--r--  vp9/common/vp9_rtcd_defs.sh                       29
-rw-r--r--  vp9/common/x86/vp9_idct_intrin_sse2.c           1037
11 files changed, 5351 insertions, 350 deletions
diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h
index dc88f1603..644264f65 100644
--- a/vp9/common/mips/dspr2/vp9_common_dspr2.h
+++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h
@@ -81,6 +81,9 @@ static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) {
);
}
+void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride);
+
void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
diff --git a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
new file mode 100644
index 000000000..1b2f5506a
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
@@ -0,0 +1,1315 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output,
+ uint32_t no_rows) {
+ int i;
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int step1_10, step1_11, step1_12, step1_13;
+ int step2_0, step2_1, step2_2, step2_3;
+ int step2_8, step2_9, step2_10, step2_11;
+ int step2_12, step2_13, step2_14, step2_15;
+ int load1, load2, load3, load4, load5, load6, load7, load8;
+ int result1, result2, result3, result4;
+ const int const_2_power_13 = 8192;
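+ /* 8192 == 2^13 is the rounding bias: preloading it into the accumulator
+ * makes the later "extp ..., 31" (a right shift by DCT_CONST_BITS == 14)
+ * round to nearest, matching dct_const_round_shift() in the C code */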
+
+ for (i = no_rows; i--; ) {
+ /* prefetch row */
+ vp9_prefetch_load((const uint8_t *)(input + 16));
+
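+ /* even-half butterflies; equivalent scalar computation (cf. the C
+ * reference idct16_1d in vp9/common/vp9_idct.c):
+ * step2_0 = dct_const_round_shift((input[0] + input[8]) * cospi_16_64);
+ * step2_1 = dct_const_round_shift((input[0] - input[8]) * cospi_16_64);
+ * step2_2 = dct_const_round_shift(input[4] * cospi_24_64 - input[12] * cospi_8_64);
+ * step2_3 = dct_const_round_shift(input[4] * cospi_8_64 + input[12] * cospi_24_64); */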
+ __asm__ __volatile__ (
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 16(%[input]) \n\t"
+ "lh %[load3], 8(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[step2_0], $ac1, 31 \n\t"
+ "extp %[step2_1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[step2_2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[step2_3], $ac1, 31 \n\t"
+
+ "add %[step1_0], %[step2_0], %[step2_3] \n\t"
+ "add %[step1_1], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
+ [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
+ [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+ [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load5], 2(%[input]) \n\t"
+ "lh %[load6], 30(%[input]) \n\t"
+ "lh %[load7], 18(%[input]) \n\t"
+ "lh %[load8], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_2_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_14_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_18_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_14_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_2_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_30_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "sub %[load5], %[result1], %[result2] \n\t"
+ "sub %[load6], %[result4], %[result3] \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load6], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load5], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load5], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_8_64] \n\t"
+
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[result1], %[result2] \n\t"
+ "add %[step2_15], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [load7] "=&r" (load7), [load8] "=&r" (load8),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
+ [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+ [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 22(%[input]) \n\t"
+ "lh %[load3], 26(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_26_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_22_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac2, %[load4], %[cospi_6_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[result2], %[result1] \n\t"
+ "sub %[load2], %[result4], %[result3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[result1], %[result2] \n\t"
+ "add %[step2_12], %[result4], %[result3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+ [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+ [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load5], 4(%[input]) \n\t"
+ "lh %[load6], 28(%[input]) \n\t"
+ "lh %[load7], 20(%[input]) \n\t"
+ "lh %[load8], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_4_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_12_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_20_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_12_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_4_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_28_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[result4], %[result3] \n\t"
+ "sub %[load5], %[load5], %[result1] \n\t"
+ "add %[load5], %[load5], %[result2] \n\t"
+
+ "sub %[load6], %[result1], %[result2] \n\t"
+ "sub %[load6], %[load6], %[result3] \n\t"
+ "add %[load6], %[load6], %[result4] \n\t"
+
+ "madd $ac1, %[load5], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[result1], %[result2] \n\t"
+ "add %[step1_7], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [load7] "=&r" (load7), [load8] "=&r" (load8),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+ [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "sub %[load5], %[step2_14], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+
+ "madd $ac0, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_14], %[step2_13] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "add %[load6], %[load6], %[step2_9] \n\t"
+
+ "madd $ac1, %[load6], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[step2_15], %[step2_12] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "add %[load5], %[load5], %[step2_11] \n\t"
+
+ "madd $ac2, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_15], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_11] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_10], $ac0, 31 \n\t"
+ "extp %[step1_13], $ac1, 31 \n\t"
+ "extp %[step1_11], $ac2, 31 \n\t"
+ "extp %[step1_12], $ac3, 31 \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
+ [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
+ [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+ [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
+ [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "add %[load5], %[step1_0], %[step1_7] \n\t"
+ "add %[load5], %[load5], %[step2_12] \n\t"
+ "add %[load5], %[load5], %[step2_15] \n\t"
+ "add %[load6], %[step1_1], %[step1_6] \n\t"
+ "add %[load6], %[load6], %[step2_13] \n\t"
+ "add %[load6], %[load6], %[step2_14] \n\t"
+ "sh %[load5], 0(%[output]) \n\t"
+ "sh %[load6], 32(%[output]) \n\t"
+ "sub %[load5], %[step1_1], %[step1_6] \n\t"
+ "add %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+ "sub %[load6], %[step1_0], %[step1_7] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+ "add %[load6], %[load6], %[step2_11] \n\t"
+ "sh %[load5], 192(%[output]) \n\t"
+ "sh %[load6], 224(%[output]) \n\t"
+ "sub %[load5], %[step1_0], %[step1_7] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "sub %[load5], %[load5], %[step2_11] \n\t"
+ "sub %[load6], %[step1_1], %[step1_6] \n\t"
+ "sub %[load6], %[load6], %[step2_9] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "sh %[load5], 256(%[output]) \n\t"
+ "sh %[load6], 288(%[output]) \n\t"
+ "add %[load5], %[step1_1], %[step1_6] \n\t"
+ "sub %[load5], %[load5], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_14] \n\t"
+ "add %[load6], %[step1_0], %[step1_7] \n\t"
+ "sub %[load6], %[load6], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_15] \n\t"
+ "sh %[load5], 448(%[output]) \n\t"
+ "sh %[load6], 480(%[output]) \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6)
+ : [output] "r" (output),
+ [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+ [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+ [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
+ [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
+ [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
+ [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)
+ );
+
+ __asm__ __volatile__ (
+ "add %[load5], %[step1_2], %[step1_5] \n\t"
+ "add %[load5], %[load5], %[step1_13] \n\t"
+ "add %[load6], %[step1_3], %[step1_4] \n\t"
+ "add %[load6], %[load6], %[step1_12] \n\t"
+ "sh %[load5], 64(%[output]) \n\t"
+ "sh %[load6], 96(%[output]) \n\t"
+ "sub %[load5], %[step1_3], %[step1_4] \n\t"
+ "add %[load5], %[load5], %[step1_11] \n\t"
+ "sub %[load6], %[step1_2], %[step1_5] \n\t"
+ "add %[load6], %[load6], %[step1_10] \n\t"
+ "sh %[load5], 128(%[output]) \n\t"
+ "sh %[load6], 160(%[output]) \n\t"
+ "sub %[load5], %[step1_2], %[step1_5] \n\t"
+ "sub %[load5], %[load5], %[step1_10] \n\t"
+ "sub %[load6], %[step1_3], %[step1_4] \n\t"
+ "sub %[load6], %[load6], %[step1_11] \n\t"
+ "sh %[load5], 320(%[output]) \n\t"
+ "sh %[load6], 352(%[output]) \n\t"
+ "add %[load5], %[step1_3], %[step1_4] \n\t"
+ "sub %[load5], %[load5], %[step1_12] \n\t"
+ "add %[load6], %[step1_2], %[step1_5] \n\t"
+ "sub %[load6], %[load6], %[step1_13] \n\t"
+ "sh %[load5], 384(%[output]) \n\t"
+ "sh %[load6], 416(%[output]) \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6)
+ : [output] "r" (output),
+ [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+ [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+ [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+ [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)
+ );
+
+ input += 16;
+ output += 1;
+ }
+}
+
+static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int i;
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int step1_8, step1_9, step1_10, step1_11;
+ int step1_12, step1_13, step1_14, step1_15;
+ int step2_0, step2_1, step2_2, step2_3;
+ int step2_8, step2_9, step2_10, step2_11;
+ int step2_12, step2_13, step2_14, step2_15;
+ int load1, load2, load3, load4, load5, load6, load7, load8;
+ int result1, result2, result3, result4;
+ const int const_2_power_13 = 8192;
+ uint8_t *dest_pix;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ /* prefetch vp9_ff_cropTbl */
+ vp9_prefetch_load(vp9_ff_cropTbl);
+ vp9_prefetch_load(vp9_ff_cropTbl + 32);
+ vp9_prefetch_load(vp9_ff_cropTbl + 64);
+ vp9_prefetch_load(vp9_ff_cropTbl + 96);
+ vp9_prefetch_load(vp9_ff_cropTbl + 128);
+ vp9_prefetch_load(vp9_ff_cropTbl + 160);
+ vp9_prefetch_load(vp9_ff_cropTbl + 192);
+ vp9_prefetch_load(vp9_ff_cropTbl + 224);
+
+ for (i = 0; i < 16; ++i) {
+ dest_pix = (dest + i);
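+ /* process column i: dest_pix walks down one pixel column of dest */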
+ __asm__ __volatile__ (
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 16(%[input]) \n\t"
+ "lh %[load3], 8(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[step2_0], $ac1, 31 \n\t"
+ "extp %[step2_1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[step2_2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[step2_3], $ac1, 31 \n\t"
+
+ "add %[step1_0], %[step2_0], %[step2_3] \n\t"
+ "add %[step1_1], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
+ [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
+ [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+ [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load5], 2(%[input]) \n\t"
+ "lh %[load6], 30(%[input]) \n\t"
+ "lh %[load7], 18(%[input]) \n\t"
+ "lh %[load8], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_2_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_14_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_18_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_14_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_2_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_30_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "sub %[load5], %[result1], %[result2] \n\t"
+ "sub %[load6], %[result4], %[result3] \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load6], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load5], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load5], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_8_64] \n\t"
+
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[result1], %[result2] \n\t"
+ "add %[step2_15], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [load7] "=&r" (load7), [load8] "=&r" (load8),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
+ [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+ [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 22(%[input]) \n\t"
+ "lh %[load3], 26(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_26_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_22_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac2, %[load4], %[cospi_6_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[result2], %[result1] \n\t"
+ "sub %[load2], %[result4], %[result3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[result1], %[result2] \n\t"
+ "add %[step2_12], %[result4], %[result3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+ [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+ [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load5], 4(%[input]) \n\t"
+ "lh %[load6], 28(%[input]) \n\t"
+ "lh %[load7], 20(%[input]) \n\t"
+ "lh %[load8], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_4_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_12_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_20_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_12_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_4_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_28_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[result4], %[result3] \n\t"
+ "sub %[load5], %[load5], %[result1] \n\t"
+ "add %[load5], %[load5], %[result2] \n\t"
+
+ "sub %[load6], %[result1], %[result2] \n\t"
+ "sub %[load6], %[load6], %[result3] \n\t"
+ "add %[load6], %[load6], %[result4] \n\t"
+
+ "madd $ac1, %[load5], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+
+ "add %[step1_4], %[result1], %[result2] \n\t"
+ "add %[step1_7], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [load7] "=&r" (load7), [load8] "=&r" (load8),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+ [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "sub %[load5], %[step2_14], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+
+ "madd $ac0, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_14], %[step2_13] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "add %[load6], %[load6], %[step2_9] \n\t"
+
+ "madd $ac1, %[load6], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[step2_15], %[step2_12] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "add %[load5], %[load5], %[step2_11] \n\t"
+
+ "madd $ac2, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_15], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_11] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_10], $ac0, 31 \n\t"
+ "extp %[step1_13], $ac1, 31 \n\t"
+ "extp %[step1_11], $ac2, 31 \n\t"
+ "extp %[step1_12], $ac3, 31 \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
+ [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
+ [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+ [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
+ [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ step1_8 = step2_8 + step2_11;
+ step1_9 = step2_9 + step2_10;
+ step1_14 = step2_13 + step2_14;
+ step1_15 = step2_12 + step2_15;
+
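+ /* final stage: each output is rounded with (x + 32) >> 6, added to the
+ * predicted pixel loaded from dest, and clamped to [0, 255] through the
+ * vp9_ff_cropTbl lookup (lbux) rather than with explicit branches */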
+ __asm__ __volatile__ (
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_0], %[step1_7] \n\t"
+ "add %[load5], %[load5], %[step1_15] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_1], %[step1_6] \n\t"
+ "add %[load6], %[load6], %[step1_14] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_2], %[step1_5] \n\t"
+ "add %[load5], %[load5], %[step1_13] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_3], %[step1_4] \n\t"
+ "add %[load6], %[load6], %[step1_12] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_3], %[step1_4] \n\t"
+ "add %[load5], %[load5], %[step1_11] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_2], %[step1_5] \n\t"
+ "add %[load6], %[load6], %[step1_10] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "sub %[load5], %[step1_1], %[step1_6] \n\t"
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[load5], %[step1_9] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_0], %[step1_7] \n\t"
+ "add %[load6], %[load6], %[step1_8] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_0], %[step1_7] \n\t"
+ "sub %[load5], %[load5], %[step1_8] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_1], %[step1_6] \n\t"
+ "sub %[load6], %[load6], %[step1_9] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_2], %[step1_5] \n\t"
+ "sub %[load5], %[load5], %[step1_10] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_3], %[step1_4] \n\t"
+ "sub %[load6], %[load6], %[step1_11] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_3], %[step1_4] \n\t"
+ "sub %[load5], %[load5], %[step1_12] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_2], %[step1_5] \n\t"
+ "sub %[load6], %[load6], %[step1_13] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_1], %[step1_6] \n\t"
+ "sub %[load5], %[load5], %[step1_14] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_0], %[step1_7] \n\t"
+ "sub %[load6], %[load6], %[step1_15] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7),
+ [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+ [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+ [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+ [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+ [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
+ [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+ [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
+ [step1_14] "r" (step1_14), [step1_15] "r" (step1_15)
+ );
+
+ input += 16;
+ }
+}
+
+void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ uint32_t pos = 45;
+
+ /* bit position for extract from acc: pos = 45 makes "extp ..., 31"
+ * return accumulator bits [45:14], i.e. a right shift by
+ * DCT_CONST_BITS (14) */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // First transform rows (the row pass stores its results transposed,
+ // so the column pass below can read them sequentially)
+ idct16_1d_rows_dspr2(input, out, 16);
+
+ // Then transform columns and add to dest
+ idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
+static void iadst16_1d(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+
+ int x0 = input[15];
+ int x1 = input[0];
+ int x2 = input[13];
+ int x3 = input[2];
+ int x4 = input[11];
+ int x5 = input[4];
+ int x6 = input[9];
+ int x7 = input[6];
+ int x8 = input[7];
+ int x9 = input[8];
+ int x10 = input[5];
+ int x11 = input[10];
+ int x12 = input[3];
+ int x13 = input[12];
+ int x14 = input[1];
+ int x15 = input[14];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+ | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+ output[0] = output[1] = output[2] = output[3] = output[4]
+ = output[5] = output[6] = output[7] = output[8]
+ = output[9] = output[10] = output[11] = output[12]
+ = output[13] = output[14] = output[15] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+ x0 = dct_const_round_shift(s0 + s8);
+ x1 = dct_const_round_shift(s1 + s9);
+ x2 = dct_const_round_shift(s2 + s10);
+ x3 = dct_const_round_shift(s3 + s11);
+ x4 = dct_const_round_shift(s4 + s12);
+ x5 = dct_const_round_shift(s5 + s13);
+ x6 = dct_const_round_shift(s6 + s14);
+ x7 = dct_const_round_shift(s7 + s15);
+ x8 = dct_const_round_shift(s0 - s8);
+ x9 = dct_const_round_shift(s1 - s9);
+ x10 = dct_const_round_shift(s2 - s10);
+ x11 = dct_const_round_shift(s3 - s11);
+ x12 = dct_const_round_shift(s4 - s12);
+ x13 = dct_const_round_shift(s5 - s13);
+ x14 = dct_const_round_shift(s6 - s14);
+ x15 = dct_const_round_shift(s7 - s15);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = s0 + s4;
+ x1 = s1 + s5;
+ x2 = s2 + s6;
+ x3 = s3 + s7;
+ x4 = s0 - s4;
+ x5 = s1 - s5;
+ x6 = s2 - s6;
+ x7 = s3 - s7;
+ x8 = dct_const_round_shift(s8 + s12);
+ x9 = dct_const_round_shift(s9 + s13);
+ x10 = dct_const_round_shift(s10 + s14);
+ x11 = dct_const_round_shift(s11 + s15);
+ x12 = dct_const_round_shift(s8 - s12);
+ x13 = dct_const_round_shift(s9 - s13);
+ x14 = dct_const_round_shift(s10 - s14);
+ x15 = dct_const_round_shift(s11 - s15);
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+ x0 = s0 + s2;
+ x1 = s1 + s3;
+ x2 = s0 - s2;
+ x3 = s1 - s3;
+ x4 = dct_const_round_shift(s4 + s6);
+ x5 = dct_const_round_shift(s5 + s7);
+ x6 = dct_const_round_shift(s4 - s6);
+ x7 = dct_const_round_shift(s5 - s7);
+ x8 = s8 + s10;
+ x9 = s9 + s11;
+ x10 = s8 - s10;
+ x11 = s9 - s11;
+ x12 = dct_const_round_shift(s12 + s14);
+ x13 = dct_const_round_shift(s13 + s15);
+ x14 = dct_const_round_shift(s12 - s14);
+ x15 = dct_const_round_shift(s13 - s15);
+
+ // stage 4
+ s2 = (- cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (- x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (- x10 + x11);
+ s14 = (- cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
+
+ x2 = dct_const_round_shift(s2);
+ x3 = dct_const_round_shift(s3);
+ x6 = dct_const_round_shift(s6);
+ x7 = dct_const_round_shift(s7);
+ x10 = dct_const_round_shift(s10);
+ x11 = dct_const_round_shift(s11);
+ x14 = dct_const_round_shift(s14);
+ x15 = dct_const_round_shift(s15);
+
+ output[0] = x0;
+ output[1] = -x8;
+ output[2] = x12;
+ output[3] = -x4;
+ output[4] = x6;
+ output[5] = x14;
+ output[6] = x10;
+ output[7] = x2;
+ output[8] = x3;
+ output[9] = x11;
+ output[10] = x15;
+ output[11] = x7;
+ output[12] = x5;
+ output[13] = -x13;
+ output[14] = x9;
+ output[15] = -x1;
+}
+
+void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
+ int pitch, int tx_type) {
+ int i, j;
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ int16_t *outptr = out;
+ int16_t temp_out[16];
+ uint32_t pos = 45;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ switch (tx_type) {
+ case DCT_DCT: // DCT in both horizontal and vertical
+ idct16_1d_rows_dspr2(input, outptr, 16);
+ idct16_1d_cols_add_blk_dspr2(out, dest, pitch);
+ break;
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ idct16_1d_rows_dspr2(input, outptr, 16);
+
+ outptr = out;
+
+ for (i = 0; i < 16; ++i) {
+ iadst16_1d(outptr, temp_out);
+
+ for (j = 0; j < 16; ++j)
+ dest[j * pitch + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * pitch + i]);
+ outptr += 16;
+ }
+ break;
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ {
+ int16_t temp_in[16 * 16];
+
+ for (i = 0; i < 16; ++i) {
+ /* prefetch row */
+ vp9_prefetch_load((const uint8_t *)(input + 16));
+
+ iadst16_1d(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ for (i = 0; i < 16; ++i)
+ for (j = 0; j < 16; ++j)
+ temp_in[j * 16 + i] = out[i * 16 + j];
+
+ idct16_1d_cols_add_blk_dspr2(temp_in, dest, pitch);
+ }
+ break;
+ case ADST_ADST: // ADST in both directions
+ {
+ int16_t temp_in[16];
+
+ for (i = 0; i < 16; ++i) {
+ /* prefetch row */
+ vp9_prefetch_load((const uint8_t *)(input + 16));
+
+ iadst16_1d(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ iadst16_1d(temp_in, temp_out);
+ for (j = 0; j < 16; ++j)
+ dest[j * pitch + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * pitch + i]);
+ }
+ }
+ break;
+ default:
+ printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n");
+ break;
+ }
+}
+
+void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ int16_t *outptr = out;
+ uint32_t i;
+ uint32_t pos = 45;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // First transform rows. Since all non-zero dct coefficients are in the
+ // upper-left 4x4 area, only the first 4 rows need to be calculated.
+ idct16_1d_rows_dspr2(input, outptr, 4);
+
+ outptr += 4;
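+ // Zero out elements 4..15 of every row of the 16x16 intermediate
+ // array: each "sw" clears a pair of int16 values in one row (the row
+ // stride is 32 bytes) and each loop iteration clears two columns.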
+ for (i = 0; i < 6; ++i) {
+ __asm__ __volatile__ (
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 64(%[outptr]) \n\t"
+ "sw $zero, 96(%[outptr]) \n\t"
+ "sw $zero, 128(%[outptr]) \n\t"
+ "sw $zero, 160(%[outptr]) \n\t"
+ "sw $zero, 192(%[outptr]) \n\t"
+ "sw $zero, 224(%[outptr]) \n\t"
+ "sw $zero, 256(%[outptr]) \n\t"
+ "sw $zero, 288(%[outptr]) \n\t"
+ "sw $zero, 320(%[outptr]) \n\t"
+ "sw $zero, 352(%[outptr]) \n\t"
+ "sw $zero, 384(%[outptr]) \n\t"
+ "sw $zero, 416(%[outptr]) \n\t"
+ "sw $zero, 448(%[outptr]) \n\t"
+ "sw $zero, 480(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r" (outptr)
+ );
+
+ outptr += 2;
+ }
+
+ // Then transform columns
+ idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
+void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ uint32_t pos = 45;
+ int32_t out;
+ int32_t r;
+ int32_t a1, absa1;
+ int32_t vector_a1;
+ int32_t t1, t2, t3, t4;
+ int32_t vector_1, vector_2, vector_3, vector_4;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r" (pos)
+ );
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
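+ /* a1 = ROUND_POWER_OF_TWO(out, 6): the DC value added to all 256 pixels */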
+ __asm__ __volatile__ (
+ "addi %[out], %[out], 32 \n\t"
+ "sra %[a1], %[out], 6 \n\t"
+
+ : [out] "+r" (out), [a1] "=r" (a1)
+ :
+ );
+
+ if (a1 < 0) {
+ /* use quad-byte operations;
+ * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
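+ /* |a1| is now broadcast into all four byte lanes; each row below is
+ * updated four pixels per word with saturating per-byte subtracts
+ * (subu_s.qb), so no explicit clamping is needed */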
+ for (r = 16; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ } else {
+ /* use quad-byte operations;
+ * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 16; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
new file mode 100644
index 000000000..5e92db3d2
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
@@ -0,0 +1,1073 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
+ int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
+ int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19;
+ int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26;
+ int16_t step1_27, step1_28, step1_29, step1_30, step1_31;
+ int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
+ int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
+ int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
+ int16_t step2_28, step2_29, step2_30, step2_31;
+ int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
+ int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
+ int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27;
+ int16_t step3_28, step3_29, step3_30, step3_31;
+ int temp0, temp1, temp2, temp3;
+ int load1, load2, load3, load4;
+ int result1, result2;
+ int i, temp21;
+ uint8_t *dest_pix, *dest_pix1;
+ const int const_2_power_13 = 8192;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ /* prefetch vp9_ff_cropTbl */
+ vp9_prefetch_load(vp9_ff_cropTbl);
+ vp9_prefetch_load(vp9_ff_cropTbl + 32);
+ vp9_prefetch_load(vp9_ff_cropTbl + 64);
+ vp9_prefetch_load(vp9_ff_cropTbl + 96);
+ vp9_prefetch_load(vp9_ff_cropTbl + 128);
+ vp9_prefetch_load(vp9_ff_cropTbl + 160);
+ vp9_prefetch_load(vp9_ff_cropTbl + 192);
+ vp9_prefetch_load(vp9_ff_cropTbl + 224);
+
+ for (i = 0; i < 32; ++i) {
+ dest_pix = dest + i;
+ dest_pix1 = dest + i + 31 * dest_stride;
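+ /* dest_pix walks down from row 0 and dest_pix1 walks up from row 31;
+ * the 32-point butterfly produces its outputs in symmetric pairs, so
+ * the two halves of the column can be written from both ends */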
+
+ __asm__ __volatile__ (
+ "lh %[load1], 2(%[input]) \n\t"
+ "lh %[load2], 62(%[input]) \n\t"
+ "lh %[load3], 34(%[input]) \n\t"
+ "lh %[load4], 30(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_31_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_1_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_1_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_31_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_15_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_17_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_17_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_15_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_17], $ac1, 31 \n\t"
+ "extp %[step1_30], $ac3, 31 \n\t"
+ "add %[step1_16], %[temp0], %[temp1] \n\t"
+ "add %[step1_31], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),
+ [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),
+ [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 18(%[input]) \n\t"
+ "lh %[load2], 46(%[input]) \n\t"
+ "lh %[load3], 50(%[input]) \n\t"
+ "lh %[load4], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_23_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_9_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_9_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_23_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_7_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_25_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_25_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_7_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_18], $ac1, 31 \n\t"
+ "extp %[step1_29], $ac3, 31 \n\t"
+ "add %[step1_19], %[temp0], %[temp1] \n\t"
+ "add %[step1_28], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),
+ [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),
+ [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 54(%[input]) \n\t"
+ "lh %[load3], 42(%[input]) \n\t"
+ "lh %[load4], 22(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_27_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_5_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_5_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_27_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_11_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_21_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_21_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_11_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+
+ "madd $ac1, %[load2], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_12_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_20_64] \n\t"
+
+ "extp %[step1_21], $ac1, 31 \n\t"
+ "extp %[step1_26], $ac3, 31 \n\t"
+ "add %[step1_20], %[temp0], %[temp1] \n\t"
+ "add %[step1_27], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),
+ [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),
+ [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),
+ [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 26(%[input]) \n\t"
+ "lh %[load2], 38(%[input]) \n\t"
+ "lh %[load3], 58(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_19_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_13_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_13_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_19_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_3_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_29_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_29_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_3_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+ "msub $ac1, %[load1], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_20_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_12_64] \n\t"
+ "extp %[step1_22], $ac1, 31 \n\t"
+ "extp %[step1_25], $ac3, 31 \n\t"
+ "add %[step1_23], %[temp0], %[temp1] \n\t"
+ "add %[step1_24], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),
+ [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),
+ [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),
+ [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 4(%[input]) \n\t"
+ "lh %[load2], 60(%[input]) \n\t"
+ "lh %[load3], 36(%[input]) \n\t"
+ "lh %[load4], 28(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_2_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_2_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_30_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_14_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_18_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_14_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+ "msub $ac1, %[load1], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_8_64] \n\t"
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[temp0], %[temp1] \n\t"
+ "add %[step2_15], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),
+ [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+ [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 20(%[input]) \n\t"
+ "lh %[load2], 44(%[input]) \n\t"
+ "lh %[load3], 52(%[input]) \n\t"
+ "lh %[load4], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_22_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_26_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_6_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[temp0], %[temp1] \n\t"
+ "add %[step2_12], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+ [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+ [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+ );
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "sub %[temp0], %[step2_14], %[step2_13] \n\t"
+ "sub %[temp0], %[temp0], %[step2_9] \n\t"
+ "add %[temp0], %[temp0], %[step2_10] \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp1], %[step2_14], %[step2_13] \n\t"
+ "add %[temp1], %[temp1], %[step2_9] \n\t"
+ "sub %[temp1], %[temp1], %[step2_10] \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "sub %[temp0], %[step2_15], %[step2_12] \n\t"
+ "sub %[temp0], %[temp0], %[step2_8] \n\t"
+ "add %[temp0], %[temp0], %[step2_11] \n\t"
+ "madd $ac2, %[temp0], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sub %[temp1], %[step2_15], %[step2_12] \n\t"
+ "add %[temp1], %[temp1], %[step2_8] \n\t"
+ "sub %[temp1], %[temp1], %[step2_11] \n\t"
+ "madd $ac3, %[temp1], %[cospi_16_64] \n\t"
+
+ "add %[step3_8], %[step2_8], %[step2_11] \n\t"
+ "add %[step3_9], %[step2_9], %[step2_10] \n\t"
+ "add %[step3_14], %[step2_13], %[step2_14] \n\t"
+ "add %[step3_15], %[step2_12], %[step2_15] \n\t"
+ "extp %[step3_10], $ac0, 31 \n\t"
+ "extp %[step3_13], $ac1, 31 \n\t"
+ "extp %[step3_11], $ac2, 31 \n\t"
+ "extp %[step3_12], $ac3, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),
+ [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),
+ [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),
+ [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8),
+ [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+ [step2_11] "r" (step2_11), [step2_12] "r" (step2_12),
+ [step2_13] "r" (step2_13), [step2_14] "r" (step2_14),
+ [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64)
+ );
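+
+ /* Illustration only: the four accumulators above evaluate, in parallel,
+  *
+  *   step3_10 = dct_const_round_shift(
+  *       (step2_14 - step2_13 - step2_9 + step2_10) * cospi_16_64);
+  *   step3_13 = dct_const_round_shift(
+  *       (step2_14 - step2_13 + step2_9 - step2_10) * cospi_16_64);
+  *   step3_11 = dct_const_round_shift(
+  *       (step2_15 - step2_12 - step2_8 + step2_11) * cospi_16_64);
+  *   step3_12 = dct_const_round_shift(
+  *       (step2_15 - step2_12 + step2_8 - step2_11) * cospi_16_64);
+  */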
+
+ step2_18 = step1_17 - step1_18;
+ step2_29 = step1_30 - step1_29;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
+ "extp %[step3_18], $ac0, 31 \n\t"
+
+ : [step3_18] "=r" (step3_18)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
+ step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
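+
+ /* Note: this asm/C pairing (repeated below) is one rotation written two
+  * ways. The accumulator path folds the rounding constant in with mtlo,
+  * while the scalar path spells it out; the pair reduces to
+  *
+  *   step3_18 = dct_const_round_shift(step2_29 * cospi_24_64 -
+  *                                    step2_18 * cospi_8_64);
+  *   step3_29 = dct_const_round_shift(step2_29 * cospi_8_64 +
+  *                                    step2_18 * cospi_24_64);
+  */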
+
+ step2_19 = step1_16 - step1_19;
+ step2_28 = step1_31 - step1_28;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
+ "extp %[step3_19], $ac0, 31 \n\t"
+
+ : [step3_19] "=r" (step3_19)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
+ step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_16 = step1_16 + step1_19;
+ step3_17 = step1_17 + step1_18;
+ step3_30 = step1_29 + step1_30;
+ step3_31 = step1_28 + step1_31;
+
+ step2_20 = step1_23 - step1_20;
+ step2_27 = step1_24 - step1_27;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
+ "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
+ "extp %[step3_20], $ac0, 31 \n\t"
+
+ : [step3_20] "=r" (step3_20)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
+ step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_21 = step1_22 - step1_21;
+ step2_26 = step1_25 - step1_26;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
+ "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
+ "extp %[step3_21], $ac1, 31 \n\t"
+
+ : [step3_21] "=r" (step3_21)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
+ step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_22 = step1_21 + step1_22;
+ step3_23 = step1_20 + step1_23;
+ step3_24 = step1_24 + step1_27;
+ step3_25 = step1_25 + step1_26;
+
+ step2_16 = step3_16 + step3_23;
+ step2_17 = step3_17 + step3_22;
+ step2_18 = step3_18 + step3_21;
+ step2_19 = step3_19 + step3_20;
+ step2_20 = step3_19 - step3_20;
+ step2_21 = step3_18 - step3_21;
+ step2_22 = step3_17 - step3_22;
+ step2_23 = step3_16 - step3_23;
+
+ step2_24 = step3_31 - step3_24;
+ step2_25 = step3_30 - step3_25;
+ step2_26 = step3_29 - step3_26;
+ step2_27 = step3_28 - step3_27;
+ step2_28 = step3_28 + step3_27;
+ step2_29 = step3_29 + step3_26;
+ step2_30 = step3_30 + step3_25;
+ step2_31 = step3_31 + step3_24;
+
+ __asm__ __volatile__ (
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 32(%[input]) \n\t"
+ "lh %[load3], 16(%[input]) \n\t"
+ "lh %[load4], 48(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[temp2], $ac3, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[temp3], $ac1, 31 \n\t"
+ "add %[step1_0], %[temp0], %[temp3] \n\t"
+ "add %[step1_1], %[temp1], %[temp2] \n\t"
+ "sub %[step1_2], %[temp1], %[temp2] \n\t"
+ "sub %[step1_3], %[temp0], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+ [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 8(%[input]) \n\t"
+ "lh %[load2], 56(%[input]) \n\t"
+ "lh %[load3], 40(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_12_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_20_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_12_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load1], %[load1], %[temp0] \n\t"
+ "add %[load1], %[load1], %[temp1] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[load2], %[temp2] \n\t"
+ "add %[load2], %[load2], %[temp3] \n\t"
+ "madd $ac1, %[load1], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[temp0], %[temp1] \n\t"
+ "add %[step1_7], %[temp3], %[temp2] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+ [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ step2_0 = step1_0 + step1_7;
+ step2_1 = step1_1 + step1_6;
+ step2_2 = step1_2 + step1_5;
+ step2_3 = step1_3 + step1_4;
+ step2_4 = step1_3 - step1_4;
+ step2_5 = step1_2 - step1_5;
+ step2_6 = step1_1 - step1_6;
+ step2_7 = step1_0 - step1_7;
+
+ // stage 7
+ step1_0 = step2_0 + step3_15;
+ step1_1 = step2_1 + step3_14;
+ step1_2 = step2_2 + step3_13;
+ step1_3 = step2_3 + step3_12;
+ step1_4 = step2_4 + step3_11;
+ step1_5 = step2_5 + step3_10;
+ step1_6 = step2_6 + step3_9;
+ step1_7 = step2_7 + step3_8;
+ step1_8 = step2_7 - step3_8;
+ step1_9 = step2_6 - step3_9;
+ step1_10 = step2_5 - step3_10;
+ step1_11 = step2_4 - step3_11;
+ step1_12 = step2_3 - step3_12;
+ step1_13 = step2_2 - step3_13;
+ step1_14 = step2_1 - step3_14;
+ step1_15 = step2_0 - step3_15;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_27], %[step2_20] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_20], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20),
+ [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_20 + step2_27) * cospi_16_64;
+ step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_26], %[step2_21] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_21], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26),
+ [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_21 + step2_26) * cospi_16_64;
+ step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_25], %[step2_22] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_22], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25),
+ [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_22 + step2_25) * cospi_16_64;
+ step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_24], %[step2_23] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_23], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24),
+ [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_23 + step2_24) * cospi_16_64;
+ step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
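+
+ /* Illustration only: each asm/C pair above is the cospi_16_64
+  * "difference/average" rotation of this stage, e.g. for the first pair:
+  *
+  *   step1_20 = dct_const_round_shift((step2_27 - step2_20) * cospi_16_64);
+  *   step1_27 = dct_const_round_shift((step2_27 + step2_20) * cospi_16_64);
+  */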
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_0], %[step2_31] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_1], %[step2_30] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_2], %[step2_29] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_3], %[step2_28] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+ [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+ [step2_28] "r" (step2_28), [step2_29] "r" (step2_29),
+ [step2_30] "r" (step2_30), [step2_31] "r" (step2_31)
+ );
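+
+ /* Illustration only: each store above is the usual reconstruction add.
+  * cm points at a byte-clamping lookup table, so lbux performs the clip;
+  * in plain C the first store is
+  *
+  *   dest[0] = clip_pixel(dest[0] +
+  *                        ROUND_POWER_OF_TWO(step1_0 + step2_31, 6));
+  *
+  * with ROUND_POWER_OF_TWO(x, 6) matching the addi 32 / sra 6 sequence. */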
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+ [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+ );
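+
+ /* Note: step3_12..step3_15 are reused here as scratch for the
+  * pre-rounded bottom-half differences, and dest_pix1 (set up earlier in
+  * the function, at the last row of the column) walks upward via subu, so
+  * the mirrored first store is equivalent to
+  *
+  *   dest[31 * dest_stride] = clip_pixel(dest[31 * dest_stride] +
+  *       ROUND_POWER_OF_TWO(step1_0 - step2_31, 6));
+  */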
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_4], %[step1_27] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_5], %[step1_26] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_6], %[step1_25] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_7], %[step1_24] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+ [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+ [step1_24] "r" (step1_24), [step1_25] "r" (step1_25),
+ [step1_26] "r" (step1_26), [step1_27] "r" (step1_27)
+ );
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+ [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+ );
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_8], %[step1_23] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_9], %[step1_22] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_10], %[step1_21] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_11], %[step1_20] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
+ [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+ [step1_20] "r" (step1_20), [step1_21] "r" (step1_21),
+ [step1_22] "r" (step1_22), [step1_23] "r" (step1_23)
+ );
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+ [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+ );
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_12], %[step2_19] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_13], %[step2_18] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_14], %[step2_17] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_15], %[step2_16] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
+ [step1_14] "r" (step1_14), [step1_15] "r" (step1_15),
+ [step2_16] "r" (step2_16), [step2_17] "r" (step2_17),
+ [step2_18] "r" (step2_18), [step2_19] "r" (step2_19)
+ );
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+ [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+ );
+
+ input += 32;
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
new file mode 100644
index 000000000..d3aee73cb
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
@@ -0,0 +1,1013 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) {
+ int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
+ int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
+ int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
+ int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
+ int16_t step1_28, step1_29, step1_30, step1_31;
+ int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
+ int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
+ int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
+ int16_t step2_28, step2_29, step2_30, step2_31;
+ int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
+ int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
+ int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
+ int16_t step3_29, step3_30, step3_31;
+ int temp0, temp1, temp2, temp3;
+ int load1, load2, load3, load4;
+ int result1, result2;
+ int temp21;
+ int i;
+ const int const_2_power_13 = 8192;
+ const int32_t *input_int;
+
+ for (i = 32; i--; ) {
+ input_int = (const int32_t *)input;
+
+ if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] |
+ input_int[4] | input_int[5] | input_int[6] | input_int[7] |
+ input_int[8] | input_int[9] | input_int[10] | input_int[11] |
+ input_int[12] | input_int[13] | input_int[14] | input_int[15])) {
+ input += 32;
+
+ __asm__ __volatile__ (
+ "sh $zero, 0(%[output]) \n\t"
+ "sh $zero, 64(%[output]) \n\t"
+ "sh $zero, 128(%[output]) \n\t"
+ "sh $zero, 192(%[output]) \n\t"
+ "sh $zero, 256(%[output]) \n\t"
+ "sh $zero, 320(%[output]) \n\t"
+ "sh $zero, 384(%[output]) \n\t"
+ "sh $zero, 448(%[output]) \n\t"
+ "sh $zero, 512(%[output]) \n\t"
+ "sh $zero, 576(%[output]) \n\t"
+ "sh $zero, 640(%[output]) \n\t"
+ "sh $zero, 704(%[output]) \n\t"
+ "sh $zero, 768(%[output]) \n\t"
+ "sh $zero, 832(%[output]) \n\t"
+ "sh $zero, 896(%[output]) \n\t"
+ "sh $zero, 960(%[output]) \n\t"
+ "sh $zero, 1024(%[output]) \n\t"
+ "sh $zero, 1088(%[output]) \n\t"
+ "sh $zero, 1152(%[output]) \n\t"
+ "sh $zero, 1216(%[output]) \n\t"
+ "sh $zero, 1280(%[output]) \n\t"
+ "sh $zero, 1344(%[output]) \n\t"
+ "sh $zero, 1408(%[output]) \n\t"
+ "sh $zero, 1472(%[output]) \n\t"
+ "sh $zero, 1536(%[output]) \n\t"
+ "sh $zero, 1600(%[output]) \n\t"
+ "sh $zero, 1664(%[output]) \n\t"
+ "sh $zero, 1728(%[output]) \n\t"
+ "sh $zero, 1792(%[output]) \n\t"
+ "sh $zero, 1856(%[output]) \n\t"
+ "sh $zero, 1920(%[output]) \n\t"
+ "sh $zero, 1984(%[output]) \n\t"
+
+ :
+ : [output] "r" (output)
+ );
+
+ output += 1;
+
+ continue;
+ }
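+
+ /* Illustration only: the early-out above is a scalar loop in disguise.
+  * The sixteen 32-bit loads cover all 32 coefficients of the row, and
+  * the stores (byte offsets 0, 64, ..., 1984) zero one transposed output
+  * column:
+  *
+  *   int j, all_zero = 1;
+  *   for (j = 0; j < 32; ++j) all_zero &= (input[j] == 0);
+  *   if (all_zero) {
+  *     for (j = 0; j < 32; ++j) output[j * 32] = 0;
+  *     input += 32; output += 1; continue;
+  *   }
+  */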
+
+ /* prefetch row */
+ vp9_prefetch_load((const uint8_t *)(input + 32));
+ vp9_prefetch_load((const uint8_t *)(input + 48));
+
+ __asm__ __volatile__ (
+ "lh %[load1], 2(%[input]) \n\t"
+ "lh %[load2], 62(%[input]) \n\t"
+ "lh %[load3], 34(%[input]) \n\t"
+ "lh %[load4], 30(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_31_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_1_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_1_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_31_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_15_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_17_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_17_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_15_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_17], $ac1, 31 \n\t"
+ "extp %[step1_30], $ac3, 31 \n\t"
+ "add %[step1_16], %[temp0], %[temp1] \n\t"
+ "add %[step1_31], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),
+ [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),
+ [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 18(%[input]) \n\t"
+ "lh %[load2], 46(%[input]) \n\t"
+ "lh %[load3], 50(%[input]) \n\t"
+ "lh %[load4], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_23_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_9_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_9_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_23_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_7_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_25_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_25_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_7_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_18], $ac1, 31 \n\t"
+ "extp %[step1_29], $ac3, 31 \n\t"
+ "add %[step1_19], %[temp0], %[temp1] \n\t"
+ "add %[step1_28], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),
+ [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),
+ [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 54(%[input]) \n\t"
+ "lh %[load3], 42(%[input]) \n\t"
+ "lh %[load4], 22(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_27_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_5_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_5_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_27_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_11_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_21_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_21_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_11_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+
+ "madd $ac1, %[load2], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_12_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_20_64] \n\t"
+
+ "extp %[step1_21], $ac1, 31 \n\t"
+ "extp %[step1_26], $ac3, 31 \n\t"
+ "add %[step1_20], %[temp0], %[temp1] \n\t"
+ "add %[step1_27], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),
+ [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),
+ [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),
+ [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 26(%[input]) \n\t"
+ "lh %[load2], 38(%[input]) \n\t"
+ "lh %[load3], 58(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_19_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_13_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_13_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_19_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_3_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_29_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_29_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_3_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_20_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_12_64] \n\t"
+
+ "extp %[step1_22], $ac1, 31 \n\t"
+ "extp %[step1_25], $ac3, 31 \n\t"
+ "add %[step1_23], %[temp0], %[temp1] \n\t"
+ "add %[step1_24], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),
+ [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),
+ [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),
+ [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 4(%[input]) \n\t"
+ "lh %[load2], 60(%[input]) \n\t"
+ "lh %[load3], 36(%[input]) \n\t"
+ "lh %[load4], 28(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_2_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_2_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_30_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_14_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_18_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_14_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_8_64] \n\t"
+
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[temp0], %[temp1] \n\t"
+ "add %[step2_15], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),
+ [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+ [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 20(%[input]) \n\t"
+ "lh %[load2], 44(%[input]) \n\t"
+ "lh %[load3], 52(%[input]) \n\t"
+ "lh %[load4], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_22_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_26_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_6_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[temp0], %[temp1] \n\t"
+ "add %[step2_12], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+ [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+ [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+ );
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "sub %[temp0], %[step2_14], %[step2_13] \n\t"
+ "sub %[temp0], %[temp0], %[step2_9] \n\t"
+ "add %[temp0], %[temp0], %[step2_10] \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp1], %[step2_14], %[step2_13] \n\t"
+ "add %[temp1], %[temp1], %[step2_9] \n\t"
+ "sub %[temp1], %[temp1], %[step2_10] \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "sub %[temp0], %[step2_15], %[step2_12] \n\t"
+ "sub %[temp0], %[temp0], %[step2_8] \n\t"
+ "add %[temp0], %[temp0], %[step2_11] \n\t"
+ "madd $ac2, %[temp0], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sub %[temp1], %[step2_15], %[step2_12] \n\t"
+ "add %[temp1], %[temp1], %[step2_8] \n\t"
+ "sub %[temp1], %[temp1], %[step2_11] \n\t"
+ "madd $ac3, %[temp1], %[cospi_16_64] \n\t"
+
+ "add %[step3_8], %[step2_8], %[step2_11] \n\t"
+ "add %[step3_9], %[step2_9], %[step2_10] \n\t"
+ "add %[step3_14], %[step2_13], %[step2_14] \n\t"
+ "add %[step3_15], %[step2_12], %[step2_15] \n\t"
+
+ "extp %[step3_10], $ac0, 31 \n\t"
+ "extp %[step3_13], $ac1, 31 \n\t"
+ "extp %[step3_11], $ac2, 31 \n\t"
+ "extp %[step3_12], $ac3, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),
+ [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),
+ [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),
+ [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
+ [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
+ [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
+ [step2_14] "r" (step2_14), [step2_15] "r" (step2_15),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ step2_18 = step1_17 - step1_18;
+ step2_29 = step1_30 - step1_29;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
+ "extp %[step3_18], $ac0, 31 \n\t"
+
+ : [step3_18] "=r" (step3_18)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
+ step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_19 = step1_16 - step1_19;
+ step2_28 = step1_31 - step1_28;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
+ "extp %[step3_19], $ac0, 31 \n\t"
+
+ : [step3_19] "=r" (step3_19)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
+ step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_16 = step1_16 + step1_19;
+ step3_17 = step1_17 + step1_18;
+ step3_30 = step1_29 + step1_30;
+ step3_31 = step1_28 + step1_31;
+
+ step2_20 = step1_23 - step1_20;
+ step2_27 = step1_24 - step1_27;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
+ "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
+ "extp %[step3_20], $ac0, 31 \n\t"
+
+ : [step3_20] "=r" (step3_20)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
+ step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_21 = step1_22 - step1_21;
+ step2_26 = step1_25 - step1_26;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
+ "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
+ "extp %[step3_21], $ac1, 31 \n\t"
+
+ : [step3_21] "=r" (step3_21)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
+ step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_22 = step1_21 + step1_22;
+ step3_23 = step1_20 + step1_23;
+ step3_24 = step1_24 + step1_27;
+ step3_25 = step1_25 + step1_26;
+
+ step2_16 = step3_16 + step3_23;
+ step2_17 = step3_17 + step3_22;
+ step2_18 = step3_18 + step3_21;
+ step2_19 = step3_19 + step3_20;
+ step2_20 = step3_19 - step3_20;
+ step2_21 = step3_18 - step3_21;
+ step2_22 = step3_17 - step3_22;
+ step2_23 = step3_16 - step3_23;
+
+ step2_24 = step3_31 - step3_24;
+ step2_25 = step3_30 - step3_25;
+ step2_26 = step3_29 - step3_26;
+ step2_27 = step3_28 - step3_27;
+ step2_28 = step3_28 + step3_27;
+ step2_29 = step3_29 + step3_26;
+ step2_30 = step3_30 + step3_25;
+ step2_31 = step3_31 + step3_24;
+
+ __asm__ __volatile__ (
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 32(%[input]) \n\t"
+ "lh %[load3], 16(%[input]) \n\t"
+ "lh %[load4], 48(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[temp2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[temp3], $ac1, 31 \n\t"
+
+ "add %[step1_0], %[temp0], %[temp3] \n\t"
+ "add %[step1_1], %[temp1], %[temp2] \n\t"
+ "sub %[step1_2], %[temp1], %[temp2] \n\t"
+ "sub %[step1_3], %[temp0], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+ [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_16_64] "r" (cospi_16_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 8(%[input]) \n\t"
+ "lh %[load2], 56(%[input]) \n\t"
+ "lh %[load3], 40(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_12_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_20_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_12_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load1], %[load1], %[temp0] \n\t"
+ "add %[load1], %[load1], %[temp1] \n\t"
+
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[load2], %[temp2] \n\t"
+ "add %[load2], %[load2], %[temp3] \n\t"
+
+ "madd $ac1, %[load1], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[temp0], %[temp1] \n\t"
+ "add %[step1_7], %[temp3], %[temp2] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+ [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ step2_0 = step1_0 + step1_7;
+ step2_1 = step1_1 + step1_6;
+ step2_2 = step1_2 + step1_5;
+ step2_3 = step1_3 + step1_4;
+ step2_4 = step1_3 - step1_4;
+ step2_5 = step1_2 - step1_5;
+ step2_6 = step1_1 - step1_6;
+ step2_7 = step1_0 - step1_7;
+
+ step1_0 = step2_0 + step3_15;
+ step1_1 = step2_1 + step3_14;
+ step1_2 = step2_2 + step3_13;
+ step1_3 = step2_3 + step3_12;
+ step1_4 = step2_4 + step3_11;
+ step1_5 = step2_5 + step3_10;
+ step1_6 = step2_6 + step3_9;
+ step1_7 = step2_7 + step3_8;
+ step1_8 = step2_7 - step3_8;
+ step1_9 = step2_6 - step3_9;
+ step1_10 = step2_5 - step3_10;
+ step1_11 = step2_4 - step3_11;
+ step1_12 = step2_3 - step3_12;
+ step1_13 = step2_2 - step3_13;
+ step1_14 = step2_1 - step3_14;
+ step1_15 = step2_0 - step3_15;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_27], %[step2_20] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_20], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_20 + step2_27) * cospi_16_64;
+ step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_26], %[step2_21] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_21], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_26] "r" (step2_26), [step2_21] "r" (step2_21),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_21 + step2_26) * cospi_16_64;
+ step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_25], %[step2_22] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_22], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_25] "r" (step2_25), [step2_22] "r" (step2_22),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_22 + step2_25) * cospi_16_64;
+ step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_24], %[step2_23] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_23], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_24] "r" (step2_24), [step2_23] "r" (step2_23),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_23 + step2_24) * cospi_16_64;
+ step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ // final stage
+ output[0 * 32] = step1_0 + step2_31;
+ output[1 * 32] = step1_1 + step2_30;
+ output[2 * 32] = step1_2 + step2_29;
+ output[3 * 32] = step1_3 + step2_28;
+ output[4 * 32] = step1_4 + step1_27;
+ output[5 * 32] = step1_5 + step1_26;
+ output[6 * 32] = step1_6 + step1_25;
+ output[7 * 32] = step1_7 + step1_24;
+ output[8 * 32] = step1_8 + step1_23;
+ output[9 * 32] = step1_9 + step1_22;
+ output[10 * 32] = step1_10 + step1_21;
+ output[11 * 32] = step1_11 + step1_20;
+ output[12 * 32] = step1_12 + step2_19;
+ output[13 * 32] = step1_13 + step2_18;
+ output[14 * 32] = step1_14 + step2_17;
+ output[15 * 32] = step1_15 + step2_16;
+ output[16 * 32] = step1_15 - step2_16;
+ output[17 * 32] = step1_14 - step2_17;
+ output[18 * 32] = step1_13 - step2_18;
+ output[19 * 32] = step1_12 - step2_19;
+ output[20 * 32] = step1_11 - step1_20;
+ output[21 * 32] = step1_10 - step1_21;
+ output[22 * 32] = step1_9 - step1_22;
+ output[23 * 32] = step1_8 - step1_23;
+ output[24 * 32] = step1_7 - step1_24;
+ output[25 * 32] = step1_6 - step1_25;
+ output[26 * 32] = step1_5 - step1_26;
+ output[27 * 32] = step1_4 - step1_27;
+ output[28 * 32] = step1_3 - step2_28;
+ output[29 * 32] = step1_2 - step2_29;
+ output[30 * 32] = step1_1 - step2_30;
+ output[31 * 32] = step1_0 - step2_31;
+
+ input += 32;
+ output += 1;
+ }
+}
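+
+/* Note that the row pass stores its results transposed: each result is
+   written with a stride of 32 int16_t while the output pointer advances one
+   element per input row, so the column pass can consume its input linearly. */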
+
+void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
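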
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // Rows
+ idct32_1d_rows_dspr2(input, outptr);
+
+ // Columns
+ vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+}
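+
+/* Structurally this is the separable 2-D inverse transform; a rough scalar
+   sketch of the same flow (assuming a 1-D idct32_1d() kernel and the
+   round/clip helpers from vp9_common.h) is:
+
+     for (i = 0; i < 32; ++i)                     // rows
+       idct32_1d(input + i * 32, out + i * 32);
+     for (i = 0; i < 32; ++i) {                   // columns, add to dest
+       for (j = 0; j < 32; ++j)
+         temp_in[j] = out[j * 32 + i];
+       idct32_1d(temp_in, temp_out);
+       for (j = 0; j < 32; ++j)
+         dest[j * stride + i] = clip_pixel(
+             ROUND_POWER_OF_TWO(temp_out[j], 6) + dest[j * stride + i]);
+     }
+
+   The DSPr2 version fuses the transpose into the row pass and the add/clip
+   into the column pass. */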
+
+void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ int r, out;
+ int32_t a1, absa1;
+ int32_t vector_a1;
+ int32_t t1, t2, t3, t4;
+ int32_t vector_1, vector_2, vector_3, vector_4;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r" (pos)
+ );
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__ (
+ "addi %[out], %[out], 32 \n\t"
+ "sra %[a1], %[out], 6 \n\t"
+
+ : [out] "+r" (out), [a1] "=r" (a1)
+ :
+ );
+
+ if (a1 < 0) {
+ /* use quad-byte
+ * input and output memory are four byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 32; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+
+ "lw %[t1], 16(%[dest]) \n\t"
+ "lw %[t2], 20(%[dest]) \n\t"
+ "lw %[t3], 24(%[dest]) \n\t"
+ "lw %[t4], 28(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 16(%[dest]) \n\t"
+ "sw %[vector_2], 20(%[dest]) \n\t"
+ "sw %[vector_3], 24(%[dest]) \n\t"
+ "sw %[vector_4], 28(%[dest]) \n\t"
+
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+ [dest] "+&r" (dest)
+ : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ } else {
+ /* use quad-byte
+ * input and output memory are four byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 32; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+
+ "lw %[t1], 16(%[dest]) \n\t"
+ "lw %[t2], 20(%[dest]) \n\t"
+ "lw %[t3], 24(%[dest]) \n\t"
+ "lw %[t4], 28(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 16(%[dest]) \n\t"
+ "sw %[vector_2], 20(%[dest]) \n\t"
+ "sw %[vector_3], 24(%[dest]) \n\t"
+ "sw %[vector_4], 28(%[dest]) \n\t"
+
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+ [dest] "+&r" (dest)
+ : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+}
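+
+/* A scalar reference for this DC-only case is simply (a sketch, using the
+   round/clip helpers from vp9_common.h):
+
+     a1 = ROUND_POWER_OF_TWO(out, 6);   // out = DC term after both 1-D passes
+     for (r = 0; r < 32; r++)
+       for (c = 0; c < 32; c++)
+         dest[r * stride + c] = clip_pixel(dest[r * stride + c] + a1);
+
+   The DSPr2 version broadcasts |a1| into all four bytes of a word with
+   replv.qb and applies saturating quad-byte adds (or subtracts when a1 is
+   negative), covering a full 32-pixel row per loop iteration. */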
+#endif // #if HAVE_DSPR2
diff --git a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
new file mode 100644
index 000000000..5b7aa5e71
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) {
+ int16_t step_0, step_1, step_2, step_3;
+ int Temp0, Temp1, Temp2, Temp3;
+ const int const_2_power_13 = 8192;
+ int i;
+
+ for (i = 4; i--; ) {
+ __asm__ __volatile__ (
+ /*
+ temp_1 = (input[0] + input[2]) * cospi_16_64;
+ step_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[2]) * cospi_16_64;
+ step_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 4(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "extp %[step_0], $ac0, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "extp %[step_1], $ac1, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ /*
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ step_2 = dct_const_round_shift(temp1);
+ */
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "extp %[step_2], $ac0, 31 \n\t"
+
+ /*
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step_3 = dct_const_round_shift(temp2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[step_3], $ac1, 31 \n\t"
+
+ /*
+ output[0] = step_0 + step_3;
+ output[4] = step_1 + step_2;
+ output[8] = step_1 - step_2;
+ output[12] = step_0 - step_3;
+ */
+ "add %[Temp0], %[step_0], %[step_3] \n\t"
+ "sh %[Temp0], 0(%[output]) \n\t"
+
+ "add %[Temp1], %[step_1], %[step_2] \n\t"
+ "sh %[Temp1], 8(%[output]) \n\t"
+
+ "sub %[Temp2], %[step_1], %[step_2] \n\t"
+ "sh %[Temp2], 16(%[output]) \n\t"
+
+ "sub %[Temp3], %[step_0], %[step_3] \n\t"
+ "sh %[Temp3], 24(%[output]) \n\t"
+
+ : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+ [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
+ [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
+ [output] "+r" (output)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
+ [cospi_24_64] "r" (cospi_24_64),
+ [input] "r" (input)
+ );
+
+ input += 4;
+ output += 1;
+ }
+}
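+
+/* Addressing note: int16_t coefficients are two bytes apart, so the
+   "lh 0/4(%[input])" pair above loads input[0]/input[2] (the even half) and
+   "lh 2/6(%[input])" loads input[1]/input[3] (the odd half).  The stores at
+   byte offsets 0/8/16/24 hit output[0]/output[4]/output[8]/output[12], i.e.
+   the row results are written transposed for the column pass below. */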
+
+static void vp9_idct4_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int16_t step_0, step_1, step_2, step_3;
+ int Temp0, Temp1, Temp2, Temp3;
+ const int const_2_power_13 = 8192;
+ int i;
+ uint8_t *dest_pix;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ /* prefetch vp9_ff_cropTbl */
+ vp9_prefetch_load(vp9_ff_cropTbl);
+ vp9_prefetch_load(vp9_ff_cropTbl + 32);
+ vp9_prefetch_load(vp9_ff_cropTbl + 64);
+ vp9_prefetch_load(vp9_ff_cropTbl + 96);
+ vp9_prefetch_load(vp9_ff_cropTbl + 128);
+ vp9_prefetch_load(vp9_ff_cropTbl + 160);
+ vp9_prefetch_load(vp9_ff_cropTbl + 192);
+ vp9_prefetch_load(vp9_ff_cropTbl + 224);
+
+ for (i = 0; i < 4; ++i) {
+ dest_pix = (dest + i);
+
+ __asm__ __volatile__ (
+ /*
+ temp_1 = (input[0] + input[2]) * cospi_16_64;
+ step_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[2]) * cospi_16_64;
+ step_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 4(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "extp %[step_0], $ac0, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "extp %[step_1], $ac1, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ /*
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ step_2 = dct_const_round_shift(temp1);
+ */
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "extp %[step_2], $ac0, 31 \n\t"
+
+ /*
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step_3 = dct_const_round_shift(temp2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[step_3], $ac1, 31 \n\t"
+
+ /*
+ output[0] = step_0 + step_3;
+ output[4] = step_1 + step_2;
+ output[8] = step_1 - step_2;
+ output[12] = step_0 - step_3;
+ */
+ "add %[Temp0], %[step_0], %[step_3] \n\t"
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step_1], %[step_2] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step_1], %[step_2] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step_0], %[step_3] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+
+ : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+ [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
+ [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
+ [dest_pix] "+r" (dest_pix)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
+ [cospi_24_64] "r" (cospi_24_64),
+ [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
+ );
+
+ input += 4;
+ }
+}
+
+void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // Rows
+ vp9_idct4_1d_rows_dspr2(input, outptr);
+
+ // Columns
+ vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int a1, absa1;
+ int r;
+ int32_t out;
+ int t2, vector_a1, vector_a;
+ uint32_t pos = 45;
+ int16_t input_dc = input[0];
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r" (pos)
+ );
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
+ __asm__ __volatile__ (
+ "addi %[out], %[out], 8 \n\t"
+ "sra %[a1], %[out], 4 \n\t"
+
+ : [out] "+r" (out), [a1] "=r" (a1)
+ :
+ );
+
+ if (a1 < 0) {
+ /* use quad-byte
+ * input and output memory are four byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 4; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t2], 0(%[dest]) \n\t"
+ "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_a], 0(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ } else {
+ /* use quad-byte
+ * input and output memory are four byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 4; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t2], 0(%[dest]) \n\t"
+ "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_a], 0(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+}
+
+static void iadst4_1d_dspr2(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7;
+ int x0, x1, x2, x3;
+
+ x0 = input[0];
+ x1 = input[1];
+ x2 = input[2];
+ x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_2_9 * x0;
+ s2 = sinpi_3_9 * x1;
+ s3 = sinpi_4_9 * x2;
+ s4 = sinpi_1_9 * x2;
+ s5 = sinpi_2_9 * x3;
+ s6 = sinpi_4_9 * x3;
+ s7 = x0 - x2 + x3;
+
+ x0 = s0 + s3 + s5;
+ x1 = s1 - s4 - s6;
+ x2 = sinpi_3_9 * s7;
+ x3 = s2;
+
+ s0 = x0 + x3;
+ s1 = x1 + x3;
+ s2 = x2;
+ s3 = x0 + x1 - x3;
+
+ // 1-D transform scaling factor is sqrt(2).
+ // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+ // + 1b (addition) = 29b.
+ // Hence the output bit depth is 15b.
+ output[0] = dct_const_round_shift(s0);
+ output[1] = dct_const_round_shift(s1);
+ output[2] = dct_const_round_shift(s2);
+ output[3] = dct_const_round_shift(s3);
+}
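+
+/* dct_const_round_shift() is the usual 14-bit fixed-point rounding from
+   vp9_idct.h, effectively
+
+     (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS    // DCT_CONST_BITS == 14
+
+   which is what the asm paths in this file emulate by seeding the
+   accumulator with const_2_power_13 (== DCT_CONST_ROUNDING) before
+   madd/extp. */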
+
+void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ int i, j;
+ DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+ int16_t *outptr = out;
+ int16_t temp_in[4 * 4], temp_out[4];
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ switch (tx_type) {
+ case DCT_DCT: // DCT in both horizontal and vertical
+ vp9_idct4_1d_rows_dspr2(input, outptr);
+ vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ break;
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ vp9_idct4_1d_rows_dspr2(input, outptr);
+
+ outptr = out;
+
+ for (i = 0; i < 4; ++i) {
+ iadst4_1d_dspr2(outptr, temp_out);
+
+ for (j = 0; j < 4; ++j)
+ dest[j * dest_stride + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ + dest[j * dest_stride + i]);
+
+ outptr += 4;
+ }
+ break;
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ for (i = 0; i < 4; ++i) {
+ iadst4_1d_dspr2(input, outptr);
+ input += 4;
+ outptr += 4;
+ }
+
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) {
+ temp_in[i * 4 + j] = out[j * 4 + i];
+ }
+ }
+ vp9_idct4_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+ break;
+ case ADST_ADST: // ADST in both directions
+ for (i = 0; i < 4; ++i) {
+ iadst4_1d_dspr2(input, outptr);
+ input += 4;
+ outptr += 4;
+ }
+
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j * 4 + i];
+ iadst4_1d_dspr2(temp_in, temp_out);
+
+ for (j = 0; j < 4; ++j)
+ dest[j * dest_stride + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ + dest[j * dest_stride + i]);
+ }
+ break;
+ default:
+ printf("vp9_short_iht4x4_add_dspr2 : Invalid tx_type\n");
+ break;
+ }
+}
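+
+/* Only the DCT halves have DSPr2 kernels; iadst4_1d_dspr2() above is plain C.
+   In the DCT_ADST case the ADST row results are transposed into temp_in so
+   that vp9_idct4_1d_columns_add_blk_dspr2(), which expects transposed input,
+   can be reused for the column pass. */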
+#endif // #if HAVE_DSPR2
diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
new file mode 100644
index 000000000..93a08401d
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -0,0 +1,745 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output,
+ uint32_t no_rows) {
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ const int const_2_power_13 = 8192;
+ int Temp0, Temp1, Temp2, Temp3, Temp4;
+ int i;
+
+ for (i = no_rows; i--; ) {
+ __asm__ __volatile__ (
+ /*
+ temp_1 = (input[0] + input[4]) * cospi_16_64;
+ step2_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[4]) * cospi_16_64;
+ step2_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 8(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "extp %[Temp4], $ac0, 31 \n\t"
+
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+ step2_2 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 4(%[input]) \n\t"
+ "lh %[Temp1], 12(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "extp %[Temp3], $ac0, 31 \n\t"
+
+ /*
+ step1_1 = step2_1 + step2_2;
+ step1_2 = step2_1 - step2_2;
+ */
+ "add %[step1_1], %[Temp2], %[Temp3] \n\t"
+ "sub %[step1_2], %[Temp2], %[Temp3] \n\t"
+
+ /*
+ temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+ step2_3 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ /*
+ step1_0 = step2_0 + step2_3;
+ step1_3 = step2_0 - step2_3;
+ */
+ "add %[step1_0], %[Temp4], %[Temp1] \n\t"
+ "sub %[step1_3], %[Temp4], %[Temp1] \n\t"
+
+ /*
+ temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ step1_4 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp1], 14(%[input]) \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
+ "extp %[step1_4], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1_7 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
+ "extp %[step1_7], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ step1_5 = dct_const_round_shift(temp_1);
+ */
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
+ "extp %[step1_5], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1_6 = dct_const_round_shift(temp_2);
+ */
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+ temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+ */
+ "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
+ "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
+ "add %[Temp0], %[Temp0], %[step1_5] \n\t"
+ "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
+ "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
+ "add %[Temp1], %[Temp1], %[step1_7] \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"
+
+ /*
+ step1_4 = step1_4 + step1_5;
+ step1_7 = step1_6 + step1_7;
+ */
+ "add %[step1_4], %[step1_4], %[step1_5] \n\t"
+ "add %[step1_7], %[step1_7], %[step1_6] \n\t"
+
+ "extp %[step1_5], $ac0, 31 \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ "add %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "sh %[Temp0], 0(%[output]) \n\t"
+ "add %[Temp1], %[step1_1], %[step1_6] \n\t"
+ "sh %[Temp1], 16(%[output]) \n\t"
+ "add %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "sh %[Temp0], 32(%[output]) \n\t"
+ "add %[Temp1], %[step1_3], %[step1_4] \n\t"
+ "sh %[Temp1], 48(%[output]) \n\t"
+
+ "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "sh %[Temp0], 64(%[output]) \n\t"
+ "sub %[Temp1], %[step1_2], %[step1_5] \n\t"
+ "sh %[Temp1], 80(%[output]) \n\t"
+ "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "sh %[Temp0], 96(%[output]) \n\t"
+ "sub %[Temp1], %[step1_0], %[step1_7] \n\t"
+ "sh %[Temp1], 112(%[output]) \n\t"
+
+ : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
+ [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
+ [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
+ [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
+ [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+ [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [Temp4] "=&r" (Temp4)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_24_64] "r" (cospi_24_64),
+ [output] "r" (output), [input] "r" (input)
+ );
+
+ input += 8;
+ output += 1;
+ }
+}
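+
+/* The no_rows argument lets this row pass be shared: the full 8x8 transform
+   runs it over all 8 rows, while vp9_idct8x8_10_add_dspr2() below runs only
+   the first 4 rows, the only ones that can hold non-zero coefficients. */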
+
+static void idct8_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int Temp0, Temp1, Temp2, Temp3;
+ int i;
+ const int const_2_power_13 = 8192;
+ uint8_t *dest_pix;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ /* prefetch vp9_ff_cropTbl */
+ vp9_prefetch_load(vp9_ff_cropTbl);
+ vp9_prefetch_load(vp9_ff_cropTbl + 32);
+ vp9_prefetch_load(vp9_ff_cropTbl + 64);
+ vp9_prefetch_load(vp9_ff_cropTbl + 96);
+ vp9_prefetch_load(vp9_ff_cropTbl + 128);
+ vp9_prefetch_load(vp9_ff_cropTbl + 160);
+ vp9_prefetch_load(vp9_ff_cropTbl + 192);
+ vp9_prefetch_load(vp9_ff_cropTbl + 224);
+
+ for (i = 0; i < 8; ++i) {
+ dest_pix = (dest + i);
+
+ __asm__ __volatile__ (
+ /*
+ temp_1 = (input[0] + input[4]) * cospi_16_64;
+ step2_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[4]) * cospi_16_64;
+ step2_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 8(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "extp %[step1_6], $ac0, 31 \n\t"
+
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+ step2_2 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 4(%[input]) \n\t"
+ "lh %[Temp1], 12(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "extp %[Temp3], $ac0, 31 \n\t"
+
+ /*
+ step1_1 = step2_1 + step2_2;
+ step1_2 = step2_1 - step2_2;
+ */
+ "add %[step1_1], %[Temp2], %[Temp3] \n\t"
+ "sub %[step1_2], %[Temp2], %[Temp3] \n\t"
+
+ /*
+ temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+ step2_3 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ /*
+ step1_0 = step2_0 + step2_3;
+ step1_3 = step2_0 - step2_3;
+ */
+ "add %[step1_0], %[step1_6], %[Temp1] \n\t"
+ "sub %[step1_3], %[step1_6], %[Temp1] \n\t"
+
+ /*
+ temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ step1_4 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp1], 14(%[input]) \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
+ "extp %[step1_4], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1_7 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
+ "extp %[step1_7], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ step1_5 = dct_const_round_shift(temp_1);
+ */
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
+ "extp %[step1_5], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1_6 = dct_const_round_shift(temp_2);
+ */
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+ temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+ */
+ "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
+ "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
+ "add %[Temp0], %[Temp0], %[step1_5] \n\t"
+ "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
+ "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
+ "add %[Temp1], %[Temp1], %[step1_7] \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"
+
+ /*
+ step1_4 = step1_4 + step1_5;
+ step1_7 = step1_6 + step1_7;
+ */
+ "add %[step1_4], %[step1_4], %[step1_5] \n\t"
+ "add %[step1_7], %[step1_7], %[step1_6] \n\t"
+
+ "extp %[step1_5], $ac0, 31 \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /* add block */
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+
+ : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
+ [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
+ [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
+ [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
+ [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+ [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dest_pix] "+r" (dest_pix)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_24_64] "r" (cospi_24_64),
+ [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
+ );
+
+ input += 8;
+ }
+}
+
+void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // First transform rows
+ idct8_1d_rows_dspr2(input, outptr, 8);
+
+ // Then transform columns and add to dest
+ idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+static void iadst8_1d_dspr2(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7;
+ int x0, x1, x2, x3, x4, x5, x6, x7;
+
+ x0 = input[7];
+ x1 = input[0];
+ x2 = input[5];
+ x3 = input[2];
+ x4 = input[3];
+ x5 = input[4];
+ x6 = input[1];
+ x7 = input[6];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+ output[0] = output[1] = output[2] = output[3] = output[4]
+ = output[5] = output[6] = output[7] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+ s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+
+ x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
+ x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
+ x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
+ x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
+ x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
+ x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+
+ x0 = s0 + s2;
+ x1 = s1 + s3;
+ x2 = s0 - s2;
+ x3 = s1 - s3;
+ x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
+ x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
+
+ // stage 3
+ s2 = cospi_16_64 * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (x6 - x7);
+
+ x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
+ x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
+
+ output[0] = x0;
+ output[1] = -x4;
+ output[2] = x6;
+ output[3] = -x2;
+ output[4] = x3;
+ output[5] = -x7;
+ output[6] = x5;
+ output[7] = -x1;
+}
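+
+/* The permuted loads at the top (x0 = input[7], x1 = input[0], ...) and the
+   negated outputs mirror the iadst8_1d() C reference in vp9_idct.c; this
+   helper stays in plain C and is shared by the three hybrid cases below. */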
+
+void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ int i, j;
+ DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+ int16_t *outptr = out;
+ int16_t temp_in[8 * 8], temp_out[8];
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ switch (tx_type) {
+ case DCT_DCT: // DCT in both horizontal and vertical
+ idct8_1d_rows_dspr2(input, outptr, 8);
+ idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ break;
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ idct8_1d_rows_dspr2(input, outptr, 8);
+
+ for (i = 0; i < 8; ++i) {
+ iadst8_1d_dspr2(&out[i * 8], temp_out);
+
+ for (j = 0; j < 8; ++j)
+ dest[j * dest_stride + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * dest_stride + i]);
+ }
+ break;
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ for (i = 0; i < 8; ++i) {
+ iadst8_1d_dspr2(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) {
+ temp_in[i * 8 + j] = out[j * 8 + i];
+ }
+ }
+ idct8_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+ break;
+ case ADST_ADST: // ADST in both directions
+ for (i = 0; i < 8; ++i) {
+ iadst8_1d_dspr2(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+
+ iadst8_1d_dspr2(temp_in, temp_out);
+
+ for (j = 0; j < 8; ++j)
+ dest[j * dest_stride + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * dest_stride + i]);
+ }
+ break;
+ default:
+ printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n");
+ break;
+ }
+}
+
+void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // First transform rows
+ idct8_1d_rows_dspr2(input, outptr, 4);
+
+ outptr += 4;
+
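+  /* Rows 4..7 of the input are all zero for this eob, so instead of
+     transforming them the corresponding half of each out[] row is cleared
+     directly (two int16_t per sw). */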
+ __asm__ __volatile__ (
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 4(%[outptr]) \n\t"
+ "sw $zero, 16(%[outptr]) \n\t"
+ "sw $zero, 20(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 36(%[outptr]) \n\t"
+ "sw $zero, 48(%[outptr]) \n\t"
+ "sw $zero, 52(%[outptr]) \n\t"
+ "sw $zero, 64(%[outptr]) \n\t"
+ "sw $zero, 68(%[outptr]) \n\t"
+ "sw $zero, 80(%[outptr]) \n\t"
+ "sw $zero, 84(%[outptr]) \n\t"
+ "sw $zero, 96(%[outptr]) \n\t"
+ "sw $zero, 100(%[outptr]) \n\t"
+ "sw $zero, 112(%[outptr]) \n\t"
+ "sw $zero, 116(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r" (outptr)
+ );
+
+ // Then transform columns and add to dest
+ idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ uint32_t pos = 45;
+ int32_t out;
+ int32_t r;
+ int32_t a1, absa1;
+ int32_t t1, t2, vector_a1, vector_1, vector_2;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r" (pos)
+ );
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__ (
+ "addi %[out], %[out], 16 \n\t"
+ "sra %[a1], %[out], 5 \n\t"
+
+ : [out] "+r" (out), [a1] "=r" (a1)
+ :
+ );
+
+ if (a1 < 0) {
+ /* use quad-byte
+ * input and output memory are four byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 8; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ } else {
+ /* use quad-byte
+ * input and output memory are four byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 8; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [dest] "+r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 768ff2c94..1651b9050 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -76,4 +76,15 @@ typedef enum {
ADST_ADST = 3 // ADST in both directions
} TX_TYPE;
+typedef enum {
+ UNKNOWN = 0,
+ BT_601 = 1, // YUV
+ BT_709 = 2, // YUV
+ SMPTE_170 = 3, // YUV
+ SMPTE_240 = 4, // YUV
+ RESERVED_1 = 5,
+ RESERVED_2 = 6,
+ SRGB = 7 // RGB
+} COLOR_SPACE;
+
#endif // VP9_COMMON_VP9_ENUMS_H_
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 52b039d99..ea8683ea1 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -1280,6 +1280,31 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
}
}
+void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
+ int16_t out[32 * 32] = {0};
+ int16_t *outptr = out;
+ int i, j;
+ int16_t temp_in[32], temp_out[32];
+
+ // Rows
+ // only upper-left 8x8 has non-zero coeff
+ for (i = 0; i < 8; ++i) {
+ idct32_1d(input, outptr);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = out[j * 32 + i];
+ idct32_1d(temp_in, temp_out);
+ for (j = 0; j < 32; ++j)
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]);
+ }
+}
+
void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
int i, j;
int a1;
@@ -1350,6 +1375,9 @@ void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
if (eob) {
if (eob == 1)
vp9_idct32x32_1_add(input, dest, stride);
+ else if (eob <= 34)
+ // non-zero coeff only in upper-left 8x8
+ vp9_idct32x32_34_add(input, dest, stride);
else
vp9_idct32x32_1024_add(input, dest, stride);
}
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 289210ecb..704469e29 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -90,6 +90,8 @@ typedef struct VP9Common {
DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][8]);
#endif
+ COLOR_SPACE color_space;
+
int width;
int height;
int display_width;
@@ -217,6 +219,13 @@ typedef struct VP9Common {
int cur_tile_mi_row_start, cur_tile_mi_row_end;
} VP9_COMMON;
+// ref == 0 => LAST_FRAME
+// ref == 1 => GOLDEN_FRAME
+// ref == 2 => ALTREF_FRAME
+static YV12_BUFFER_CONFIG *get_frame_ref_buffer(VP9_COMMON *cm, int ref) {
+ return &cm->yv12_fb[cm->active_ref_idx[ref]];
+}
+
static int get_free_fb(VP9_COMMON *cm) {
int i;
for (i = 0; i < NUM_YV12_BUFFERS; i++)
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index ba96e5ad6..2d9fbff97 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -268,43 +268,46 @@ specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2
# dct
#
prototype void vp9_idct4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct4x4_1_add sse2 neon
+specialize vp9_idct4x4_1_add sse2 neon dspr2
prototype void vp9_idct4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct4x4_16_add sse2 neon
+specialize vp9_idct4x4_16_add sse2 neon dspr2
prototype void vp9_idct8x8_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct8x8_1_add sse2 neon
+specialize vp9_idct8x8_1_add sse2 neon dspr2
prototype void vp9_idct8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct8x8_64_add sse2 neon
+specialize vp9_idct8x8_64_add sse2 neon dspr2
prototype void vp9_idct8x8_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct8x8_10_add sse2 neon
+specialize vp9_idct8x8_10_add sse2 neon dspr2
prototype void vp9_idct16x16_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct16x16_1_add sse2 neon
+specialize vp9_idct16x16_1_add sse2 neon dspr2
prototype void vp9_idct16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct16x16_256_add sse2 neon
+specialize vp9_idct16x16_256_add sse2 neon dspr2
prototype void vp9_idct16x16_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct16x16_10_add sse2 neon
+specialize vp9_idct16x16_10_add sse2 neon dspr2
prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct32x32_1024_add sse2 neon
+specialize vp9_idct32x32_1024_add sse2 neon dspr2
+
+prototype void vp9_idct32x32_34_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct32x32_34_add sse2
prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct32x32_1_add sse2
+specialize vp9_idct32x32_1_add sse2 dspr2
prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_iht4x4_16_add sse2 neon
+specialize vp9_iht4x4_16_add sse2 neon dspr2
prototype void vp9_iht8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_iht8x8_64_add sse2 neon
+specialize vp9_iht8x8_64_add sse2 neon dspr2
prototype void vp9_iht16x16_256_add "const int16_t *input, uint8_t *output, int pitch, int tx_type"
-specialize vp9_iht16x16_256_add sse2
+specialize vp9_iht16x16_256_add sse2 dspr2
# dct and add
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 74de6c670..ccf5aac17 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -415,7 +415,7 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
res3 = _mm_packs_epi32(tmp6, tmp7); \
}
-#define IDCT8x8_1D \
+#define IDCT8_1D \
/* Stage1 */ \
{ \
const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
@@ -530,7 +530,7 @@ void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
in4, in5, in6, in7);
// 4-stage 1D idct8x8
- IDCT8x8_1D
+ IDCT8_1D
}
// Final rounding and shift
@@ -643,7 +643,7 @@ static void idct8_1d_sse2(__m128i *in) {
in4, in5, in6, in7);
// 4-stage 1D idct8x8
- IDCT8x8_1D
+ IDCT8_1D
in[0] = in0;
in[1] = in1;
in[2] = in2;
@@ -1068,7 +1068,7 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
in4, in5, in6, in7)
// 1D idct8x8
- IDCT8x8_1D
+ IDCT8_1D
// Final rounding and shift
in0 = _mm_adds_epi16(in0, final_rounding);
@@ -1099,7 +1099,7 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
RECON_AND_STORE(dest, in7);
}
-#define IDCT16x16_1D \
+#define IDCT16_1D \
/* Stage2 */ \
{ \
const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
@@ -1321,7 +1321,7 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
in12, in13, in14, in15);
}
- IDCT16x16_1D
+ IDCT16_1D
// Stage7
if (i == 0) {
@@ -2703,7 +2703,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;
- IDCT16x16_1D
+ IDCT16_1D
// Stage7
in0 = _mm_add_epi16(stp2_0, stp1_15);
@@ -2785,6 +2785,698 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
input += 8; \
} \
+#define IDCT32_1D \
+/* Stage1 */ \
+{ \
+ const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \
+ const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \
+ const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \
+ const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \
+ \
+ const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \
+ const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \
+  const __m128i lo_25_7 = _mm_unpacklo_epi16(in25, in7); \
+ const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \
+ \
+ const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \
+ const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \
+ const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \
+ const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \
+ \
+ const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \
+ const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \
+ const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
+ stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
+ stp1_17, stp1_30) \
+ MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
+ stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
+ stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
+ stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
+ stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+} \
+\
+/* Stage2 */ \
+{ \
+ const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \
+ const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \
+ const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \
+ const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \
+ \
+ const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \
+ const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \
+ const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \
+ const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
+ stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
+ stp2_14) \
+ MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
+ stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
+ stp2_11, stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
+ stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
+ stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
+ \
+ stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
+ stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
+ stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
+ stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
+ \
+ stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
+ stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
+ stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
+} \
+\
+/* Stage3 */ \
+{ \
+ const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \
+ const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \
+ const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \
+ const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \
+ \
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ \
+ MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
+ stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
+ stp1_6) \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
+ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+ stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+ stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
+ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+ \
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+ stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+ stp1_18, stp1_29) \
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+ stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+ stp1_22, stp1_25) \
+ \
+ stp1_16 = stp2_16; \
+ stp1_31 = stp2_31; \
+ stp1_19 = stp2_19; \
+ stp1_20 = stp2_20; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_27 = stp2_27; \
+ stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+ const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \
+ const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \
+ const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \
+ const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
+ stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
+ stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+ stp2_10, stp2_13) \
+ \
+ stp2_8 = stp1_8; \
+ stp2_15 = stp1_15; \
+ stp2_11 = stp1_11; \
+ stp2_12 = stp1_12; \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+ stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+ stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+ stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+ stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+ stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+ stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+ stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_4 = stp2_4; \
+ stp1_7 = stp2_7; \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ \
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+ stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+ stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+ stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ \
+ stp1_22 = stp2_22; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_25 = stp2_25; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+ \
+ stp2_8 = stp1_8; \
+ stp2_9 = stp1_9; \
+ stp2_14 = stp1_14; \
+ stp2_15 = stp1_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+ stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+ stp2_13, stp2_11, stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+ stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+ stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+ stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+ \
+ stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+ stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+ stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+ stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+ stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+ stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+ stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+ stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+ stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+ stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+ stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+ stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+ stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ stp1_18 = stp2_18; \
+ stp1_19 = stp2_19; \
+ \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+ \
+ stp1_28 = stp2_28; \
+ stp1_29 = stp2_29; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+}
+
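+// For reference, each madd/add/srai/packs group in the IDCT32_1D macro
+// above is the SSE2 form of the scalar rotation
+// dct_const_round_shift(x * c0 +/- y * c1), applied lane by lane. A
+// minimal scalar sketch of one such butterfly (the helper names are
+// hypothetical and the block is not compiled):
+#if 0
+static int16_t round_shift_sketch(int32_t v) {
+  return (int16_t)((v + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
+}
+static void butterfly_sketch(int16_t x, int16_t y, int16_t c0, int16_t c1,
+                             int16_t *out0, int16_t *out1) {
+  *out0 = round_shift_sketch(x * c0 - y * c1);
+  *out1 = round_shift_sketch(x * c1 + y * c0);
+}
+#endif
+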
+// Only the upper-left 8x8 area of the input has non-zero coefficients.
+void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+
+ // idct constants for each stage
+ const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
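+
+  // Each stgN_M constant broadcasts a {c0, c1} pair of cosine constants,
+  // so _mm_madd_epi16 against an unpacklo/unpackhi-interleaved {x, y}
+  // pair yields c0 * x + c1 * y in every 32-bit lane.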
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
+ in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
+ in24, in25, in26, in27, in28, in29, in30, in31;
+ __m128i col[128];
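+  // col[] holds the first pass's full 32x32 intermediate result:
+  // 4 passes x 32 rows x 8 values per __m128i.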
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+ stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+ stp1_30, stp1_31;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+ stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+ stp2_30, stp2_31;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i, j, i32;
+
+  // We work on an 8x32 block each time, and loop 8 times for the
+  // 2-D 32x32 idct.
+ for (i = 0; i < 8; i++) {
+ i32 = (i << 5);
+ if (i == 0) {
+ // First 1-D idct: first 8 rows
+ // Load input data.
+ LOAD_DQCOEFF(in0, input);
+ LOAD_DQCOEFF(in8, input);
+ LOAD_DQCOEFF(in16, input);
+ LOAD_DQCOEFF(in24, input);
+ LOAD_DQCOEFF(in1, input);
+ LOAD_DQCOEFF(in9, input);
+ LOAD_DQCOEFF(in17, input);
+ LOAD_DQCOEFF(in25, input);
+ LOAD_DQCOEFF(in2, input);
+ LOAD_DQCOEFF(in10, input);
+ LOAD_DQCOEFF(in18, input);
+ LOAD_DQCOEFF(in26, input);
+ LOAD_DQCOEFF(in3, input);
+ LOAD_DQCOEFF(in11, input);
+ LOAD_DQCOEFF(in19, input);
+ LOAD_DQCOEFF(in27, input);
+
+ LOAD_DQCOEFF(in4, input);
+ LOAD_DQCOEFF(in12, input);
+ LOAD_DQCOEFF(in20, input);
+ LOAD_DQCOEFF(in28, input);
+ LOAD_DQCOEFF(in5, input);
+ LOAD_DQCOEFF(in13, input);
+ LOAD_DQCOEFF(in21, input);
+ LOAD_DQCOEFF(in29, input);
+ LOAD_DQCOEFF(in6, input);
+ LOAD_DQCOEFF(in14, input);
+ LOAD_DQCOEFF(in22, input);
+ LOAD_DQCOEFF(in30, input);
+ LOAD_DQCOEFF(in7, input);
+ LOAD_DQCOEFF(in15, input);
+ LOAD_DQCOEFF(in23, input);
+ LOAD_DQCOEFF(in31, input);
+
+ // Transpose 32x8 block to 8x32 block
+ TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
+ in18, in19, in20, in21, in22, in23);
+ TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
+ in26, in27, in28, in29, in30, in31);
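+
+      // After these transposes each of in0..in31 holds coefficient k of
+      // all eight rows, letting the stage arithmetic process eight rows
+      // in parallel.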
+ } else if (i < 4) {
+      // First 1-D idct: the next 24 rows hold only zero coefficients,
+      // so their intermediate results are simply zeroed.
+ col[i32 + 0] = _mm_setzero_si128();
+ col[i32 + 1] = _mm_setzero_si128();
+ col[i32 + 2] = _mm_setzero_si128();
+ col[i32 + 3] = _mm_setzero_si128();
+ col[i32 + 4] = _mm_setzero_si128();
+ col[i32 + 5] = _mm_setzero_si128();
+ col[i32 + 6] = _mm_setzero_si128();
+ col[i32 + 7] = _mm_setzero_si128();
+ col[i32 + 8] = _mm_setzero_si128();
+ col[i32 + 9] = _mm_setzero_si128();
+ col[i32 + 10] = _mm_setzero_si128();
+ col[i32 + 11] = _mm_setzero_si128();
+ col[i32 + 12] = _mm_setzero_si128();
+ col[i32 + 13] = _mm_setzero_si128();
+ col[i32 + 14] = _mm_setzero_si128();
+ col[i32 + 15] = _mm_setzero_si128();
+ col[i32 + 16] = _mm_setzero_si128();
+ col[i32 + 17] = _mm_setzero_si128();
+ col[i32 + 18] = _mm_setzero_si128();
+ col[i32 + 19] = _mm_setzero_si128();
+ col[i32 + 20] = _mm_setzero_si128();
+ col[i32 + 21] = _mm_setzero_si128();
+ col[i32 + 22] = _mm_setzero_si128();
+ col[i32 + 23] = _mm_setzero_si128();
+ col[i32 + 24] = _mm_setzero_si128();
+ col[i32 + 25] = _mm_setzero_si128();
+ col[i32 + 26] = _mm_setzero_si128();
+ col[i32 + 27] = _mm_setzero_si128();
+ col[i32 + 28] = _mm_setzero_si128();
+ col[i32 + 29] = _mm_setzero_si128();
+ col[i32 + 30] = _mm_setzero_si128();
+ col[i32 + 31] = _mm_setzero_si128();
+ continue;
+ } else {
+ // Second 1-D idct
+ j = i - 4;
+
+ // Transpose 32x8 block to 8x32 block
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ j += 4;
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
+ in11, in12, in13, in14, in15);
+ j += 4;
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
+ in19, in20, in21, in22, in23);
+ j += 4;
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
+ in28, in29, in30, in31);
+ }
+
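+    // IDCT32_1D expands to the seven stages defined above. It runs once
+    // for the eight non-zero input rows (i == 0) and four times for the
+    // column pass (i >= 4); iterations 1..3 were skipped via continue.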
+ IDCT32_1D
+
+ // final stage
+ if (i < 4) {
+      // First 1-D idct: store the 32 intermediate results of this
+      // 8x32 block.
+ col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+ col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+ col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+ col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+ col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+ col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+ col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+ col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+ col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+ col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+ col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+ col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+ col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+ col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+ col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+ col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+ col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+ col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+ col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+ col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+ col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+ col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+ col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+ col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+ col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+ col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+ col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+ col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+ col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+ col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+ col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+ col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+ } else {
+ const __m128i zero = _mm_setzero_si128();
+
+      // Second 1-D idct: calculate the final results and add them to
+      // the destination.
+ in0 = _mm_add_epi16(stp1_0, stp1_31);
+ in1 = _mm_add_epi16(stp1_1, stp1_30);
+ in2 = _mm_add_epi16(stp1_2, stp1_29);
+ in3 = _mm_add_epi16(stp1_3, stp1_28);
+ in4 = _mm_add_epi16(stp1_4, stp1_27);
+ in5 = _mm_add_epi16(stp1_5, stp1_26);
+ in6 = _mm_add_epi16(stp1_6, stp1_25);
+ in7 = _mm_add_epi16(stp1_7, stp1_24);
+ in8 = _mm_add_epi16(stp1_8, stp1_23);
+ in9 = _mm_add_epi16(stp1_9, stp1_22);
+ in10 = _mm_add_epi16(stp1_10, stp1_21);
+ in11 = _mm_add_epi16(stp1_11, stp1_20);
+ in12 = _mm_add_epi16(stp1_12, stp1_19);
+ in13 = _mm_add_epi16(stp1_13, stp1_18);
+ in14 = _mm_add_epi16(stp1_14, stp1_17);
+ in15 = _mm_add_epi16(stp1_15, stp1_16);
+ in16 = _mm_sub_epi16(stp1_15, stp1_16);
+ in17 = _mm_sub_epi16(stp1_14, stp1_17);
+ in18 = _mm_sub_epi16(stp1_13, stp1_18);
+ in19 = _mm_sub_epi16(stp1_12, stp1_19);
+ in20 = _mm_sub_epi16(stp1_11, stp1_20);
+ in21 = _mm_sub_epi16(stp1_10, stp1_21);
+ in22 = _mm_sub_epi16(stp1_9, stp1_22);
+ in23 = _mm_sub_epi16(stp1_8, stp1_23);
+ in24 = _mm_sub_epi16(stp1_7, stp1_24);
+ in25 = _mm_sub_epi16(stp1_6, stp1_25);
+ in26 = _mm_sub_epi16(stp1_5, stp1_26);
+ in27 = _mm_sub_epi16(stp1_4, stp1_27);
+ in28 = _mm_sub_epi16(stp1_3, stp1_28);
+ in29 = _mm_sub_epi16(stp1_2, stp1_29);
+ in30 = _mm_sub_epi16(stp1_1, stp1_30);
+ in31 = _mm_sub_epi16(stp1_0, stp1_31);
+
+ // Final rounding and shift
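+      // Adding 1 << 5 before the arithmetic shift right by 6 makes the
+      // shift round to nearest rather than truncate.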
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+ in8 = _mm_adds_epi16(in8, final_rounding);
+ in9 = _mm_adds_epi16(in9, final_rounding);
+ in10 = _mm_adds_epi16(in10, final_rounding);
+ in11 = _mm_adds_epi16(in11, final_rounding);
+ in12 = _mm_adds_epi16(in12, final_rounding);
+ in13 = _mm_adds_epi16(in13, final_rounding);
+ in14 = _mm_adds_epi16(in14, final_rounding);
+ in15 = _mm_adds_epi16(in15, final_rounding);
+ in16 = _mm_adds_epi16(in16, final_rounding);
+ in17 = _mm_adds_epi16(in17, final_rounding);
+ in18 = _mm_adds_epi16(in18, final_rounding);
+ in19 = _mm_adds_epi16(in19, final_rounding);
+ in20 = _mm_adds_epi16(in20, final_rounding);
+ in21 = _mm_adds_epi16(in21, final_rounding);
+ in22 = _mm_adds_epi16(in22, final_rounding);
+ in23 = _mm_adds_epi16(in23, final_rounding);
+ in24 = _mm_adds_epi16(in24, final_rounding);
+ in25 = _mm_adds_epi16(in25, final_rounding);
+ in26 = _mm_adds_epi16(in26, final_rounding);
+ in27 = _mm_adds_epi16(in27, final_rounding);
+ in28 = _mm_adds_epi16(in28, final_rounding);
+ in29 = _mm_adds_epi16(in29, final_rounding);
+ in30 = _mm_adds_epi16(in30, final_rounding);
+ in31 = _mm_adds_epi16(in31, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 6);
+ in1 = _mm_srai_epi16(in1, 6);
+ in2 = _mm_srai_epi16(in2, 6);
+ in3 = _mm_srai_epi16(in3, 6);
+ in4 = _mm_srai_epi16(in4, 6);
+ in5 = _mm_srai_epi16(in5, 6);
+ in6 = _mm_srai_epi16(in6, 6);
+ in7 = _mm_srai_epi16(in7, 6);
+ in8 = _mm_srai_epi16(in8, 6);
+ in9 = _mm_srai_epi16(in9, 6);
+ in10 = _mm_srai_epi16(in10, 6);
+ in11 = _mm_srai_epi16(in11, 6);
+ in12 = _mm_srai_epi16(in12, 6);
+ in13 = _mm_srai_epi16(in13, 6);
+ in14 = _mm_srai_epi16(in14, 6);
+ in15 = _mm_srai_epi16(in15, 6);
+ in16 = _mm_srai_epi16(in16, 6);
+ in17 = _mm_srai_epi16(in17, 6);
+ in18 = _mm_srai_epi16(in18, 6);
+ in19 = _mm_srai_epi16(in19, 6);
+ in20 = _mm_srai_epi16(in20, 6);
+ in21 = _mm_srai_epi16(in21, 6);
+ in22 = _mm_srai_epi16(in22, 6);
+ in23 = _mm_srai_epi16(in23, 6);
+ in24 = _mm_srai_epi16(in24, 6);
+ in25 = _mm_srai_epi16(in25, 6);
+ in26 = _mm_srai_epi16(in26, 6);
+ in27 = _mm_srai_epi16(in27, 6);
+ in28 = _mm_srai_epi16(in28, 6);
+ in29 = _mm_srai_epi16(in29, 6);
+ in30 = _mm_srai_epi16(in30, 6);
+ in31 = _mm_srai_epi16(in31, 6);
+
+ RECON_AND_STORE(dest, in0);
+ RECON_AND_STORE(dest, in1);
+ RECON_AND_STORE(dest, in2);
+ RECON_AND_STORE(dest, in3);
+ RECON_AND_STORE(dest, in4);
+ RECON_AND_STORE(dest, in5);
+ RECON_AND_STORE(dest, in6);
+ RECON_AND_STORE(dest, in7);
+ RECON_AND_STORE(dest, in8);
+ RECON_AND_STORE(dest, in9);
+ RECON_AND_STORE(dest, in10);
+ RECON_AND_STORE(dest, in11);
+ RECON_AND_STORE(dest, in12);
+ RECON_AND_STORE(dest, in13);
+ RECON_AND_STORE(dest, in14);
+ RECON_AND_STORE(dest, in15);
+ RECON_AND_STORE(dest, in16);
+ RECON_AND_STORE(dest, in17);
+ RECON_AND_STORE(dest, in18);
+ RECON_AND_STORE(dest, in19);
+ RECON_AND_STORE(dest, in20);
+ RECON_AND_STORE(dest, in21);
+ RECON_AND_STORE(dest, in22);
+ RECON_AND_STORE(dest, in23);
+ RECON_AND_STORE(dest, in24);
+ RECON_AND_STORE(dest, in25);
+ RECON_AND_STORE(dest, in26);
+ RECON_AND_STORE(dest, in27);
+ RECON_AND_STORE(dest, in28);
+ RECON_AND_STORE(dest, in29);
+ RECON_AND_STORE(dest, in30);
+ RECON_AND_STORE(dest, in31);
+
+ dest += 8 - (stride * 32);
+ }
+ }
+}
+
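+// RECON_AND_STORE (defined earlier in this file) reconstructs one 8-pixel
+// row: it widens the prediction to 16 bits, adds the residual row, packs
+// back to bytes with unsigned saturation and advances dest by one stride,
+// hence the `dest += 8 - (stride * 32)` rewind above. A rough scalar
+// equivalent (hypothetical helper, not compiled):
+#if 0
+static uint8_t *recon_row_sketch(uint8_t *dst, const int16_t *res,
+                                 int stride) {
+  int j;
+  for (j = 0; j < 8; ++j)
+    dst[j] = clip_pixel(dst[j] + res[j]);  // clamp to [0, 255]
+  return dst + stride;
+}
+#endif
+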
void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -3009,336 +3701,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
in28, in29, in30, in31);
}
- // Stage1
- {
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31);
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31);
- const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15);
- const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15);
-
- const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23);
- const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23);
- const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7);
- const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7);
-
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27);
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27);
- const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11);
- const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11);
-
- const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19);
- const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19);
- const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3);
- const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3);
-
- MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,
- stg1_1, stg1_2, stg1_3, stp1_16, stp1_31,
- stp1_17, stp1_30)
- MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4,
- stg1_5, stg1_6, stg1_7, stp1_18, stp1_29,
- stp1_19, stp1_28)
- MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,
- stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,
- stp1_21, stp1_26)
- MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,
- stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,
- stp1_23, stp1_24)
- }
-
- // Stage2
- {
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30);
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30);
- const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14);
- const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14);
-
- const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22);
- const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22);
- const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6);
- const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6);
-
- MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,
- stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,
- stp2_14)
- MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,
- stg2_5, stg2_6, stg2_7, stp2_10, stp2_13,
- stp2_11, stp2_12)
-
- stp2_16 = _mm_add_epi16(stp1_16, stp1_17);
- stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);
- stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);
- stp2_19 = _mm_add_epi16(stp1_19, stp1_18);
-
- stp2_20 = _mm_add_epi16(stp1_20, stp1_21);
- stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);
- stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);
- stp2_23 = _mm_add_epi16(stp1_23, stp1_22);
-
- stp2_24 = _mm_add_epi16(stp1_24, stp1_25);
- stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);
- stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);
- stp2_27 = _mm_add_epi16(stp1_27, stp1_26);
-
- stp2_28 = _mm_add_epi16(stp1_28, stp1_29);
- stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);
- stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);
- stp2_31 = _mm_add_epi16(stp1_31, stp1_30);
- }
-
- // Stage3
- {
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28);
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28);
- const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12);
- const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12);
-
- const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);
- const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
-
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
-
- MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,
- stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,
- stp1_6)
-
- stp1_8 = _mm_add_epi16(stp2_8, stp2_9);
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
- stp1_12 = _mm_add_epi16(stp2_12, stp2_13);
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
-
- MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,
- stg3_5, stg3_6, stg3_4, stp1_17, stp1_30,
- stp1_18, stp1_29)
- MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,
- stg3_9, stg3_10, stg3_8, stp1_21, stp1_26,
- stp1_22, stp1_25)
-
- stp1_16 = stp2_16;
- stp1_31 = stp2_31;
- stp1_19 = stp2_19;
- stp1_20 = stp2_20;
- stp1_23 = stp2_23;
- stp1_24 = stp2_24;
- stp1_27 = stp2_27;
- stp1_28 = stp2_28;
- }
-
- // Stage4
- {
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16);
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16);
- const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24);
- const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24);
-
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
-
- MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0,
- stg4_1, stg4_2, stg4_3, stp2_0, stp2_1,
- stp2_2, stp2_3)
-
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,
- stg4_5, stg4_6, stg4_4, stp2_9, stp2_14,
- stp2_10, stp2_13)
-
- stp2_8 = stp1_8;
- stp2_15 = stp1_15;
- stp2_11 = stp1_11;
- stp2_12 = stp1_12;
-
- stp2_16 = _mm_add_epi16(stp1_16, stp1_19);
- stp2_17 = _mm_add_epi16(stp1_17, stp1_18);
- stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);
- stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);
- stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);
- stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);
- stp2_22 = _mm_add_epi16(stp1_22, stp1_21);
- stp2_23 = _mm_add_epi16(stp1_23, stp1_20);
-
- stp2_24 = _mm_add_epi16(stp1_24, stp1_27);
- stp2_25 = _mm_add_epi16(stp1_25, stp1_26);
- stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);
- stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);
- stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);
- stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);
- stp2_30 = _mm_add_epi16(stp1_29, stp1_30);
- stp2_31 = _mm_add_epi16(stp1_28, stp1_31);
- }
-
- // Stage5
- {
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
-
- const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);
- const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
-
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
-
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-
- stp1_5 = _mm_packs_epi32(tmp0, tmp1);
- stp1_6 = _mm_packs_epi32(tmp2, tmp3);
-
- stp1_4 = stp2_4;
- stp1_7 = stp2_7;
-
- stp1_8 = _mm_add_epi16(stp2_8, stp2_11);
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
- stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);
- stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
- stp1_15 = _mm_add_epi16(stp2_15, stp2_12);
-
- stp1_16 = stp2_16;
- stp1_17 = stp2_17;
-
- MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,
- stg4_5, stg4_4, stg4_5, stp1_18, stp1_29,
- stp1_19, stp1_28)
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,
- stg4_4, stg4_6, stg4_4, stp1_20, stp1_27,
- stp1_21, stp1_26)
-
- stp1_22 = stp2_22;
- stp1_23 = stp2_23;
- stp1_24 = stp2_24;
- stp1_25 = stp2_25;
- stp1_30 = stp2_30;
- stp1_31 = stp2_31;
- }
-
- // Stage6
- {
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);
-
- stp2_0 = _mm_add_epi16(stp1_0, stp1_7);
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
- stp2_3 = _mm_add_epi16(stp1_3, stp1_4);
- stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
- stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);
-
- stp2_8 = stp1_8;
- stp2_9 = stp1_9;
- stp2_14 = stp1_14;
- stp2_15 = stp1_15;
-
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12,
- stg6_0, stg4_0, stg6_0, stg4_0, stp2_10,
- stp2_13, stp2_11, stp2_12)
-
- stp2_16 = _mm_add_epi16(stp1_16, stp1_23);
- stp2_17 = _mm_add_epi16(stp1_17, stp1_22);
- stp2_18 = _mm_add_epi16(stp1_18, stp1_21);
- stp2_19 = _mm_add_epi16(stp1_19, stp1_20);
- stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);
- stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);
- stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);
- stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);
-
- stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);
- stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);
- stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);
- stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);
- stp2_28 = _mm_add_epi16(stp1_27, stp1_28);
- stp2_29 = _mm_add_epi16(stp1_26, stp1_29);
- stp2_30 = _mm_add_epi16(stp1_25, stp1_30);
- stp2_31 = _mm_add_epi16(stp1_24, stp1_31);
- }
-
- // Stage7
- {
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
-
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
- const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);
- const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);
-
- stp1_0 = _mm_add_epi16(stp2_0, stp2_15);
- stp1_1 = _mm_add_epi16(stp2_1, stp2_14);
- stp1_2 = _mm_add_epi16(stp2_2, stp2_13);
- stp1_3 = _mm_add_epi16(stp2_3, stp2_12);
- stp1_4 = _mm_add_epi16(stp2_4, stp2_11);
- stp1_5 = _mm_add_epi16(stp2_5, stp2_10);
- stp1_6 = _mm_add_epi16(stp2_6, stp2_9);
- stp1_7 = _mm_add_epi16(stp2_7, stp2_8);
- stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);
- stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);
- stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);
- stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);
- stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);
- stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);
- stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);
- stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);
-
- stp1_16 = stp2_16;
- stp1_17 = stp2_17;
- stp1_18 = stp2_18;
- stp1_19 = stp2_19;
-
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,
- stg4_0, stg6_0, stg4_0, stp1_20, stp1_27,
- stp1_21, stp1_26)
- MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,
- stg4_0, stg6_0, stg4_0, stp1_22, stp1_25,
- stp1_23, stp1_24)
-
- stp1_28 = stp2_28;
- stp1_29 = stp2_29;
- stp1_30 = stp2_30;
- stp1_31 = stp2_31;
- }
+ IDCT32_1D
// final stage
if (i < 4) {