diff options
author | Jingning Han <jingning@google.com> | 2015-08-03 10:50:32 -0700 |
---|---|---|
committer | Jingning Han <jingning@google.com> | 2015-08-03 11:59:50 -0700 |
commit | bfad9d2fe6e3a17e1deff19977c8c0907828bb6f (patch) | |
tree | 59b2c5b344ea9f7a2271201a1294207b90caab5d /vp9/common/mips | |
parent | 7723b8df6a4be00625258f6177df7d898ba6c33e (diff) | |
download | libvpx-bfad9d2fe6e3a17e1deff19977c8c0907828bb6f.tar libvpx-bfad9d2fe6e3a17e1deff19977c8c0907828bb6f.tar.gz libvpx-bfad9d2fe6e3a17e1deff19977c8c0907828bb6f.tar.bz2 libvpx-bfad9d2fe6e3a17e1deff19977c8c0907828bb6f.zip |
Move inverse transfrom dspr2 functions from vp9 to vpx_dsp
Change-Id: Ia9cf7c31cab4ba3dd6b9bb668c4b3e84bd55cf69
Diffstat (limited to 'vp9/common/mips')
-rw-r--r-- | vp9/common/mips/dspr2/vp9_common_dspr2.h | 59 | ||||
-rw-r--r-- | vp9/common/mips/dspr2/vp9_itrans16_dspr2.c | 1211 | ||||
-rw-r--r-- | vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c | 1074 | ||||
-rw-r--r-- | vp9/common/mips/dspr2/vp9_itrans32_dspr2.c | 1076 | ||||
-rw-r--r-- | vp9/common/mips/dspr2/vp9_itrans4_dspr2.c | 345 | ||||
-rw-r--r-- | vp9/common/mips/dspr2/vp9_itrans8_dspr2.c | 655 |
6 files changed, 3 insertions, 4417 deletions
diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h deleted file mode 100644 index f8690fd61..000000000 --- a/vp9/common/mips/dspr2/vp9_common_dspr2.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_ -#define VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_ - -#include <assert.h> - -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" -#include "vpx_dsp/mips/common_dspr2.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#if HAVE_DSPR2 -#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) ({ \ - \ - int32_t tmp, out; \ - int dct_cost_rounding = DCT_CONST_ROUNDING; \ - int in = input; \ - \ - __asm__ __volatile__ ( \ - /* out = dct_const_round_shift(input_dc * cospi_16_64); */ \ - "mtlo %[dct_cost_rounding], $ac1 \n\t"\ - "mthi $zero, $ac1 \n\t"\ - "madd $ac1, %[in], %[cospi_16_64] \n\t"\ - "extp %[tmp], $ac1, 31 \n\t"\ - \ - /* out = dct_const_round_shift(out * cospi_16_64); */ \ - "mtlo %[dct_cost_rounding], $ac2 \n\t"\ - "mthi $zero, $ac2 \n\t"\ - "madd $ac2, %[tmp], %[cospi_16_64] \n\t"\ - "extp %[out], $ac2, 31 \n\t"\ - \ - : [tmp] "=&r" (tmp), [out] "=r" (out) \ - : [in] "r" (in), \ - [dct_cost_rounding] "r" (dct_cost_rounding), \ - [cospi_16_64] "r" (cospi_16_64) \ - ); \ - out; }) - -void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); - -#endif // #if HAVE_DSPR2 -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_ diff --git a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c index aca6550c8..6ca83a00c 100644 --- a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c @@ -16,1074 +16,11 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_idct.h" -#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" +#include "vpx_dsp/mips/inv_txfm_dspr2.h" #include "vpx_dsp/txfm_common.h" #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void idct16_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { - int i; - int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; - int step1_10, step1_11, step1_12, step1_13; - int step2_0, step2_1, step2_2, step2_3; - int step2_8, step2_9, step2_10, step2_11; - int step2_12, step2_13, step2_14, step2_15; - int load1, load2, load3, load4, load5, load6, load7, load8; - int result1, result2, result3, result4; - const int const_2_power_13 = 8192; - - for (i = no_rows; i--; ) { - /* prefetch row */ - prefetch_load((const uint8_t *)(input + 16)); - - __asm__ __volatile__ ( - "lh %[load1], 0(%[input]) \n\t" - "lh %[load2], 16(%[input]) \n\t" - "lh %[load3], 8(%[input]) \n\t" - "lh %[load4], 24(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "add %[result1], %[load1], %[load2] \n\t" - "sub %[result2], %[load1], %[load2] \n\t" - "madd $ac1, %[result1], %[cospi_16_64] \n\t" - "madd $ac2, %[result2], %[cospi_16_64] \n\t" - "extp %[step2_0], $ac1, 31 \n\t" - "extp %[step2_1], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "madd $ac3, %[load3], %[cospi_24_64] \n\t" - "msub $ac3, %[load4], %[cospi_8_64] \n\t" - "extp %[step2_2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "madd $ac1, %[load3], %[cospi_8_64] \n\t" - "madd $ac1, %[load4], %[cospi_24_64] \n\t" - "extp %[step2_3], $ac1, 31 \n\t" - - "add %[step1_0], %[step2_0], %[step2_3] \n\t" - "add %[step1_1], %[step2_1], %[step2_2] \n\t" - "sub %[step1_2], %[step2_1], %[step2_2] \n\t" - "sub %[step1_3], %[step2_0], %[step2_3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), - [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), - [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), - [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( - "lh %[load5], 2(%[input]) \n\t" - "lh %[load6], 30(%[input]) \n\t" - "lh %[load7], 18(%[input]) \n\t" - "lh %[load8], 14(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load5], %[cospi_30_64] \n\t" - "msub $ac1, %[load6], %[cospi_2_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load7], %[cospi_14_64] \n\t" - "msub $ac3, %[load8], %[cospi_18_64] \n\t" - "extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load7], %[cospi_18_64] \n\t" - "madd $ac1, %[load8], %[cospi_14_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load5], %[cospi_2_64] \n\t" - "madd $ac2, %[load6], %[cospi_30_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "sub %[load5], %[result1], %[result2] \n\t" - "sub %[load6], %[result4], %[result3] \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load6], %[cospi_24_64] \n\t" - "msub $ac1, %[load5], %[cospi_8_64] \n\t" - "madd $ac3, %[load5], %[cospi_24_64] \n\t" - "madd $ac3, %[load6], %[cospi_8_64] \n\t" - - "extp %[step2_9], $ac1, 31 \n\t" - "extp %[step2_14], $ac3, 31 \n\t" - "add %[step2_8], %[result1], %[result2] \n\t" - "add %[step2_15], %[result4], %[result3] \n\t" - - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [load7] "=&r" (load7), [load8] "=&r" (load8), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), - [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), - [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - __asm__ __volatile__ ( - "lh %[load1], 10(%[input]) \n\t" - "lh %[load2], 22(%[input]) \n\t" - "lh %[load3], 26(%[input]) \n\t" - "lh %[load4], 6(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_22_64] \n\t" - "msub $ac1, %[load2], %[cospi_10_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load3], %[cospi_6_64] \n\t" - "msub $ac3, %[load4], %[cospi_26_64] \n\t" - "extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load1], %[cospi_10_64] \n\t" - "madd $ac1, %[load2], %[cospi_22_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load3], %[cospi_26_64] \n\t" - "madd $ac2, %[load4], %[cospi_6_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[result2], %[result1] \n\t" - "sub %[load2], %[result4], %[result3] \n\t" - - "msub $ac1, %[load1], %[cospi_24_64] \n\t" - "msub $ac1, %[load2], %[cospi_8_64] \n\t" - "madd $ac3, %[load2], %[cospi_24_64] \n\t" - "msub $ac3, %[load1], %[cospi_8_64] \n\t" - - "extp %[step2_10], $ac1, 31 \n\t" - "extp %[step2_13], $ac3, 31 \n\t" - "add %[step2_11], %[result1], %[result2] \n\t" - "add %[step2_12], %[result4], %[result3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), - [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), - [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - __asm__ __volatile__ ( - "lh %[load5], 4(%[input]) \n\t" - "lh %[load6], 28(%[input]) \n\t" - "lh %[load7], 20(%[input]) \n\t" - "lh %[load8], 12(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load5], %[cospi_28_64] \n\t" - "msub $ac1, %[load6], %[cospi_4_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load7], %[cospi_12_64] \n\t" - "msub $ac3, %[load8], %[cospi_20_64] \n\t" - "extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load7], %[cospi_20_64] \n\t" - "madd $ac1, %[load8], %[cospi_12_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load5], %[cospi_4_64] \n\t" - "madd $ac2, %[load6], %[cospi_28_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load5], %[result4], %[result3] \n\t" - "sub %[load5], %[load5], %[result1] \n\t" - "add %[load5], %[load5], %[result2] \n\t" - - "sub %[load6], %[result1], %[result2] \n\t" - "sub %[load6], %[load6], %[result3] \n\t" - "add %[load6], %[load6], %[result4] \n\t" - - "madd $ac1, %[load5], %[cospi_16_64] \n\t" - "madd $ac3, %[load6], %[cospi_16_64] \n\t" - - "extp %[step1_5], $ac1, 31 \n\t" - "extp %[step1_6], $ac3, 31 \n\t" - "add %[step1_4], %[result1], %[result2] \n\t" - "add %[step1_7], %[result4], %[result3] \n\t" - - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [load7] "=&r" (load7), [load8] "=&r" (load8), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), - [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - "sub %[load5], %[step2_14], %[step2_13] \n\t" - "sub %[load5], %[load5], %[step2_9] \n\t" - "add %[load5], %[load5], %[step2_10] \n\t" - - "madd $ac0, %[load5], %[cospi_16_64] \n\t" - - "sub %[load6], %[step2_14], %[step2_13] \n\t" - "sub %[load6], %[load6], %[step2_10] \n\t" - "add %[load6], %[load6], %[step2_9] \n\t" - - "madd $ac1, %[load6], %[cospi_16_64] \n\t" - - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load5], %[step2_15], %[step2_12] \n\t" - "sub %[load5], %[load5], %[step2_8] \n\t" - "add %[load5], %[load5], %[step2_11] \n\t" - - "madd $ac2, %[load5], %[cospi_16_64] \n\t" - - "sub %[load6], %[step2_15], %[step2_12] \n\t" - "sub %[load6], %[load6], %[step2_11] \n\t" - "add %[load6], %[load6], %[step2_8] \n\t" - - "madd $ac3, %[load6], %[cospi_16_64] \n\t" - - "extp %[step1_10], $ac0, 31 \n\t" - "extp %[step1_13], $ac1, 31 \n\t" - "extp %[step1_11], $ac2, 31 \n\t" - "extp %[step1_12], $ac3, 31 \n\t" - - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), - [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) - : [const_2_power_13] "r" (const_2_power_13), - [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), - [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), - [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), - [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( - "add %[load5], %[step1_0], %[step1_7] \n\t" - "add %[load5], %[load5], %[step2_12] \n\t" - "add %[load5], %[load5], %[step2_15] \n\t" - "add %[load6], %[step1_1], %[step1_6] \n\t" - "add %[load6], %[load6], %[step2_13] \n\t" - "add %[load6], %[load6], %[step2_14] \n\t" - "sh %[load5], 0(%[output]) \n\t" - "sh %[load6], 32(%[output]) \n\t" - "sub %[load5], %[step1_1], %[step1_6] \n\t" - "add %[load5], %[load5], %[step2_9] \n\t" - "add %[load5], %[load5], %[step2_10] \n\t" - "sub %[load6], %[step1_0], %[step1_7] \n\t" - "add %[load6], %[load6], %[step2_8] \n\t" - "add %[load6], %[load6], %[step2_11] \n\t" - "sh %[load5], 192(%[output]) \n\t" - "sh %[load6], 224(%[output]) \n\t" - "sub %[load5], %[step1_0], %[step1_7] \n\t" - "sub %[load5], %[load5], %[step2_8] \n\t" - "sub %[load5], %[load5], %[step2_11] \n\t" - "sub %[load6], %[step1_1], %[step1_6] \n\t" - "sub %[load6], %[load6], %[step2_9] \n\t" - "sub %[load6], %[load6], %[step2_10] \n\t" - "sh %[load5], 256(%[output]) \n\t" - "sh %[load6], 288(%[output]) \n\t" - "add %[load5], %[step1_1], %[step1_6] \n\t" - "sub %[load5], %[load5], %[step2_13] \n\t" - "sub %[load5], %[load5], %[step2_14] \n\t" - "add %[load6], %[step1_0], %[step1_7] \n\t" - "sub %[load6], %[load6], %[step2_12] \n\t" - "sub %[load6], %[load6], %[step2_15] \n\t" - "sh %[load5], 448(%[output]) \n\t" - "sh %[load6], 480(%[output]) \n\t" - - : [load5] "=&r" (load5), [load6] "=&r" (load6) - : [output] "r" (output), - [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), - [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), - [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), - [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), - [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), - [step2_14] "r" (step2_14), [step2_15] "r" (step2_15) - ); - - __asm__ __volatile__ ( - "add %[load5], %[step1_2], %[step1_5] \n\t" - "add %[load5], %[load5], %[step1_13] \n\t" - "add %[load6], %[step1_3], %[step1_4] \n\t" - "add %[load6], %[load6], %[step1_12] \n\t" - "sh %[load5], 64(%[output]) \n\t" - "sh %[load6], 96(%[output]) \n\t" - "sub %[load5], %[step1_3], %[step1_4] \n\t" - "add %[load5], %[load5], %[step1_11] \n\t" - "sub %[load6], %[step1_2], %[step1_5] \n\t" - "add %[load6], %[load6], %[step1_10] \n\t" - "sh %[load5], 128(%[output]) \n\t" - "sh %[load6], 160(%[output]) \n\t" - "sub %[load5], %[step1_2], %[step1_5] \n\t" - "sub %[load5], %[load5], %[step1_10] \n\t" - "sub %[load6], %[step1_3], %[step1_4] \n\t" - "sub %[load6], %[load6], %[step1_11] \n\t" - "sh %[load5], 320(%[output]) \n\t" - "sh %[load6], 352(%[output]) \n\t" - "add %[load5], %[step1_3], %[step1_4] \n\t" - "sub %[load5], %[load5], %[step1_12] \n\t" - "add %[load6], %[step1_2], %[step1_5] \n\t" - "sub %[load6], %[load6], %[step1_13] \n\t" - "sh %[load5], 384(%[output]) \n\t" - "sh %[load6], 416(%[output]) \n\t" - - : [load5] "=&r" (load5), [load6] "=&r" (load6) - : [output] "r" (output), - [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), - [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), - [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), - [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) - ); - - input += 16; - output += 1; - } -} - -static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int i; - int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; - int step1_8, step1_9, step1_10, step1_11; - int step1_12, step1_13, step1_14, step1_15; - int step2_0, step2_1, step2_2, step2_3; - int step2_8, step2_9, step2_10, step2_11; - int step2_12, step2_13, step2_14, step2_15; - int load1, load2, load3, load4, load5, load6, load7, load8; - int result1, result2, result3, result4; - const int const_2_power_13 = 8192; - uint8_t *dest_pix; - uint8_t *cm = vpx_ff_cropTbl; - - /* prefetch vpx_ff_cropTbl */ - prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); - prefetch_load(vpx_ff_cropTbl + 128); - prefetch_load(vpx_ff_cropTbl + 160); - prefetch_load(vpx_ff_cropTbl + 192); - prefetch_load(vpx_ff_cropTbl + 224); - - for (i = 0; i < 16; ++i) { - dest_pix = (dest + i); - __asm__ __volatile__ ( - "lh %[load1], 0(%[input]) \n\t" - "lh %[load2], 16(%[input]) \n\t" - "lh %[load3], 8(%[input]) \n\t" - "lh %[load4], 24(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "add %[result1], %[load1], %[load2] \n\t" - "sub %[result2], %[load1], %[load2] \n\t" - "madd $ac1, %[result1], %[cospi_16_64] \n\t" - "madd $ac2, %[result2], %[cospi_16_64] \n\t" - "extp %[step2_0], $ac1, 31 \n\t" - "extp %[step2_1], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "madd $ac3, %[load3], %[cospi_24_64] \n\t" - "msub $ac3, %[load4], %[cospi_8_64] \n\t" - "extp %[step2_2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "madd $ac1, %[load3], %[cospi_8_64] \n\t" - "madd $ac1, %[load4], %[cospi_24_64] \n\t" - "extp %[step2_3], $ac1, 31 \n\t" - - "add %[step1_0], %[step2_0], %[step2_3] \n\t" - "add %[step1_1], %[step2_1], %[step2_2] \n\t" - "sub %[step1_2], %[step2_1], %[step2_2] \n\t" - "sub %[step1_3], %[step2_0], %[step2_3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), - [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), - [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), - [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( - "lh %[load5], 2(%[input]) \n\t" - "lh %[load6], 30(%[input]) \n\t" - "lh %[load7], 18(%[input]) \n\t" - "lh %[load8], 14(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load5], %[cospi_30_64] \n\t" - "msub $ac1, %[load6], %[cospi_2_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load7], %[cospi_14_64] \n\t" - "msub $ac3, %[load8], %[cospi_18_64] \n\t" - "extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load7], %[cospi_18_64] \n\t" - "madd $ac1, %[load8], %[cospi_14_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load5], %[cospi_2_64] \n\t" - "madd $ac2, %[load6], %[cospi_30_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "sub %[load5], %[result1], %[result2] \n\t" - "sub %[load6], %[result4], %[result3] \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load6], %[cospi_24_64] \n\t" - "msub $ac1, %[load5], %[cospi_8_64] \n\t" - "madd $ac3, %[load5], %[cospi_24_64] \n\t" - "madd $ac3, %[load6], %[cospi_8_64] \n\t" - - "extp %[step2_9], $ac1, 31 \n\t" - "extp %[step2_14], $ac3, 31 \n\t" - "add %[step2_8], %[result1], %[result2] \n\t" - "add %[step2_15], %[result4], %[result3] \n\t" - - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [load7] "=&r" (load7), [load8] "=&r" (load8), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), - [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), - [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - __asm__ __volatile__ ( - "lh %[load1], 10(%[input]) \n\t" - "lh %[load2], 22(%[input]) \n\t" - "lh %[load3], 26(%[input]) \n\t" - "lh %[load4], 6(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_22_64] \n\t" - "msub $ac1, %[load2], %[cospi_10_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load3], %[cospi_6_64] \n\t" - "msub $ac3, %[load4], %[cospi_26_64] \n\t" - "extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load1], %[cospi_10_64] \n\t" - "madd $ac1, %[load2], %[cospi_22_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load3], %[cospi_26_64] \n\t" - "madd $ac2, %[load4], %[cospi_6_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[result2], %[result1] \n\t" - "sub %[load2], %[result4], %[result3] \n\t" - - "msub $ac1, %[load1], %[cospi_24_64] \n\t" - "msub $ac1, %[load2], %[cospi_8_64] \n\t" - "madd $ac3, %[load2], %[cospi_24_64] \n\t" - "msub $ac3, %[load1], %[cospi_8_64] \n\t" - - "extp %[step2_10], $ac1, 31 \n\t" - "extp %[step2_13], $ac3, 31 \n\t" - "add %[step2_11], %[result1], %[result2] \n\t" - "add %[step2_12], %[result4], %[result3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), - [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), - [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - __asm__ __volatile__ ( - "lh %[load5], 4(%[input]) \n\t" - "lh %[load6], 28(%[input]) \n\t" - "lh %[load7], 20(%[input]) \n\t" - "lh %[load8], 12(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load5], %[cospi_28_64] \n\t" - "msub $ac1, %[load6], %[cospi_4_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load7], %[cospi_12_64] \n\t" - "msub $ac3, %[load8], %[cospi_20_64] \n\t" - "extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load7], %[cospi_20_64] \n\t" - "madd $ac1, %[load8], %[cospi_12_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load5], %[cospi_4_64] \n\t" - "madd $ac2, %[load6], %[cospi_28_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load5], %[result4], %[result3] \n\t" - "sub %[load5], %[load5], %[result1] \n\t" - "add %[load5], %[load5], %[result2] \n\t" - - "sub %[load6], %[result1], %[result2] \n\t" - "sub %[load6], %[load6], %[result3] \n\t" - "add %[load6], %[load6], %[result4] \n\t" - - "madd $ac1, %[load5], %[cospi_16_64] \n\t" - "madd $ac3, %[load6], %[cospi_16_64] \n\t" - - "extp %[step1_5], $ac1, 31 \n\t" - "extp %[step1_6], $ac3, 31 \n\t" - - "add %[step1_4], %[result1], %[result2] \n\t" - "add %[step1_7], %[result4], %[result3] \n\t" - - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [load7] "=&r" (load7), [load8] "=&r" (load8), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), - [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - "sub %[load5], %[step2_14], %[step2_13] \n\t" - "sub %[load5], %[load5], %[step2_9] \n\t" - "add %[load5], %[load5], %[step2_10] \n\t" - - "madd $ac0, %[load5], %[cospi_16_64] \n\t" - - "sub %[load6], %[step2_14], %[step2_13] \n\t" - "sub %[load6], %[load6], %[step2_10] \n\t" - "add %[load6], %[load6], %[step2_9] \n\t" - - "madd $ac1, %[load6], %[cospi_16_64] \n\t" - - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load5], %[step2_15], %[step2_12] \n\t" - "sub %[load5], %[load5], %[step2_8] \n\t" - "add %[load5], %[load5], %[step2_11] \n\t" - - "madd $ac2, %[load5], %[cospi_16_64] \n\t" - - "sub %[load6], %[step2_15], %[step2_12] \n\t" - "sub %[load6], %[load6], %[step2_11] \n\t" - "add %[load6], %[load6], %[step2_8] \n\t" - - "madd $ac3, %[load6], %[cospi_16_64] \n\t" - - "extp %[step1_10], $ac0, 31 \n\t" - "extp %[step1_13], $ac1, 31 \n\t" - "extp %[step1_11], $ac2, 31 \n\t" - "extp %[step1_12], $ac3, 31 \n\t" - - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), - [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) - : [const_2_power_13] "r" (const_2_power_13), - [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), - [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), - [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), - [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), - [cospi_16_64] "r" (cospi_16_64) - ); - - step1_8 = step2_8 + step2_11; - step1_9 = step2_9 + step2_10; - step1_14 = step2_13 + step2_14; - step1_15 = step2_12 + step2_15; - - __asm__ __volatile__ ( - "lbu %[load7], 0(%[dest_pix]) \n\t" - "add %[load5], %[step1_0], %[step1_7] \n\t" - "add %[load5], %[load5], %[step1_15] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "add %[load6], %[step1_1], %[step1_6] \n\t" - "add %[load6], %[load6], %[step1_14] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "add %[load5], %[step1_2], %[step1_5] \n\t" - "add %[load5], %[load5], %[step1_13] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "add %[load6], %[step1_3], %[step1_4] \n\t" - "add %[load6], %[load6], %[step1_12] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "sub %[load5], %[step1_3], %[step1_4] \n\t" - "add %[load5], %[load5], %[step1_11] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "sub %[load6], %[step1_2], %[step1_5] \n\t" - "add %[load6], %[load6], %[step1_10] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "sub %[load5], %[step1_1], %[step1_6] \n\t" - "lbu %[load7], 0(%[dest_pix]) \n\t" - "add %[load5], %[load5], %[step1_9] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "sub %[load6], %[step1_0], %[step1_7] \n\t" - "add %[load6], %[load6], %[step1_8] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "sub %[load5], %[step1_0], %[step1_7] \n\t" - "sub %[load5], %[load5], %[step1_8] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "sub %[load6], %[step1_1], %[step1_6] \n\t" - "sub %[load6], %[load6], %[step1_9] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "sub %[load5], %[step1_2], %[step1_5] \n\t" - "sub %[load5], %[load5], %[step1_10] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "sub %[load6], %[step1_3], %[step1_4] \n\t" - "sub %[load6], %[load6], %[step1_11] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "add %[load5], %[step1_3], %[step1_4] \n\t" - "sub %[load5], %[load5], %[step1_12] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "add %[load6], %[step1_2], %[step1_5] \n\t" - "sub %[load6], %[load6], %[step1_13] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "add %[load5], %[step1_1], %[step1_6] \n\t" - "sub %[load5], %[load5], %[step1_14] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "add %[load6], %[step1_0], %[step1_7] \n\t" - "sub %[load6], %[load6], %[step1_15] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - - : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7), - [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), - [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), - [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), - [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), - [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), - [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), - [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), - [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) - ); - - input += 16; - } -} - -void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[16 * 16]); - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); - - // First transform rows - idct16_rows_dspr2(input, out, 16); - - // Then transform columns and add to dest - idct16_cols_add_blk_dspr2(out, dest, dest_stride); -} - -static void iadst16_dspr2(const int16_t *input, int16_t *output) { - int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; - - int x0 = input[15]; - int x1 = input[0]; - int x2 = input[13]; - int x3 = input[2]; - int x4 = input[11]; - int x5 = input[4]; - int x6 = input[9]; - int x7 = input[6]; - int x8 = input[7]; - int x9 = input[8]; - int x10 = input[5]; - int x11 = input[10]; - int x12 = input[3]; - int x13 = input[12]; - int x14 = input[1]; - int x15 = input[14]; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 - | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { - output[0] = output[1] = output[2] = output[3] = output[4] - = output[5] = output[6] = output[7] = output[8] - = output[9] = output[10] = output[11] = output[12] - = output[13] = output[14] = output[15] = 0; - return; - } - - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = dct_const_round_shift(s0 + s8); - x1 = dct_const_round_shift(s1 + s9); - x2 = dct_const_round_shift(s2 + s10); - x3 = dct_const_round_shift(s3 + s11); - x4 = dct_const_round_shift(s4 + s12); - x5 = dct_const_round_shift(s5 + s13); - x6 = dct_const_round_shift(s6 + s14); - x7 = dct_const_round_shift(s7 + s15); - x8 = dct_const_round_shift(s0 - s8); - x9 = dct_const_round_shift(s1 - s9); - x10 = dct_const_round_shift(s2 - s10); - x11 = dct_const_round_shift(s3 - s11); - x12 = dct_const_round_shift(s4 - s12); - x13 = dct_const_round_shift(s5 - s13); - x14 = dct_const_round_shift(s6 - s14); - x15 = dct_const_round_shift(s7 - s15); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = s0 + s4; - x1 = s1 + s5; - x2 = s2 + s6; - x3 = s3 + s7; - x4 = s0 - s4; - x5 = s1 - s5; - x6 = s2 - s6; - x7 = s3 - s7; - x8 = dct_const_round_shift(s8 + s12); - x9 = dct_const_round_shift(s9 + s13); - x10 = dct_const_round_shift(s10 + s14); - x11 = dct_const_round_shift(s11 + s15); - x12 = dct_const_round_shift(s8 - s12); - x13 = dct_const_round_shift(s9 - s13); - x14 = dct_const_round_shift(s10 - s14); - x15 = dct_const_round_shift(s11 - s15); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = s0 + s2; - x1 = s1 + s3; - x2 = s0 - s2; - x3 = s1 - s3; - x4 = dct_const_round_shift(s4 + s6); - x5 = dct_const_round_shift(s5 + s7); - x6 = dct_const_round_shift(s4 - s6); - x7 = dct_const_round_shift(s5 - s7); - x8 = s8 + s10; - x9 = s9 + s11; - x10 = s8 - s10; - x11 = s9 - s11; - x12 = dct_const_round_shift(s12 + s14); - x13 = dct_const_round_shift(s13 + s15); - x14 = dct_const_round_shift(s12 - s14); - x15 = dct_const_round_shift(s13 - s15); - - // stage 4 - s2 = (- cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (- x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (- x10 + x11); - s14 = (- cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = dct_const_round_shift(s2); - x3 = dct_const_round_shift(s3); - x6 = dct_const_round_shift(s6); - x7 = dct_const_round_shift(s7); - x10 = dct_const_round_shift(s10); - x11 = dct_const_round_shift(s11); - x14 = dct_const_round_shift(s14); - x15 = dct_const_round_shift(s15); - - output[0] = x0; - output[1] = -x8; - output[2] = x12; - output[3] = -x4; - output[4] = x6; - output[5] = x14; - output[6] = x10; - output[7] = x2; - output[8] = x3; - output[9] = x11; - output[10] = x15; - output[11] = x7; - output[12] = x5; - output[13] = -x13; - output[14] = x9; - output[15] = -x1; -} - void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, int pitch, int tx_type) { int i, j; @@ -1168,150 +105,4 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, break; } } - -void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[16 * 16]); - int16_t *outptr = out; - uint32_t i; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); - - // First transform rows. Since all non-zero dct coefficients are in - // upper-left 4x4 area, we only need to calculate first 4 rows here. - idct16_rows_dspr2(input, outptr, 4); - - outptr += 4; - for (i = 0; i < 6; ++i) { - __asm__ __volatile__ ( - "sw $zero, 0(%[outptr]) \n\t" - "sw $zero, 32(%[outptr]) \n\t" - "sw $zero, 64(%[outptr]) \n\t" - "sw $zero, 96(%[outptr]) \n\t" - "sw $zero, 128(%[outptr]) \n\t" - "sw $zero, 160(%[outptr]) \n\t" - "sw $zero, 192(%[outptr]) \n\t" - "sw $zero, 224(%[outptr]) \n\t" - "sw $zero, 256(%[outptr]) \n\t" - "sw $zero, 288(%[outptr]) \n\t" - "sw $zero, 320(%[outptr]) \n\t" - "sw $zero, 352(%[outptr]) \n\t" - "sw $zero, 384(%[outptr]) \n\t" - "sw $zero, 416(%[outptr]) \n\t" - "sw $zero, 448(%[outptr]) \n\t" - "sw $zero, 480(%[outptr]) \n\t" - - : - : [outptr] "r" (outptr) - ); - - outptr += 2; - } - - // Then transform columns - idct16_cols_add_blk_dspr2(out, dest, dest_stride); -} - -void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - uint32_t pos = 45; - int32_t out; - int32_t r; - int32_t a1, absa1; - int32_t vector_a1; - int32_t t1, t2, t3, t4; - int32_t vector_1, vector_2, vector_3, vector_4; - - /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - - : - : [pos] "r" (pos) - ); - - out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); - __asm__ __volatile__ ( - "addi %[out], %[out], 32 \n\t" - "sra %[a1], %[out], 6 \n\t" - - : [out] "+r" (out), [a1] "=r" (a1) - : - ); - - if (a1 < 0) { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__ ( - "abs %[absa1], %[a1] \n\t" - "replv.qb %[vector_a1], %[absa1] \n\t" - - : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); - - for (r = 16; r--;) { - __asm__ __volatile__ ( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "lw %[t3], 8(%[dest]) \n\t" - "lw %[t4], 12(%[dest]) \n\t" - "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "sw %[vector_3], 8(%[dest]) \n\t" - "sw %[vector_4], 12(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), - [dest] "+&r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); - } - } else { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__ ( - "replv.qb %[vector_a1], %[a1] \n\t" - - : [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); - - for (r = 16; r--;) { - __asm__ __volatile__ ( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "lw %[t3], 8(%[dest]) \n\t" - "lw %[t4], 12(%[dest]) \n\t" - "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "sw %[vector_3], 8(%[dest]) \n\t" - "sw %[vector_4], 12(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), - [dest] "+&r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); - } - } -} #endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c deleted file mode 100644 index 48da85cbc..000000000 --- a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c +++ /dev/null @@ -1,1074 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <assert.h> - -#include "./vpx_config.h" -#include "./vp9_rtcd.h" -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_blockd.h" -#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" -#include "vpx_dsp/txfm_common.h" -#include "vpx_ports/mem.h" - -#if HAVE_DSPR2 -void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; - int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; - int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; - int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26; - int16_t step1_27, step1_28, step1_29, step1_30, step1_31; - int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; - int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; - int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; - int16_t step2_28, step2_29, step2_30, step2_31; - int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; - int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; - int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27; - int16_t step3_28, step3_29, step3_30, step3_31; - int temp0, temp1, temp2, temp3; - int load1, load2, load3, load4; - int result1, result2; - int i, temp21; - uint8_t *dest_pix, *dest_pix1; - const int const_2_power_13 = 8192; - uint8_t *cm = vpx_ff_cropTbl; - - /* prefetch vpx_ff_cropTbl */ - prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); - prefetch_load(vpx_ff_cropTbl + 128); - prefetch_load(vpx_ff_cropTbl + 160); - prefetch_load(vpx_ff_cropTbl + 192); - prefetch_load(vpx_ff_cropTbl + 224); - - for (i = 0; i < 32; ++i) { - dest_pix = dest + i; - dest_pix1 = dest + i + 31 * dest_stride; - - __asm__ __volatile__ ( - "lh %[load1], 2(%[input]) \n\t" - "lh %[load2], 62(%[input]) \n\t" - "lh %[load3], 34(%[input]) \n\t" - "lh %[load4], 30(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_31_64] \n\t" - "msub $ac1, %[load2], %[cospi_1_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_1_64] \n\t" - "madd $ac3, %[load2], %[cospi_31_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_15_64] \n\t" - "msub $ac2, %[load4], %[cospi_17_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_17_64] \n\t" - "madd $ac1, %[load4], %[cospi_15_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp3], %[temp2] \n\t" - "sub %[load2], %[temp0], %[temp1] \n\t" - - "madd $ac1, %[load1], %[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "madd $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - - "extp %[step1_17], $ac1, 31 \n\t" - "extp %[step1_30], $ac3, 31 \n\t" - "add %[step1_16], %[temp0], %[temp1] \n\t" - "add %[step1_31], %[temp2], %[temp3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), - [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), - [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) - ); - - __asm__ __volatile__ ( - "lh %[load1], 18(%[input]) \n\t" - "lh %[load2], 46(%[input]) \n\t" - "lh %[load3], 50(%[input]) \n\t" - "lh %[load4], 14(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_23_64] \n\t" - "msub $ac1, %[load2], %[cospi_9_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_9_64] \n\t" - "madd $ac3, %[load2], %[cospi_23_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_7_64] \n\t" - "msub $ac2, %[load4], %[cospi_25_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_25_64] \n\t" - "madd $ac1, %[load4], %[cospi_7_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - - "msub $ac1, %[load1], %[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "msub $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - - "extp %[step1_18], $ac1, 31 \n\t" - "extp %[step1_29], $ac3, 31 \n\t" - "add %[step1_19], %[temp0], %[temp1] \n\t" - "add %[step1_28], %[temp2], %[temp3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), - [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), - [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) - ); - - __asm__ __volatile__ ( - "lh %[load1], 10(%[input]) \n\t" - "lh %[load2], 54(%[input]) \n\t" - "lh %[load3], 42(%[input]) \n\t" - "lh %[load4], 22(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_27_64] \n\t" - "msub $ac1, %[load2], %[cospi_5_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_5_64] \n\t" - "madd $ac3, %[load2], %[cospi_27_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_11_64] \n\t" - "msub $ac2, %[load4], %[cospi_21_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_21_64] \n\t" - "madd $ac1, %[load4], %[cospi_11_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp0], %[temp1] \n\t" - "sub %[load2], %[temp3], %[temp2] \n\t" - - "madd $ac1, %[load2], %[cospi_12_64] \n\t" - "msub $ac1, %[load1], %[cospi_20_64] \n\t" - "madd $ac3, %[load1], %[cospi_12_64] \n\t" - "madd $ac3, %[load2], %[cospi_20_64] \n\t" - - "extp %[step1_21], $ac1, 31 \n\t" - "extp %[step1_26], $ac3, 31 \n\t" - "add %[step1_20], %[temp0], %[temp1] \n\t" - "add %[step1_27], %[temp2], %[temp3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), - [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), - [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), - [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) - ); - - __asm__ __volatile__ ( - "lh %[load1], 26(%[input]) \n\t" - "lh %[load2], 38(%[input]) \n\t" - "lh %[load3], 58(%[input]) \n\t" - "lh %[load4], 6(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_19_64] \n\t" - "msub $ac1, %[load2], %[cospi_13_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_13_64] \n\t" - "madd $ac3, %[load2], %[cospi_19_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_3_64] \n\t" - "msub $ac2, %[load4], %[cospi_29_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_29_64] \n\t" - "madd $ac1, %[load4], %[cospi_3_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - "msub $ac1, %[load1], %[cospi_12_64] \n\t" - "msub $ac1, %[load2], %[cospi_20_64] \n\t" - "msub $ac3, %[load1], %[cospi_20_64] \n\t" - "madd $ac3, %[load2], %[cospi_12_64] \n\t" - "extp %[step1_22], $ac1, 31 \n\t" - "extp %[step1_25], $ac3, 31 \n\t" - "add %[step1_23], %[temp0], %[temp1] \n\t" - "add %[step1_24], %[temp2], %[temp3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), - [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), - [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), - [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) - ); - - __asm__ __volatile__ ( - "lh %[load1], 4(%[input]) \n\t" - "lh %[load2], 60(%[input]) \n\t" - "lh %[load3], 36(%[input]) \n\t" - "lh %[load4], 28(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_30_64] \n\t" - "msub $ac1, %[load2], %[cospi_2_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_2_64] \n\t" - "madd $ac3, %[load2], %[cospi_30_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_14_64] \n\t" - "msub $ac2, %[load4], %[cospi_18_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_18_64] \n\t" - "madd $ac1, %[load4], %[cospi_14_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp0], %[temp1] \n\t" - "sub %[load2], %[temp3], %[temp2] \n\t" - "msub $ac1, %[load1], %[cospi_8_64] \n\t" - "madd $ac1, %[load2], %[cospi_24_64] \n\t" - "madd $ac3, %[load1], %[cospi_24_64] \n\t" - "madd $ac3, %[load2], %[cospi_8_64] \n\t" - "extp %[step2_9], $ac1, 31 \n\t" - "extp %[step2_14], $ac3, 31 \n\t" - "add %[step2_8], %[temp0], %[temp1] \n\t" - "add %[step2_15], %[temp2], %[temp3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), - [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), - [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), - [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) - ); - - __asm__ __volatile__ ( - "lh %[load1], 20(%[input]) \n\t" - "lh %[load2], 44(%[input]) \n\t" - "lh %[load3], 52(%[input]) \n\t" - "lh %[load4], 12(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_22_64] \n\t" - "msub $ac1, %[load2], %[cospi_10_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_10_64] \n\t" - "madd $ac3, %[load2], %[cospi_22_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_6_64] \n\t" - "msub $ac2, %[load4], %[cospi_26_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_26_64] \n\t" - "madd $ac1, %[load4], %[cospi_6_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - "msub $ac1, %[load1], %[cospi_24_64] \n\t" - "msub $ac1, %[load2], %[cospi_8_64] \n\t" - "madd $ac3, %[load2], %[cospi_24_64] \n\t" - "msub $ac3, %[load1], %[cospi_8_64] \n\t" - "extp %[step2_10], $ac1, 31 \n\t" - "extp %[step2_13], $ac3, 31 \n\t" - "add %[step2_11], %[temp0], %[temp1] \n\t" - "add %[step2_12], %[temp2], %[temp3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), - [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), - [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), - [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) - ); - - __asm__ __volatile__ ( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "sub %[temp0], %[step2_14], %[step2_13] \n\t" - "sub %[temp0], %[temp0], %[step2_9] \n\t" - "add %[temp0], %[temp0], %[step2_10] \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "sub %[temp1], %[step2_14], %[step2_13] \n\t" - "add %[temp1], %[temp1], %[step2_9] \n\t" - "sub %[temp1], %[temp1], %[step2_10] \n\t" - "madd $ac1, %[temp1], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "sub %[temp0], %[step2_15], %[step2_12] \n\t" - "sub %[temp0], %[temp0], %[step2_8] \n\t" - "add %[temp0], %[temp0], %[step2_11] \n\t" - "madd $ac2, %[temp0], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "sub %[temp1], %[step2_15], %[step2_12] \n\t" - "add %[temp1], %[temp1], %[step2_8] \n\t" - "sub %[temp1], %[temp1], %[step2_11] \n\t" - "madd $ac3, %[temp1], %[cospi_16_64] \n\t" - - "add %[step3_8], %[step2_8], %[step2_11] \n\t" - "add %[step3_9], %[step2_9], %[step2_10] \n\t" - "add %[step3_14], %[step2_13], %[step2_14] \n\t" - "add %[step3_15], %[step2_12], %[step2_15] \n\t" - "extp %[step3_10], $ac0, 31 \n\t" - "extp %[step3_13], $ac1, 31 \n\t" - "extp %[step3_11], $ac2, 31 \n\t" - "extp %[step3_12], $ac3, 31 \n\t" - - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), - [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), - [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), - [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) - : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8), - [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), - [step2_11] "r" (step2_11), [step2_12] "r" (step2_12), - [step2_13] "r" (step2_13), [step2_14] "r" (step2_14), - [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64) - ); - - step2_18 = step1_17 - step1_18; - step2_29 = step1_30 - step1_29; - - __asm__ __volatile__ ( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" - "extp %[step3_18], $ac0, 31 \n\t" - - : [step3_18] "=r" (step3_18) - : [const_2_power_13] "r" (const_2_power_13), - [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; - step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_19 = step1_16 - step1_19; - step2_28 = step1_31 - step1_28; - - __asm__ __volatile__ ( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" - "extp %[step3_19], $ac0, 31 \n\t" - - : [step3_19] "=r" (step3_19) - : [const_2_power_13] "r" (const_2_power_13), - [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; - step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_16 = step1_16 + step1_19; - step3_17 = step1_17 + step1_18; - step3_30 = step1_29 + step1_30; - step3_31 = step1_28 + step1_31; - - step2_20 = step1_23 - step1_20; - step2_27 = step1_24 - step1_27; - - __asm__ __volatile__ ( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" - "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" - "extp %[step3_20], $ac0, 31 \n\t" - - : [step3_20] "=r" (step3_20) - : [const_2_power_13] "r" (const_2_power_13), - [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; - step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_21 = step1_22 - step1_21; - step2_26 = step1_25 - step1_26; - - __asm__ __volatile__ ( - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" - "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" - "extp %[step3_21], $ac1, 31 \n\t" - - : [step3_21] "=r" (step3_21) - : [const_2_power_13] "r" (const_2_power_13), - [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; - step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_22 = step1_21 + step1_22; - step3_23 = step1_20 + step1_23; - step3_24 = step1_24 + step1_27; - step3_25 = step1_25 + step1_26; - - step2_16 = step3_16 + step3_23; - step2_17 = step3_17 + step3_22; - step2_18 = step3_18 + step3_21; - step2_19 = step3_19 + step3_20; - step2_20 = step3_19 - step3_20; - step2_21 = step3_18 - step3_21; - step2_22 = step3_17 - step3_22; - step2_23 = step3_16 - step3_23; - - step2_24 = step3_31 - step3_24; - step2_25 = step3_30 - step3_25; - step2_26 = step3_29 - step3_26; - step2_27 = step3_28 - step3_27; - step2_28 = step3_28 + step3_27; - step2_29 = step3_29 + step3_26; - step2_30 = step3_30 + step3_25; - step2_31 = step3_31 + step3_24; - - __asm__ __volatile__ ( - "lh %[load1], 0(%[input]) \n\t" - "lh %[load2], 32(%[input]) \n\t" - "lh %[load3], 16(%[input]) \n\t" - "lh %[load4], 48(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "add %[result1], %[load1], %[load2] \n\t" - "sub %[result2], %[load1], %[load2] \n\t" - "madd $ac1, %[result1], %[cospi_16_64] \n\t" - "madd $ac2, %[result2], %[cospi_16_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "madd $ac3, %[load3], %[cospi_24_64] \n\t" - "msub $ac3, %[load4], %[cospi_8_64] \n\t" - "extp %[temp2], $ac3, 31 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "madd $ac1, %[load3], %[cospi_8_64] \n\t" - "madd $ac1, %[load4], %[cospi_24_64] \n\t" - "extp %[temp3], $ac1, 31 \n\t" - "add %[step1_0], %[temp0], %[temp3] \n\t" - "add %[step1_1], %[temp1], %[temp2] \n\t" - "sub %[step1_2], %[temp1], %[temp2] \n\t" - "sub %[step1_3], %[temp0], %[temp3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), - [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( - "lh %[load1], 8(%[input]) \n\t" - "lh %[load2], 56(%[input]) \n\t" - "lh %[load3], 40(%[input]) \n\t" - "lh %[load4], 24(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_12_64] \n\t" - "msub $ac2, %[load4], %[cospi_20_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_20_64] \n\t" - "madd $ac1, %[load4], %[cospi_12_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp3], %[temp2] \n\t" - "sub %[load1], %[load1], %[temp0] \n\t" - "add %[load1], %[load1], %[temp1] \n\t" - "sub %[load2], %[temp0], %[temp1] \n\t" - "sub %[load2], %[load2], %[temp2] \n\t" - "add %[load2], %[load2], %[temp3] \n\t" - "madd $ac1, %[load1], %[cospi_16_64] \n\t" - "madd $ac3, %[load2], %[cospi_16_64] \n\t" - - "extp %[step1_5], $ac1, 31 \n\t" - "extp %[step1_6], $ac3, 31 \n\t" - "add %[step1_4], %[temp0], %[temp1] \n\t" - "add %[step1_7], %[temp3], %[temp2] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), - [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_16_64] "r" (cospi_16_64) - ); - - step2_0 = step1_0 + step1_7; - step2_1 = step1_1 + step1_6; - step2_2 = step1_2 + step1_5; - step2_3 = step1_3 + step1_4; - step2_4 = step1_3 - step1_4; - step2_5 = step1_2 - step1_5; - step2_6 = step1_1 - step1_6; - step2_7 = step1_0 - step1_7; - - // stage 7 - step1_0 = step2_0 + step3_15; - step1_1 = step2_1 + step3_14; - step1_2 = step2_2 + step3_13; - step1_3 = step2_3 + step3_12; - step1_4 = step2_4 + step3_11; - step1_5 = step2_5 + step3_10; - step1_6 = step2_6 + step3_9; - step1_7 = step2_7 + step3_8; - step1_8 = step2_7 - step3_8; - step1_9 = step2_6 - step3_9; - step1_10 = step2_5 - step3_10; - step1_11 = step2_4 - step3_11; - step1_12 = step2_3 - step3_12; - step1_13 = step2_2 - step3_13; - step1_14 = step2_1 - step3_14; - step1_15 = step2_0 - step3_15; - - __asm__ __volatile__ ( - "sub %[temp0], %[step2_27], %[step2_20] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_20], $ac0, 31 \n\t" - - : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) - : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20), - [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64) - ); - - temp21 = (step2_20 + step2_27) * cospi_16_64; - step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__ ( - "sub %[temp0], %[step2_26], %[step2_21] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_21], $ac0, 31 \n\t" - - : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) - : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26), - [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64) - ); - - temp21 = (step2_21 + step2_26) * cospi_16_64; - step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__ ( - "sub %[temp0], %[step2_25], %[step2_22] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_22], $ac0, 31 \n\t" - - : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) - : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25), - [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64) - ); - - temp21 = (step2_22 + step2_25) * cospi_16_64; - step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__ ( - "sub %[temp0], %[step2_24], %[step2_23] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_23], $ac0, 31 \n\t" - - : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) - : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24), - [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64) - ); - - temp21 = (step2_23 + step2_24) * cospi_16_64; - step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__ ( - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_0], %[step2_31] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_1], %[step2_30] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_2], %[step2_29] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_3], %[step2_28] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), - [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), - [step2_28] "r" (step2_28), [step2_29] "r" (step2_29), - [step2_30] "r" (step2_30), [step2_31] "r" (step2_31) - ); - - step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6); - step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6); - step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6); - step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6); - - __asm__ __volatile__ ( - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_15] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_14] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_13] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_12] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), - [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) - ); - - __asm__ __volatile__ ( - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_4], %[step1_27] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_5], %[step1_26] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_6], %[step1_25] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_7], %[step1_24] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), - [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), - [step1_24] "r" (step1_24), [step1_25] "r" (step1_25), - [step1_26] "r" (step1_26), [step1_27] "r" (step1_27) - ); - - step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6); - step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6); - step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6); - step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6); - - __asm__ __volatile__ ( - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_15] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_14] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_13] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_12] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), - [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) - ); - - __asm__ __volatile__ ( - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_8], %[step1_23] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_9], %[step1_22] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_10], %[step1_21] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_11], %[step1_20] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), - [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), - [step1_20] "r" (step1_20), [step1_21] "r" (step1_21), - [step1_22] "r" (step1_22), [step1_23] "r" (step1_23) - ); - - step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6); - step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6); - step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6); - step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6); - - __asm__ __volatile__ ( - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_15] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_14] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_13] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_12] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), - [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) - ); - - __asm__ __volatile__ ( - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_12], %[step2_19] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_13], %[step2_18] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_14], %[step2_17] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_15], %[step2_16] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), - [step1_14] "r" (step1_14), [step1_15] "r" (step1_15), - [step2_16] "r" (step2_16), [step2_17] "r" (step2_17), - [step2_18] "r" (step2_18), [step2_19] "r" (step2_19) - ); - - step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); - step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); - step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6); - step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6); - - __asm__ __volatile__ ( - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_15] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_14] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_13] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_12] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), - [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) - ); - - input += 32; - } -} -#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c deleted file mode 100644 index b4b0d248c..000000000 --- a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c +++ /dev/null @@ -1,1076 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <assert.h> -#include <stdio.h> - -#include "./vpx_config.h" -#include "./vp9_rtcd.h" -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_blockd.h" -#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" -#include "vpx_dsp/txfm_common.h" - -#if HAVE_DSPR2 -static void idct32_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { - int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; - int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; - int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; - int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; - int16_t step1_28, step1_29, step1_30, step1_31; - int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; - int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; - int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; - int16_t step2_28, step2_29, step2_30, step2_31; - int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; - int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; - int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; - int16_t step3_29, step3_30, step3_31; - int temp0, temp1, temp2, temp3; - int load1, load2, load3, load4; - int result1, result2; - int temp21; - int i; - const int const_2_power_13 = 8192; - const int32_t *input_int; - - for (i = no_rows; i--; ) { - input_int = (const int32_t *)input; - - if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | - input_int[4] | input_int[5] | input_int[6] | input_int[7] | - input_int[8] | input_int[9] | input_int[10] | input_int[11] | - input_int[12] | input_int[13] | input_int[14] | input_int[15])) { - input += 32; - - __asm__ __volatile__ ( - "sh $zero, 0(%[output]) \n\t" - "sh $zero, 64(%[output]) \n\t" - "sh $zero, 128(%[output]) \n\t" - "sh $zero, 192(%[output]) \n\t" - "sh $zero, 256(%[output]) \n\t" - "sh $zero, 320(%[output]) \n\t" - "sh $zero, 384(%[output]) \n\t" - "sh $zero, 448(%[output]) \n\t" - "sh $zero, 512(%[output]) \n\t" - "sh $zero, 576(%[output]) \n\t" - "sh $zero, 640(%[output]) \n\t" - "sh $zero, 704(%[output]) \n\t" - "sh $zero, 768(%[output]) \n\t" - "sh $zero, 832(%[output]) \n\t" - "sh $zero, 896(%[output]) \n\t" - "sh $zero, 960(%[output]) \n\t" - "sh $zero, 1024(%[output]) \n\t" - "sh $zero, 1088(%[output]) \n\t" - "sh $zero, 1152(%[output]) \n\t" - "sh $zero, 1216(%[output]) \n\t" - "sh $zero, 1280(%[output]) \n\t" - "sh $zero, 1344(%[output]) \n\t" - "sh $zero, 1408(%[output]) \n\t" - "sh $zero, 1472(%[output]) \n\t" - "sh $zero, 1536(%[output]) \n\t" - "sh $zero, 1600(%[output]) \n\t" - "sh $zero, 1664(%[output]) \n\t" - "sh $zero, 1728(%[output]) \n\t" - "sh $zero, 1792(%[output]) \n\t" - "sh $zero, 1856(%[output]) \n\t" - "sh $zero, 1920(%[output]) \n\t" - "sh $zero, 1984(%[output]) \n\t" - - : - : [output] "r" (output) - ); - - output += 1; - - continue; - } - - /* prefetch row */ - prefetch_load((const uint8_t *)(input + 32)); - prefetch_load((const uint8_t *)(input + 48)); - - __asm__ __volatile__ ( - "lh %[load1], 2(%[input]) \n\t" - "lh %[load2], 62(%[input]) \n\t" - "lh %[load3], 34(%[input]) \n\t" - "lh %[load4], 30(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_31_64] \n\t" - "msub $ac1, %[load2], %[cospi_1_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_1_64] \n\t" - "madd $ac3, %[load2], %[cospi_31_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_15_64] \n\t" - "msub $ac2, %[load4], %[cospi_17_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_17_64] \n\t" - "madd $ac1, %[load4], %[cospi_15_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp3], %[temp2] \n\t" - "sub %[load2], %[temp0], %[temp1] \n\t" - - "madd $ac1, %[load1], %[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "madd $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - - "extp %[step1_17], $ac1, 31 \n\t" - "extp %[step1_30], $ac3, 31 \n\t" - "add %[step1_16], %[temp0], %[temp1] \n\t" - "add %[step1_31], %[temp2], %[temp3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), - [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), - [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) - ); - - __asm__ __volatile__ ( - "lh %[load1], 18(%[input]) \n\t" - "lh %[load2], 46(%[input]) \n\t" - "lh %[load3], 50(%[input]) \n\t" - "lh %[load4], 14(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_23_64] \n\t" - "msub $ac1, %[load2], %[cospi_9_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_9_64] \n\t" - "madd $ac3, %[load2], %[cospi_23_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_7_64] \n\t" - "msub $ac2, %[load4], %[cospi_25_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_25_64] \n\t" - "madd $ac1, %[load4], %[cospi_7_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - - "msub $ac1, %[load1], %[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "msub $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - - "extp %[step1_18], $ac1, 31 \n\t" - "extp %[step1_29], $ac3, 31 \n\t" - "add %[step1_19], %[temp0], %[temp1] \n\t" - "add %[step1_28], %[temp2], %[temp3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), - [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), - [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) - ); - - __asm__ __volatile__ ( - "lh %[load1], 10(%[input]) \n\t" - "lh %[load2], 54(%[input]) \n\t" - "lh %[load3], 42(%[input]) \n\t" - "lh %[load4], 22(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_27_64] \n\t" - "msub $ac1, %[load2], %[cospi_5_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_5_64] \n\t" - "madd $ac3, %[load2], %[cospi_27_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_11_64] \n\t" - "msub $ac2, %[load4], %[cospi_21_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_21_64] \n\t" - "madd $ac1, %[load4], %[cospi_11_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp0], %[temp1] \n\t" - "sub %[load2], %[temp3], %[temp2] \n\t" - - "madd $ac1, %[load2], %[cospi_12_64] \n\t" - "msub $ac1, %[load1], %[cospi_20_64] \n\t" - "madd $ac3, %[load1], %[cospi_12_64] \n\t" - "madd $ac3, %[load2], %[cospi_20_64] \n\t" - - "extp %[step1_21], $ac1, 31 \n\t" - "extp %[step1_26], $ac3, 31 \n\t" - "add %[step1_20], %[temp0], %[temp1] \n\t" - "add %[step1_27], %[temp2], %[temp3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), - [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), - [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), - [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) - ); - - __asm__ __volatile__ ( - "lh %[load1], 26(%[input]) \n\t" - "lh %[load2], 38(%[input]) \n\t" - "lh %[load3], 58(%[input]) \n\t" - "lh %[load4], 6(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_19_64] \n\t" - "msub $ac1, %[load2], %[cospi_13_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_13_64] \n\t" - "madd $ac3, %[load2], %[cospi_19_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_3_64] \n\t" - "msub $ac2, %[load4], %[cospi_29_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_29_64] \n\t" - "madd $ac1, %[load4], %[cospi_3_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - - "msub $ac1, %[load1], %[cospi_12_64] \n\t" - "msub $ac1, %[load2], %[cospi_20_64] \n\t" - "msub $ac3, %[load1], %[cospi_20_64] \n\t" - "madd $ac3, %[load2], %[cospi_12_64] \n\t" - - "extp %[step1_22], $ac1, 31 \n\t" - "extp %[step1_25], $ac3, 31 \n\t" - "add %[step1_23], %[temp0], %[temp1] \n\t" - "add %[step1_24], %[temp2], %[temp3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), - [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), - [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), - [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) - ); - - __asm__ __volatile__ ( - "lh %[load1], 4(%[input]) \n\t" - "lh %[load2], 60(%[input]) \n\t" - "lh %[load3], 36(%[input]) \n\t" - "lh %[load4], 28(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_30_64] \n\t" - "msub $ac1, %[load2], %[cospi_2_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_2_64] \n\t" - "madd $ac3, %[load2], %[cospi_30_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_14_64] \n\t" - "msub $ac2, %[load4], %[cospi_18_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_18_64] \n\t" - "madd $ac1, %[load4], %[cospi_14_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp0], %[temp1] \n\t" - "sub %[load2], %[temp3], %[temp2] \n\t" - - "msub $ac1, %[load1], %[cospi_8_64] \n\t" - "madd $ac1, %[load2], %[cospi_24_64] \n\t" - "madd $ac3, %[load1], %[cospi_24_64] \n\t" - "madd $ac3, %[load2], %[cospi_8_64] \n\t" - - "extp %[step2_9], $ac1, 31 \n\t" - "extp %[step2_14], $ac3, 31 \n\t" - "add %[step2_8], %[temp0], %[temp1] \n\t" - "add %[step2_15], %[temp2], %[temp3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), - [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), - [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), - [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) - ); - - __asm__ __volatile__ ( - "lh %[load1], 20(%[input]) \n\t" - "lh %[load2], 44(%[input]) \n\t" - "lh %[load3], 52(%[input]) \n\t" - "lh %[load4], 12(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_22_64] \n\t" - "msub $ac1, %[load2], %[cospi_10_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_10_64] \n\t" - "madd $ac3, %[load2], %[cospi_22_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_6_64] \n\t" - "msub $ac2, %[load4], %[cospi_26_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_26_64] \n\t" - "madd $ac1, %[load4], %[cospi_6_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - - "msub $ac1, %[load1], %[cospi_24_64] \n\t" - "msub $ac1, %[load2], %[cospi_8_64] \n\t" - "madd $ac3, %[load2], %[cospi_24_64] \n\t" - "msub $ac3, %[load1], %[cospi_8_64] \n\t" - - "extp %[step2_10], $ac1, 31 \n\t" - "extp %[step2_13], $ac3, 31 \n\t" - "add %[step2_11], %[temp0], %[temp1] \n\t" - "add %[step2_12], %[temp2], %[temp3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), - [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), - [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), - [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) - ); - - __asm__ __volatile__ ( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "sub %[temp0], %[step2_14], %[step2_13] \n\t" - "sub %[temp0], %[temp0], %[step2_9] \n\t" - "add %[temp0], %[temp0], %[step2_10] \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "sub %[temp1], %[step2_14], %[step2_13] \n\t" - "add %[temp1], %[temp1], %[step2_9] \n\t" - "sub %[temp1], %[temp1], %[step2_10] \n\t" - "madd $ac1, %[temp1], %[cospi_16_64] \n\t" - - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "sub %[temp0], %[step2_15], %[step2_12] \n\t" - "sub %[temp0], %[temp0], %[step2_8] \n\t" - "add %[temp0], %[temp0], %[step2_11] \n\t" - "madd $ac2, %[temp0], %[cospi_16_64] \n\t" - - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "sub %[temp1], %[step2_15], %[step2_12] \n\t" - "add %[temp1], %[temp1], %[step2_8] \n\t" - "sub %[temp1], %[temp1], %[step2_11] \n\t" - "madd $ac3, %[temp1], %[cospi_16_64] \n\t" - - "add %[step3_8], %[step2_8], %[step2_11] \n\t" - "add %[step3_9], %[step2_9], %[step2_10] \n\t" - "add %[step3_14], %[step2_13], %[step2_14] \n\t" - "add %[step3_15], %[step2_12], %[step2_15] \n\t" - - "extp %[step3_10], $ac0, 31 \n\t" - "extp %[step3_13], $ac1, 31 \n\t" - "extp %[step3_11], $ac2, 31 \n\t" - "extp %[step3_12], $ac3, 31 \n\t" - - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), - [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), - [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), - [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) - : [const_2_power_13] "r" (const_2_power_13), - [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), - [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), - [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), - [step2_14] "r" (step2_14), [step2_15] "r" (step2_15), - [cospi_16_64] "r" (cospi_16_64) - ); - - step2_18 = step1_17 - step1_18; - step2_29 = step1_30 - step1_29; - - __asm__ __volatile__ ( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" - "extp %[step3_18], $ac0, 31 \n\t" - - : [step3_18] "=r" (step3_18) - : [const_2_power_13] "r" (const_2_power_13), - [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; - step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_19 = step1_16 - step1_19; - step2_28 = step1_31 - step1_28; - - __asm__ __volatile__ ( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" - "extp %[step3_19], $ac0, 31 \n\t" - - : [step3_19] "=r" (step3_19) - : [const_2_power_13] "r" (const_2_power_13), - [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; - step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_16 = step1_16 + step1_19; - step3_17 = step1_17 + step1_18; - step3_30 = step1_29 + step1_30; - step3_31 = step1_28 + step1_31; - - step2_20 = step1_23 - step1_20; - step2_27 = step1_24 - step1_27; - - __asm__ __volatile__ ( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" - "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" - "extp %[step3_20], $ac0, 31 \n\t" - - : [step3_20] "=r" (step3_20) - : [const_2_power_13] "r" (const_2_power_13), - [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; - step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_21 = step1_22 - step1_21; - step2_26 = step1_25 - step1_26; - - __asm__ __volatile__ ( - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" - "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" - "extp %[step3_21], $ac1, 31 \n\t" - - : [step3_21] "=r" (step3_21) - : [const_2_power_13] "r" (const_2_power_13), - [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; - step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_22 = step1_21 + step1_22; - step3_23 = step1_20 + step1_23; - step3_24 = step1_24 + step1_27; - step3_25 = step1_25 + step1_26; - - step2_16 = step3_16 + step3_23; - step2_17 = step3_17 + step3_22; - step2_18 = step3_18 + step3_21; - step2_19 = step3_19 + step3_20; - step2_20 = step3_19 - step3_20; - step2_21 = step3_18 - step3_21; - step2_22 = step3_17 - step3_22; - step2_23 = step3_16 - step3_23; - - step2_24 = step3_31 - step3_24; - step2_25 = step3_30 - step3_25; - step2_26 = step3_29 - step3_26; - step2_27 = step3_28 - step3_27; - step2_28 = step3_28 + step3_27; - step2_29 = step3_29 + step3_26; - step2_30 = step3_30 + step3_25; - step2_31 = step3_31 + step3_24; - - __asm__ __volatile__ ( - "lh %[load1], 0(%[input]) \n\t" - "lh %[load2], 32(%[input]) \n\t" - "lh %[load3], 16(%[input]) \n\t" - "lh %[load4], 48(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "add %[result1], %[load1], %[load2] \n\t" - "sub %[result2], %[load1], %[load2] \n\t" - "madd $ac1, %[result1], %[cospi_16_64] \n\t" - "madd $ac2, %[result2], %[cospi_16_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "madd $ac3, %[load3], %[cospi_24_64] \n\t" - "msub $ac3, %[load4], %[cospi_8_64] \n\t" - "extp %[temp2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "madd $ac1, %[load3], %[cospi_8_64] \n\t" - "madd $ac1, %[load4], %[cospi_24_64] \n\t" - "extp %[temp3], $ac1, 31 \n\t" - - "add %[step1_0], %[temp0], %[temp3] \n\t" - "add %[step1_1], %[temp1], %[temp2] \n\t" - "sub %[step1_2], %[temp1], %[temp2] \n\t" - "sub %[step1_3], %[temp0], %[temp3] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), - [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_16_64] "r" (cospi_16_64), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - - ); - - __asm__ __volatile__ ( - "lh %[load1], 8(%[input]) \n\t" - "lh %[load2], 56(%[input]) \n\t" - "lh %[load3], 40(%[input]) \n\t" - "lh %[load4], 24(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_12_64] \n\t" - "msub $ac2, %[load4], %[cospi_20_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_20_64] \n\t" - "madd $ac1, %[load4], %[cospi_12_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp3], %[temp2] \n\t" - "sub %[load1], %[load1], %[temp0] \n\t" - "add %[load1], %[load1], %[temp1] \n\t" - - "sub %[load2], %[temp0], %[temp1] \n\t" - "sub %[load2], %[load2], %[temp2] \n\t" - "add %[load2], %[load2], %[temp3] \n\t" - - "madd $ac1, %[load1], %[cospi_16_64] \n\t" - "madd $ac3, %[load2], %[cospi_16_64] \n\t" - - "extp %[step1_5], $ac1, 31 \n\t" - "extp %[step1_6], $ac3, 31 \n\t" - "add %[step1_4], %[temp0], %[temp1] \n\t" - "add %[step1_7], %[temp3], %[temp2] \n\t" - - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), - [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_16_64] "r" (cospi_16_64) - ); - - step2_0 = step1_0 + step1_7; - step2_1 = step1_1 + step1_6; - step2_2 = step1_2 + step1_5; - step2_3 = step1_3 + step1_4; - step2_4 = step1_3 - step1_4; - step2_5 = step1_2 - step1_5; - step2_6 = step1_1 - step1_6; - step2_7 = step1_0 - step1_7; - - step1_0 = step2_0 + step3_15; - step1_1 = step2_1 + step3_14; - step1_2 = step2_2 + step3_13; - step1_3 = step2_3 + step3_12; - step1_4 = step2_4 + step3_11; - step1_5 = step2_5 + step3_10; - step1_6 = step2_6 + step3_9; - step1_7 = step2_7 + step3_8; - step1_8 = step2_7 - step3_8; - step1_9 = step2_6 - step3_9; - step1_10 = step2_5 - step3_10; - step1_11 = step2_4 - step3_11; - step1_12 = step2_3 - step3_12; - step1_13 = step2_2 - step3_13; - step1_14 = step2_1 - step3_14; - step1_15 = step2_0 - step3_15; - - __asm__ __volatile__ ( - "sub %[temp0], %[step2_27], %[step2_20] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_20], $ac0, 31 \n\t" - - : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) - : [const_2_power_13] "r" (const_2_power_13), - [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), - [cospi_16_64] "r" (cospi_16_64) - ); - - temp21 = (step2_20 + step2_27) * cospi_16_64; - step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__ ( - "sub %[temp0], %[step2_26], %[step2_21] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_21], $ac0, 31 \n\t" - - : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) - : [const_2_power_13] "r" (const_2_power_13), - [step2_26] "r" (step2_26), [step2_21] "r" (step2_21), - [cospi_16_64] "r" (cospi_16_64) - ); - - temp21 = (step2_21 + step2_26) * cospi_16_64; - step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__ ( - "sub %[temp0], %[step2_25], %[step2_22] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_22], $ac0, 31 \n\t" - - : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) - : [const_2_power_13] "r" (const_2_power_13), - [step2_25] "r" (step2_25), [step2_22] "r" (step2_22), - [cospi_16_64] "r" (cospi_16_64) - ); - - temp21 = (step2_22 + step2_25) * cospi_16_64; - step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__ ( - "sub %[temp0], %[step2_24], %[step2_23] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_23], $ac0, 31 \n\t" - - : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) - : [const_2_power_13] "r" (const_2_power_13), - [step2_24] "r" (step2_24), [step2_23] "r" (step2_23), - [cospi_16_64] "r" (cospi_16_64) - ); - - temp21 = (step2_23 + step2_24) * cospi_16_64; - step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - // final stage - output[0 * 32] = step1_0 + step2_31; - output[1 * 32] = step1_1 + step2_30; - output[2 * 32] = step1_2 + step2_29; - output[3 * 32] = step1_3 + step2_28; - output[4 * 32] = step1_4 + step1_27; - output[5 * 32] = step1_5 + step1_26; - output[6 * 32] = step1_6 + step1_25; - output[7 * 32] = step1_7 + step1_24; - output[8 * 32] = step1_8 + step1_23; - output[9 * 32] = step1_9 + step1_22; - output[10 * 32] = step1_10 + step1_21; - output[11 * 32] = step1_11 + step1_20; - output[12 * 32] = step1_12 + step2_19; - output[13 * 32] = step1_13 + step2_18; - output[14 * 32] = step1_14 + step2_17; - output[15 * 32] = step1_15 + step2_16; - output[16 * 32] = step1_15 - step2_16; - output[17 * 32] = step1_14 - step2_17; - output[18 * 32] = step1_13 - step2_18; - output[19 * 32] = step1_12 - step2_19; - output[20 * 32] = step1_11 - step1_20; - output[21 * 32] = step1_10 - step1_21; - output[22 * 32] = step1_9 - step1_22; - output[23 * 32] = step1_8 - step1_23; - output[24 * 32] = step1_7 - step1_24; - output[25 * 32] = step1_6 - step1_25; - output[26 * 32] = step1_5 - step1_26; - output[27 * 32] = step1_4 - step1_27; - output[28 * 32] = step1_3 - step2_28; - output[29 * 32] = step1_2 - step2_29; - output[30 * 32] = step1_1 - step2_30; - output[31 * 32] = step1_0 - step2_31; - - input += 32; - output += 1; - } -} - -void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[32 * 32]); - int16_t *outptr = out; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); - - // Rows - idct32_rows_dspr2(input, outptr, 32); - - // Columns - vp9_idct32_cols_add_blk_dspr2(out, dest, dest_stride); -} - -void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, - int stride) { - DECLARE_ALIGNED(32, int16_t, out[32 * 32]); - int16_t *outptr = out; - uint32_t i; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); - - // Rows - idct32_rows_dspr2(input, outptr, 8); - - outptr += 8; - __asm__ __volatile__ ( - "sw $zero, 0(%[outptr]) \n\t" - "sw $zero, 4(%[outptr]) \n\t" - "sw $zero, 8(%[outptr]) \n\t" - "sw $zero, 12(%[outptr]) \n\t" - "sw $zero, 16(%[outptr]) \n\t" - "sw $zero, 20(%[outptr]) \n\t" - "sw $zero, 24(%[outptr]) \n\t" - "sw $zero, 28(%[outptr]) \n\t" - "sw $zero, 32(%[outptr]) \n\t" - "sw $zero, 36(%[outptr]) \n\t" - "sw $zero, 40(%[outptr]) \n\t" - "sw $zero, 44(%[outptr]) \n\t" - - : - : [outptr] "r" (outptr) - ); - - for (i = 0; i < 31; ++i) { - outptr += 32; - - __asm__ __volatile__ ( - "sw $zero, 0(%[outptr]) \n\t" - "sw $zero, 4(%[outptr]) \n\t" - "sw $zero, 8(%[outptr]) \n\t" - "sw $zero, 12(%[outptr]) \n\t" - "sw $zero, 16(%[outptr]) \n\t" - "sw $zero, 20(%[outptr]) \n\t" - "sw $zero, 24(%[outptr]) \n\t" - "sw $zero, 28(%[outptr]) \n\t" - "sw $zero, 32(%[outptr]) \n\t" - "sw $zero, 36(%[outptr]) \n\t" - "sw $zero, 40(%[outptr]) \n\t" - "sw $zero, 44(%[outptr]) \n\t" - - : - : [outptr] "r" (outptr) - ); - } - - // Columns - vp9_idct32_cols_add_blk_dspr2(out, dest, stride); -} - -void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, - int stride) { - int r, out; - int32_t a1, absa1; - int32_t vector_a1; - int32_t t1, t2, t3, t4; - int32_t vector_1, vector_2, vector_3, vector_4; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - - : - : [pos] "r" (pos) - ); - - out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); - __asm__ __volatile__ ( - "addi %[out], %[out], 32 \n\t" - "sra %[a1], %[out], 6 \n\t" - - : [out] "+r" (out), [a1] "=r" (a1) - : - ); - - if (a1 < 0) { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__ ( - "abs %[absa1], %[a1] \n\t" - "replv.qb %[vector_a1], %[absa1] \n\t" - - : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); - - for (r = 32; r--;) { - __asm__ __volatile__ ( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "lw %[t3], 8(%[dest]) \n\t" - "lw %[t4], 12(%[dest]) \n\t" - "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "sw %[vector_3], 8(%[dest]) \n\t" - "sw %[vector_4], 12(%[dest]) \n\t" - - "lw %[t1], 16(%[dest]) \n\t" - "lw %[t2], 20(%[dest]) \n\t" - "lw %[t3], 24(%[dest]) \n\t" - "lw %[t4], 28(%[dest]) \n\t" - "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 16(%[dest]) \n\t" - "sw %[vector_2], 20(%[dest]) \n\t" - "sw %[vector_3], 24(%[dest]) \n\t" - "sw %[vector_4], 28(%[dest]) \n\t" - - "add %[dest], %[dest], %[stride] \n\t" - - : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), - [dest] "+&r" (dest) - : [stride] "r" (stride), [vector_a1] "r" (vector_a1) - ); - } - } else { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__ ( - "replv.qb %[vector_a1], %[a1] \n\t" - - : [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); - - for (r = 32; r--;) { - __asm__ __volatile__ ( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "lw %[t3], 8(%[dest]) \n\t" - "lw %[t4], 12(%[dest]) \n\t" - "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "sw %[vector_3], 8(%[dest]) \n\t" - "sw %[vector_4], 12(%[dest]) \n\t" - - "lw %[t1], 16(%[dest]) \n\t" - "lw %[t2], 20(%[dest]) \n\t" - "lw %[t3], 24(%[dest]) \n\t" - "lw %[t4], 28(%[dest]) \n\t" - "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 16(%[dest]) \n\t" - "sw %[vector_2], 20(%[dest]) \n\t" - "sw %[vector_3], 24(%[dest]) \n\t" - "sw %[vector_4], 28(%[dest]) \n\t" - - "add %[dest], %[dest], %[stride] \n\t" - - : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), - [dest] "+&r" (dest) - : [stride] "r" (stride), [vector_a1] "r" (vector_a1) - ); - } - } -} -#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c index aa801ecb6..848f7c0aa 100644 --- a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c @@ -16,354 +16,11 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_idct.h" -#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" +#include "vpx_dsp/mips/inv_txfm_dspr2.h" #include "vpx_dsp/txfm_common.h" #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void vp9_idct4_rows_dspr2(const int16_t *input, int16_t *output) { - int16_t step_0, step_1, step_2, step_3; - int Temp0, Temp1, Temp2, Temp3; - const int const_2_power_13 = 8192; - int i; - - for (i = 4; i--; ) { - __asm__ __volatile__ ( - /* - temp_1 = (input[0] + input[2]) * cospi_16_64; - step_0 = dct_const_round_shift(temp_1); - - temp_2 = (input[0] - input[2]) * cospi_16_64; - step_1 = dct_const_round_shift(temp_2); - */ - "lh %[Temp0], 0(%[input]) \n\t" - "lh %[Temp1], 4(%[input]) \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "add %[Temp2], %[Temp0], %[Temp1] \n\t" - "sub %[Temp3], %[Temp0], %[Temp1] \n\t" - "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" - "lh %[Temp0], 2(%[input]) \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "extp %[step_0], $ac0, 31 \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - - "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" - "extp %[step_1], $ac1, 31 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - /* - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - step_2 = dct_const_round_shift(temp1); - */ - "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" - "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" - "extp %[step_2], $ac0, 31 \n\t" - - /* - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step_3 = dct_const_round_shift(temp2); - */ - "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" - "extp %[step_3], $ac1, 31 \n\t" - - /* - output[0] = step_0 + step_3; - output[4] = step_1 + step_2; - output[8] = step_1 - step_2; - output[12] = step_0 - step_3; - */ - "add %[Temp0], %[step_0], %[step_3] \n\t" - "sh %[Temp0], 0(%[output]) \n\t" - - "add %[Temp1], %[step_1], %[step_2] \n\t" - "sh %[Temp1], 8(%[output]) \n\t" - - "sub %[Temp2], %[step_1], %[step_2] \n\t" - "sh %[Temp2], 16(%[output]) \n\t" - - "sub %[Temp3], %[step_0], %[step_3] \n\t" - "sh %[Temp3], 24(%[output]) \n\t" - - : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), - [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [step_0] "=&r" (step_0), [step_1] "=&r" (step_1), - [step_2] "=&r" (step_2), [step_3] "=&r" (step_3), - [output] "+r" (output) - : [const_2_power_13] "r" (const_2_power_13), - [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), - [cospi_24_64] "r" (cospi_24_64), - [input] "r" (input) - ); - - input += 4; - output += 1; - } -} - -static void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int16_t step_0, step_1, step_2, step_3; - int Temp0, Temp1, Temp2, Temp3; - const int const_2_power_13 = 8192; - int i; - uint8_t *dest_pix; - uint8_t *cm = vpx_ff_cropTbl; - - /* prefetch vpx_ff_cropTbl */ - prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); - prefetch_load(vpx_ff_cropTbl + 128); - prefetch_load(vpx_ff_cropTbl + 160); - prefetch_load(vpx_ff_cropTbl + 192); - prefetch_load(vpx_ff_cropTbl + 224); - - for (i = 0; i < 4; ++i) { - dest_pix = (dest + i); - - __asm__ __volatile__ ( - /* - temp_1 = (input[0] + input[2]) * cospi_16_64; - step_0 = dct_const_round_shift(temp_1); - - temp_2 = (input[0] - input[2]) * cospi_16_64; - step_1 = dct_const_round_shift(temp_2); - */ - "lh %[Temp0], 0(%[input]) \n\t" - "lh %[Temp1], 4(%[input]) \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "add %[Temp2], %[Temp0], %[Temp1] \n\t" - "sub %[Temp3], %[Temp0], %[Temp1] \n\t" - "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" - "lh %[Temp0], 2(%[input]) \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "extp %[step_0], $ac0, 31 \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - - "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" - "extp %[step_1], $ac1, 31 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - /* - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - step_2 = dct_const_round_shift(temp1); - */ - "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" - "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" - "extp %[step_2], $ac0, 31 \n\t" - - /* - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step_3 = dct_const_round_shift(temp2); - */ - "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" - "extp %[step_3], $ac1, 31 \n\t" - - /* - output[0] = step_0 + step_3; - output[4] = step_1 + step_2; - output[8] = step_1 - step_2; - output[12] = step_0 - step_3; - */ - "add %[Temp0], %[step_0], %[step_3] \n\t" - "addi %[Temp0], %[Temp0], 8 \n\t" - "sra %[Temp0], %[Temp0], 4 \n\t" - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "add %[Temp0], %[step_1], %[step_2] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "addi %[Temp0], %[Temp0], 8 \n\t" - "sra %[Temp0], %[Temp0], 4 \n\t" - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step_1], %[step_2] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "addi %[Temp0], %[Temp0], 8 \n\t" - "sra %[Temp0], %[Temp0], 4 \n\t" - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step_0], %[step_3] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "addi %[Temp0], %[Temp0], 8 \n\t" - "sra %[Temp0], %[Temp0], 4 \n\t" - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - - : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), - [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [step_0] "=&r" (step_0), [step_1] "=&r" (step_1), - [step_2] "=&r" (step_2), [step_3] "=&r" (step_3), - [dest_pix] "+r" (dest_pix) - : [const_2_power_13] "r" (const_2_power_13), - [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), - [cospi_24_64] "r" (cospi_24_64), - [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) - ); - - input += 4; - } -} - -void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[4 * 4]); - int16_t *outptr = out; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); - - // Rows - vp9_idct4_rows_dspr2(input, outptr); - - // Columns - vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); -} - -void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - int a1, absa1; - int r; - int32_t out; - int t2, vector_a1, vector_a; - uint32_t pos = 45; - int16_t input_dc = input[0]; - - /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - - : - : [pos] "r" (pos) - ); - - out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc); - __asm__ __volatile__ ( - "addi %[out], %[out], 8 \n\t" - "sra %[a1], %[out], 4 \n\t" - - : [out] "+r" (out), [a1] "=r" (a1) - : - ); - - if (a1 < 0) { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__ ( - "abs %[absa1], %[a1] \n\t" - "replv.qb %[vector_a1], %[absa1] \n\t" - - : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); - - for (r = 4; r--;) { - __asm__ __volatile__ ( - "lw %[t2], 0(%[dest]) \n\t" - "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" - "sw %[vector_a], 0(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), - [dest] "+&r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); - } - } else { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__ ( - "replv.qb %[vector_a1], %[a1] \n\t" - : [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); - - for (r = 4; r--;) { - __asm__ __volatile__ ( - "lw %[t2], 0(%[dest]) \n\t" - "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" - "sw %[vector_a], 0(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), - [dest] "+&r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); - } - } -} - -static void iadst4_dspr2(const int16_t *input, int16_t *output) { - int s0, s1, s2, s3, s4, s5, s6, s7; - int x0, x1, x2, x3; - - x0 = input[0]; - x1 = input[1]; - x2 = input[2]; - x3 = input[3]; - - if (!(x0 | x1 | x2 | x3)) { - output[0] = output[1] = output[2] = output[3] = 0; - return; - } - - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; - s7 = x0 - x2 + x3; - - x0 = s0 + s3 + s5; - x1 = s1 - s4 - s6; - x2 = sinpi_3_9 * s7; - x3 = s2; - - s0 = x0 + x3; - s1 = x1 + x3; - s2 = x2; - s3 = x0 + x1 - x3; - - // 1-D transform scaling factor is sqrt(2). - // The overall dynamic range is 14b (input) + 14b (multiplication scaling) - // + 1b (addition) = 29b. - // Hence the output bit depth is 15b. - output[0] = dct_const_round_shift(s0); - output[1] = dct_const_round_shift(s1); - output[2] = dct_const_round_shift(s2); - output[3] = dct_const_round_shift(s3); -} - void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type) { int i, j; diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c index 5270fa17f..37f3ca9fc 100644 --- a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c @@ -15,538 +15,11 @@ #include "./vp9_rtcd.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_blockd.h" -#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" +#include "vpx_dsp/mips/inv_txfm_dspr2.h" #include "vpx_dsp/txfm_common.h" #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void idct8_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { - int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; - const int const_2_power_13 = 8192; - int Temp0, Temp1, Temp2, Temp3, Temp4; - int i; - - for (i = no_rows; i--; ) { - __asm__ __volatile__ ( - /* - temp_1 = (input[0] + input[4]) * cospi_16_64; - step2_0 = dct_const_round_shift(temp_1); - - temp_2 = (input[0] - input[4]) * cospi_16_64; - step2_1 = dct_const_round_shift(temp_2); - */ - "lh %[Temp0], 0(%[input]) \n\t" - "lh %[Temp1], 8(%[input]) \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "add %[Temp2], %[Temp0], %[Temp1] \n\t" - "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" - "extp %[Temp4], $ac0, 31 \n\t" - - "sub %[Temp3], %[Temp0], %[Temp1] \n\t" - "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - /* - temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; - step2_2 = dct_const_round_shift(temp_1); - */ - "lh %[Temp0], 4(%[input]) \n\t" - "lh %[Temp1], 12(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" - "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "extp %[Temp3], $ac0, 31 \n\t" - - /* - step1_1 = step2_1 + step2_2; - step1_2 = step2_1 - step2_2; - */ - "add %[step1_1], %[Temp2], %[Temp3] \n\t" - "sub %[step1_2], %[Temp2], %[Temp3] \n\t" - - /* - temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; - step2_3 = dct_const_round_shift(temp_2); - */ - "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" - "extp %[Temp1], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - - /* - step1_0 = step2_0 + step2_3; - step1_3 = step2_0 - step2_3; - */ - "add %[step1_0], %[Temp4], %[Temp1] \n\t" - "sub %[step1_3], %[Temp4], %[Temp1] \n\t" - - /* - temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - step1_4 = dct_const_round_shift(temp_1); - */ - "lh %[Temp0], 2(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "lh %[Temp1], 14(%[input]) \n\t" - "lh %[Temp0], 2(%[input]) \n\t" - "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" - "extp %[step1_4], $ac0, 31 \n\t" - - /* - temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1_7 = dct_const_round_shift(temp_2); - */ - "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" - "extp %[step1_7], $ac1, 31 \n\t" - - /* - temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - step1_5 = dct_const_round_shift(temp_1); - */ - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "lh %[Temp0], 10(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" - "extp %[step1_5], $ac0, 31 \n\t" - - /* - temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1_6 = dct_const_round_shift(temp_2); - */ - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "lh %[Temp0], 10(%[input]) \n\t" - "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" - "extp %[step1_6], $ac1, 31 \n\t" - - /* - temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; - temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; - */ - "sub %[Temp0], %[step1_7], %[step1_6] \n\t" - "sub %[Temp0], %[Temp0], %[step1_4] \n\t" - "add %[Temp0], %[Temp0], %[step1_5] \n\t" - "sub %[Temp1], %[step1_4], %[step1_5] \n\t" - "sub %[Temp1], %[Temp1], %[step1_6] \n\t" - "add %[Temp1], %[Temp1], %[step1_7] \n\t" - - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" - - /* - step1_4 = step1_4 + step1_5; - step1_7 = step1_6 + step1_7; - */ - "add %[step1_4], %[step1_4], %[step1_5] \n\t" - "add %[step1_7], %[step1_7], %[step1_6] \n\t" - - "extp %[step1_5], $ac0, 31 \n\t" - "extp %[step1_6], $ac1, 31 \n\t" - - "add %[Temp0], %[step1_0], %[step1_7] \n\t" - "sh %[Temp0], 0(%[output]) \n\t" - "add %[Temp1], %[step1_1], %[step1_6] \n\t" - "sh %[Temp1], 16(%[output]) \n\t" - "add %[Temp0], %[step1_2], %[step1_5] \n\t" - "sh %[Temp0], 32(%[output]) \n\t" - "add %[Temp1], %[step1_3], %[step1_4] \n\t" - "sh %[Temp1], 48(%[output]) \n\t" - - "sub %[Temp0], %[step1_3], %[step1_4] \n\t" - "sh %[Temp0], 64(%[output]) \n\t" - "sub %[Temp1], %[step1_2], %[step1_5] \n\t" - "sh %[Temp1], 80(%[output]) \n\t" - "sub %[Temp0], %[step1_1], %[step1_6] \n\t" - "sh %[Temp0], 96(%[output]) \n\t" - "sub %[Temp1], %[step1_0], %[step1_7] \n\t" - "sh %[Temp1], 112(%[output]) \n\t" - - : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), - [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), - [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), - [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), - [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), - [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [Temp4] "=&r" (Temp4) - : [const_2_power_13] "r" (const_2_power_13), - [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), - [cospi_24_64] "r" (cospi_24_64), - [output] "r" (output), [input] "r" (input) - ); - - input += 8; - output += 1; - } -} - -static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; - int Temp0, Temp1, Temp2, Temp3; - int i; - const int const_2_power_13 = 8192; - uint8_t *dest_pix; - uint8_t *cm = vpx_ff_cropTbl; - - /* prefetch vpx_ff_cropTbl */ - prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); - prefetch_load(vpx_ff_cropTbl + 128); - prefetch_load(vpx_ff_cropTbl + 160); - prefetch_load(vpx_ff_cropTbl + 192); - prefetch_load(vpx_ff_cropTbl + 224); - - for (i = 0; i < 8; ++i) { - dest_pix = (dest + i); - - __asm__ __volatile__ ( - /* - temp_1 = (input[0] + input[4]) * cospi_16_64; - step2_0 = dct_const_round_shift(temp_1); - - temp_2 = (input[0] - input[4]) * cospi_16_64; - step2_1 = dct_const_round_shift(temp_2); - */ - "lh %[Temp0], 0(%[input]) \n\t" - "lh %[Temp1], 8(%[input]) \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "add %[Temp2], %[Temp0], %[Temp1] \n\t" - "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" - "extp %[step1_6], $ac0, 31 \n\t" - - "sub %[Temp3], %[Temp0], %[Temp1] \n\t" - "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - /* - temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; - step2_2 = dct_const_round_shift(temp_1); - */ - "lh %[Temp0], 4(%[input]) \n\t" - "lh %[Temp1], 12(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" - "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "extp %[Temp3], $ac0, 31 \n\t" - - /* - step1_1 = step2_1 + step2_2; - step1_2 = step2_1 - step2_2; - */ - "add %[step1_1], %[Temp2], %[Temp3] \n\t" - "sub %[step1_2], %[Temp2], %[Temp3] \n\t" - - /* - temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; - step2_3 = dct_const_round_shift(temp_2); - */ - "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" - "extp %[Temp1], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - - /* - step1_0 = step2_0 + step2_3; - step1_3 = step2_0 - step2_3; - */ - "add %[step1_0], %[step1_6], %[Temp1] \n\t" - "sub %[step1_3], %[step1_6], %[Temp1] \n\t" - - /* - temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - step1_4 = dct_const_round_shift(temp_1); - */ - "lh %[Temp0], 2(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "lh %[Temp1], 14(%[input]) \n\t" - "lh %[Temp0], 2(%[input]) \n\t" - "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" - "extp %[step1_4], $ac0, 31 \n\t" - - /* - temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1_7 = dct_const_round_shift(temp_2); - */ - "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" - "extp %[step1_7], $ac1, 31 \n\t" - - /* - temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - step1_5 = dct_const_round_shift(temp_1); - */ - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "lh %[Temp0], 10(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" - "extp %[step1_5], $ac0, 31 \n\t" - - /* - temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1_6 = dct_const_round_shift(temp_2); - */ - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "lh %[Temp0], 10(%[input]) \n\t" - "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" - "extp %[step1_6], $ac1, 31 \n\t" - - /* - temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; - temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; - */ - "sub %[Temp0], %[step1_7], %[step1_6] \n\t" - "sub %[Temp0], %[Temp0], %[step1_4] \n\t" - "add %[Temp0], %[Temp0], %[step1_5] \n\t" - "sub %[Temp1], %[step1_4], %[step1_5] \n\t" - "sub %[Temp1], %[Temp1], %[step1_6] \n\t" - "add %[Temp1], %[Temp1], %[step1_7] \n\t" - - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" - - /* - step1_4 = step1_4 + step1_5; - step1_7 = step1_6 + step1_7; - */ - "add %[step1_4], %[step1_4], %[step1_5] \n\t" - "add %[step1_7], %[step1_7], %[step1_6] \n\t" - - "extp %[step1_5], $ac0, 31 \n\t" - "extp %[step1_6], $ac1, 31 \n\t" - - /* add block */ - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "add %[Temp0], %[step1_0], %[step1_7] \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "add %[Temp0], %[step1_1], %[step1_6] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "add %[Temp0], %[step1_2], %[step1_5] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "add %[Temp0], %[step1_3], %[step1_4] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step1_3], %[step1_4] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step1_2], %[step1_5] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step1_1], %[step1_6] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step1_0], %[step1_7] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - - : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), - [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), - [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), - [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), - [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), - [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [dest_pix] "+r" (dest_pix) - : [const_2_power_13] "r" (const_2_power_13), - [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), - [cospi_24_64] "r" (cospi_24_64), - [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) - ); - - input += 8; - } -} - -void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[8 * 8]); - int16_t *outptr = out; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); - - // First transform rows - idct8_rows_dspr2(input, outptr, 8); - - // Then transform columns and add to dest - idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); -} - -static void iadst8_dspr2(const int16_t *input, int16_t *output) { - int s0, s1, s2, s3, s4, s5, s6, s7; - int x0, x1, x2, x3, x4, x5, x6, x7; - - x0 = input[7]; - x1 = input[0]; - x2 = input[5]; - x3 = input[2]; - x4 = input[3]; - x5 = input[4]; - x6 = input[1]; - x7 = input[6]; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - output[0] = output[1] = output[2] = output[3] = output[4] - = output[5] = output[6] = output[7] = 0; - return; - } - - // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; - s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - - x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); - x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); - x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS); - x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS); - x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS); - x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS); - x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS); - x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; - - x0 = s0 + s2; - x1 = s1 + s3; - x2 = s0 - s2; - x3 = s1 - s3; - x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS); - x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS); - x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS); - x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS); - - // stage 3 - s2 = cospi_16_64 * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (x6 - x7); - - x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS); - x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS); - x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); - x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); - - output[0] = x0; - output[1] = -x4; - output[2] = x6; - output[3] = -x2; - output[4] = x3; - output[5] = -x7; - output[6] = x5; - output[7] = -x1; -} - void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type) { int i, j; @@ -617,130 +90,4 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, break; } } - -void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[8 * 8]); - int16_t *outptr = out; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); - - // First transform rows - idct8_rows_dspr2(input, outptr, 4); - - outptr += 4; - - __asm__ __volatile__ ( - "sw $zero, 0(%[outptr]) \n\t" - "sw $zero, 4(%[outptr]) \n\t" - "sw $zero, 16(%[outptr]) \n\t" - "sw $zero, 20(%[outptr]) \n\t" - "sw $zero, 32(%[outptr]) \n\t" - "sw $zero, 36(%[outptr]) \n\t" - "sw $zero, 48(%[outptr]) \n\t" - "sw $zero, 52(%[outptr]) \n\t" - "sw $zero, 64(%[outptr]) \n\t" - "sw $zero, 68(%[outptr]) \n\t" - "sw $zero, 80(%[outptr]) \n\t" - "sw $zero, 84(%[outptr]) \n\t" - "sw $zero, 96(%[outptr]) \n\t" - "sw $zero, 100(%[outptr]) \n\t" - "sw $zero, 112(%[outptr]) \n\t" - "sw $zero, 116(%[outptr]) \n\t" - - : - : [outptr] "r" (outptr) - ); - - - // Then transform columns and add to dest - idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); -} - -void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - uint32_t pos = 45; - int32_t out; - int32_t r; - int32_t a1, absa1; - int32_t t1, t2, vector_a1, vector_1, vector_2; - - /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - - : - : [pos] "r" (pos) - ); - - out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); - __asm__ __volatile__ ( - "addi %[out], %[out], 16 \n\t" - "sra %[a1], %[out], 5 \n\t" - - : [out] "+r" (out), [a1] "=r" (a1) - : - ); - - if (a1 < 0) { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__ ( - "abs %[absa1], %[a1] \n\t" - "replv.qb %[vector_a1], %[absa1] \n\t" - - : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); - - for (r = 8; r--;) { - __asm__ __volatile__ ( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t1] "=&r" (t1), [t2] "=&r" (t2), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [dest] "+&r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); - } - } else { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__ ( - "replv.qb %[vector_a1], %[a1] \n\t" - - : [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); - - for (r = 8; r--;) { - __asm__ __volatile__ ( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t1] "=&r" (t1), [t2] "=&r" (t2), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [dest] "+r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); - } - } -} #endif // #if HAVE_DSPR2 |