author     Johann <johannkoenig@google.com>                       2011-09-20 09:51:05 -0700
committer  Gerrit Code Review <gerrit@gerrit.golo.chromium.org>  2011-09-20 09:51:05 -0700
commit     6829e62718a2970f2c62b4ace717acb46776ef73 (patch)
tree       35f23c851b87d547006bc7a6eb129835b330c6ce /vp8
parent     86e07525d513ed3e8232dc2637b824f21c4dd0a2 (diff)
parent     0c2529a8129c9f978d7d87992e033ba3c83b073c (diff)
Merge "NEON FDCT updated to match current C code"
Diffstat (limited to 'vp8')
-rw-r--r--  vp8/encoder/arm/arm_csystemdependent.c     |    4
-rw-r--r--  vp8/encoder/arm/dct_arm.h                  |    4
-rw-r--r--  vp8/encoder/arm/neon/fastfdct4x4_neon.asm  |  124
-rw-r--r--  vp8/encoder/arm/neon/fastfdct8x4_neon.asm  |  177
-rw-r--r--  vp8/encoder/arm/neon/shortfdct_neon.asm    |  293
-rw-r--r--  vp8/vp8cx_arm.mk                           |    2
6 files changed, 189 insertions(+), 415 deletions(-)
diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c
index a6572b3be..081775bfd 100644
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -107,8 +107,8 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.fdct.short4x4             = vp8_short_fdct4x4_neon;
         cpi->rtcd.fdct.short8x4             = vp8_short_fdct8x4_neon;
-        cpi->rtcd.fdct.fast4x4              = vp8_fast_fdct4x4_neon;
-        cpi->rtcd.fdct.fast8x4              = vp8_fast_fdct8x4_neon;
+        cpi->rtcd.fdct.fast4x4              = vp8_short_fdct4x4_neon;
+        cpi->rtcd.fdct.fast8x4              = vp8_short_fdct8x4_neon;
         cpi->rtcd.fdct.walsh_short4x4       = vp8_short_walsh4x4_neon;

         /*cpi->rtcd.encodemb.berr             = vp8_block_error_c;
diff --git a/vp8/encoder/arm/dct_arm.h b/vp8/encoder/arm/dct_arm.h
index f94022b89..db553c4e0 100644
--- a/vp8/encoder/arm/dct_arm.h
+++ b/vp8/encoder/arm/dct_arm.h
@@ -51,10 +51,10 @@ extern prototype_fdct(vp8_short_walsh4x4_neon);
 #define vp8_fdct_short8x4 vp8_short_fdct8x4_neon

 #undef vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_neon
+#define vp8_fdct_fast4x4 vp8_short_fdct4x4_neon

 #undef vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_neon
+#define vp8_fdct_fast8x4 vp8_short_fdct8x4_neon

 #undef vp8_fdct_walsh_short4x4
 #define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_neon
diff --git a/vp8/encoder/arm/neon/fastfdct4x4_neon.asm b/vp8/encoder/arm/neon/fastfdct4x4_neon.asm
deleted file mode 100644
index 1cc0bd781..000000000
--- a/vp8/encoder/arm/neon/fastfdct4x4_neon.asm
+++ /dev/null
@@ -1,124 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS. All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_fast_fdct4x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
-;NOTE:
-;The input *src_diff. src_diff is calculated as:
-;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in Subtract* function)
-;In which *src_ptr and *pred_ptr both are unsigned char.
-;Therefore, *src_diff should be in the range of [-255, 255].
-;CAUTION:
-;The input values of 25th block are set in vp8_build_dcblock function, which are out of [-255, 255].
-;But, VP8 encoder only uses vp8_short_fdct4x4_c for 25th block, not vp8_fast_fdct4x4_c. That makes
-;it ok for assuming *input in [-255, 255] in vp8_fast_fdct4x4_c, but not ok in vp8_short_fdct4x4_c.
-
-|vp8_fast_fdct4x4_neon| PROC
-    vld1.16         {d2}, [r0], r2          ;load input
-    ldr             r12, _ffdct_coeff_
-    vld1.16         {d3}, [r0], r2
-    vld1.16         {d4}, [r0], r2
-    vld1.16         {d0}, [r12]
-    vld1.16         {d5}, [r0], r2
-
-    ;First for-loop
-    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-    vadd.s16        d6, d2, d5              ;ip[0]+ip[3]
-    vadd.s16        d7, d3, d4              ;ip[1]+ip[2]
-    vsub.s16        d8, d3, d4              ;ip[1]-ip[2]
-    vsub.s16        d9, d2, d5              ;ip[0]-ip[3]
-    vshl.i16        q3, q3, #1              ; a1, b1
-    vshl.i16        q4, q4, #1              ; c1, d1
-
-    vadd.s16        d10, d6, d7             ;temp1 = a1 + b1
-    vsub.s16        d11, d6, d7             ;temp2 = a1 - b1
-
-    vqdmulh.s16     q6, q5, d0[1]
-    vqdmulh.s16     q8, q4, d0[0]
-    vqdmulh.s16     q7, q4, d0[2]
-
-    vshr.s16        q6, q6, #1
-    vshr.s16        q8, q8, #1
-    vshr.s16        q7, q7, #1              ;d14:temp1 = ( c1 * x_c3)>>16;  d15:temp1 = (d1 * x_c3)>>16
-    vadd.s16        q8, q4, q8              ;d16:temp2 = ((c1 * x_c1)>>16) + c1;  d17:temp2 = ((d1 * x_c1)>>16) + d1
-
-    vadd.s16        d2, d10, d12            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
-    vadd.s16        d4, d11, d13            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
-    vadd.s16        d3, d14, d17            ;op[1] = temp1 + temp2 -- q is not necessary, just for protection
-    vsub.s16        d5, d15, d16            ;op[3] = temp1 - temp2
-
-    ;Second for-loop
-    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-    vadd.s16        d6, d2, d5              ;a1 = ip[0]+ip[12]
-    vadd.s16        d7, d3, d4              ;b1 = ip[4]+ip[8]
-    vsub.s16        d8, d3, d4              ;c1 = ip[4]-ip[8]
-    vsub.s16        d9, d2, d5              ;d1 = ip[0]-ip[12]
-
-    vadd.s16        d10, d6, d7             ;temp1 = a1 + b1
-    vsub.s16        d11, d6, d7             ;temp2 = a1 - b1
-
-
-    vqdmulh.s16     q6, q5, d0[1]
-    vqdmulh.s16     q8, q4, d0[0]
-    vqdmulh.s16     q7, q4, d0[2]
-
-    vshr.s16        q6, q6, #1
-    vshr.s16        q8, q8, #1
-    vshr.s16        q7, q7, #1              ;d14:temp1 = ( c1 * x_c3)>>16;  d15:temp1 = (d1 * x_c3)>>16
-    vadd.s16        q8, q4, q8              ;d16:temp2 = ((c1 * x_c1)>>16) + c1;  d17:temp2 = ((d1 * x_c1)>>16) + d1
-
-    vadd.s16        d2, d10, d12            ;a2 = ((temp1 * x_c2 )>>16) + temp1
-    vadd.s16        d4, d11, d13            ;c2 = ((temp2 * x_c2 )>>16) + temp2
-    vadd.s16        d3, d14, d17            ;b2 = temp1 + temp2 -- q is not necessary, just for protection
-    vsub.s16        d5, d15, d16            ;d2 = temp1 - temp2
-
-    vclt.s16        q3, q1, #0
-    vclt.s16        q4, q2, #0
-
-    vsub.s16        q1, q1, q3
-    vsub.s16        q2, q2, q4
-
-    vshr.s16        q1, q1, #1
-    vshr.s16        q2, q2, #1
-
-    vst1.16         {q1, q2}, [r1]
-
-    bx              lr
-
-    ENDP
-
-;-----------------
-
-_ffdct_coeff_
-    DCD     ffdct_coeff
-ffdct_coeff
-; 60547 = 0xEC83
-; 46341 = 0xB505
-; 25080 = 0x61F8
-    DCD     0xB505EC83, 0x000061F8
-
-    END
diff --git a/vp8/encoder/arm/neon/fastfdct8x4_neon.asm b/vp8/encoder/arm/neon/fastfdct8x4_neon.asm
deleted file mode 100644
index f6e8bbb83..000000000
--- a/vp8/encoder/arm/neon/fastfdct8x4_neon.asm
+++ /dev/null
@@ -1,177 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS. All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_fast_fdct8x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
-;NOTE:
-;The input *src_diff. src_diff is calculated as:
-;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in Subtract* function)
-;In which *src_ptr and *pred_ptr both are unsigned char.
-;Therefore, *src_diff should be in the range of [-255, 255].
-;CAUTION:
-;The input values of 25th block are set in vp8_build_dcblock function, which are out of [-255, 255].
-;But, VP8 encoder only uses vp8_short_fdct4x4_c for 25th block, not vp8_fast_fdct4x4_c. That makes
-;it ok for assuming *input in [-255, 255] in vp8_fast_fdct4x4_c, but not ok in vp8_short_fdct4x4_c.
-
-|vp8_fast_fdct8x4_neon| PROC
-    vld1.16         {q1}, [r0], r2          ;load input
-    ldr             r12, _ffdct8_coeff_
-    vld1.16         {q2}, [r0], r2
-    vld1.16         {q3}, [r0], r2
-    vld1.16         {d0}, [r12]
-    vld1.16         {q4}, [r0], r2
-
-    ;First for-loop
-    ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[1], d6=ip[2], d8=ip[3]
-    ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[1], d7=ip[2], d9=ip[3]
-    vtrn.32         d2, d6
-    vtrn.32         d3, d7
-    vtrn.32         d4, d8
-    vtrn.32         d5, d9
-    vtrn.16         d2, d4
-    vtrn.16         d3, d5
-    vtrn.16         d6, d8
-    vtrn.16         d7, d9
-
-    vadd.s16        d10, d2, d8             ;ip[0]+ip[3]
-    vadd.s16        d11, d4, d6             ;ip[1]+ip[2]
-    vsub.s16        d12, d4, d6             ;ip[1]-ip[2]
-    vsub.s16        d13, d2, d8             ;ip[0]-ip[3]
-    vadd.s16        d22, d3, d9
-    vadd.s16        d23, d5, d7
-    vsub.s16        d24, d5, d7
-    vsub.s16        d25, d3, d9
-
-    vshl.i16        q5, q5, #1              ; a1, b1
-    vshl.i16        q6, q6, #1              ; c1, d1
-    vshl.i16        q1, q11, #1
-    vshl.i16        q2, q12, #1
-
-    vadd.s16        d14, d10, d11           ;temp1 = a1 + b1
-    vsub.s16        d15, d10, d11           ;temp2 = a1 - b1
-    vadd.s16        d24, d2, d3
-    vsub.s16        d25, d2, d3
-
-    vqdmulh.s16     q8, q7, d0[1]
-    vqdmulh.s16     q13, q12, d0[1]
-    vqdmulh.s16     q10, q6, d0[0]
-    vqdmulh.s16     q15, q2, d0[0]
-    vqdmulh.s16     q9, q6, d0[2]
-    vqdmulh.s16     q14, q2, d0[2]
-
-    vshr.s16        q8, q8, #1
-    vshr.s16        q13, q13, #1
-    vshr.s16        q10, q10, #1
-    vshr.s16        q15, q15, #1
-    vshr.s16        q9, q9, #1              ;d18:temp1 = ( c1 * x_c3)>>16;  d19:temp1 = (d1 * x_c3)>>16
-    vshr.s16        q14, q14, #1            ;d28:temp1 = ( c1 * x_c3)>>16;  d29:temp1 = (d1 * x_c3)>>16
-    vadd.s16        q10, q6, q10            ;d20:temp2 = ((c1 * x_c1)>>16) + c1;  d21:temp2 = ((d1 * x_c1)>>16) + d1
-    vadd.s16        q15, q2, q15            ;d30:temp2 = ((c1 * x_c1)>>16) + c1;  d31:temp2 = ((d1 * x_c1)>>16) + d1
-
-    vadd.s16        d2, d14, d16            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
-    vadd.s16        d3, d24, d26            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
-    vadd.s16        d6, d15, d17            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
-    vadd.s16        d7, d25, d27            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
-    vadd.s16        d4, d18, d21            ;op[1] = temp1 + temp2 -- q is not necessary, just for protection
-    vadd.s16        d5, d28, d31            ;op[1] = temp1 + temp2 -- q is not necessary, just for protection
-    vsub.s16        d8, d19, d20            ;op[3] = temp1 - temp2
-    vsub.s16        d9, d29, d30            ;op[3] = temp1 - temp2
-
-    ;Second for-loop
-    ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[4], d6=ip[8], d8=ip[12]
-    ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[4], d7=ip[8], d9=ip[12]
-    vtrn.32         d2, d6
-    vtrn.32         d3, d7
-    vtrn.32         d4, d8
-    vtrn.32         d5, d9
-    vtrn.16         d2, d4
-    vtrn.16         d3, d5
-    vtrn.16         d6, d8
-    vtrn.16         d7, d9
-
-    vadd.s16        d10, d2, d8             ;a1 = ip[0]+ip[12]
-    vadd.s16        d11, d4, d6             ;b1 = ip[4]+ip[8]
-    vsub.s16        d12, d4, d6             ;c1 = ip[4]-ip[8]
-    vsub.s16        d13, d2, d8             ;d1 = ip[0]-ip[12]
-    vadd.s16        d2, d3, d9
-    vadd.s16        d4, d5, d7
-    vsub.s16        d24, d5, d7
-    vsub.s16        d25, d3, d9
-
-    vadd.s16        d14, d10, d11           ;temp1 = a1 + b1
-    vsub.s16        d15, d10, d11           ;temp2 = a1 - b1
-    vadd.s16        d22, d2, d4
-    vsub.s16        d23, d2, d4
-
-    vqdmulh.s16     q8, q7, d0[1]
-    vqdmulh.s16     q13, q11, d0[1]
-    vqdmulh.s16     q10, q6, d0[0]
-    vqdmulh.s16     q15, q12, d0[0]
-    vqdmulh.s16     q9, q6, d0[2]
-    vqdmulh.s16     q14, q12, d0[2]
-
-    vshr.s16        q8, q8, #1
-    vshr.s16        q13, q13, #1
-    vshr.s16        q10, q10, #1
-    vshr.s16        q15, q15, #1
-    vshr.s16        q9, q9, #1              ;d18:temp1 = ( c1 * x_c3)>>16;  d19:temp1 = (d1 * x_c3)>>16
-    vshr.s16        q14, q14, #1            ;d28:temp1 = ( c1 * x_c3)>>16;  d29:temp1 = (d1 * x_c3)>>16
-    vadd.s16        q10, q6, q10            ;d20:temp2 = ((c1 * x_c1)>>16) + c1;  d21:temp2 = ((d1 * x_c1)>>16) + d1
-    vadd.s16        q15, q12, q15           ;d30:temp2 = ((c1 * x_c1)>>16) + c1;  d31:temp2 = ((d1 * x_c1)>>16) + d1
-
-    vadd.s16        d2, d14, d16            ;a2 = ((temp1 * x_c2 )>>16) + temp1
-    vadd.s16        d6, d22, d26            ;a2 = ((temp1 * x_c2 )>>16) + temp1
-    vadd.s16        d4, d15, d17            ;c2 = ((temp2 * x_c2 )>>16) + temp2
-    vadd.s16        d8, d23, d27            ;c2 = ((temp2 * x_c2 )>>16) + temp2
-    vadd.s16        d3, d18, d21            ;b2 = temp1 + temp2 -- q is not necessary, just for protection
-    vadd.s16        d7, d28, d31            ;b2 = temp1 + temp2 -- q is not necessary, just for protection
-    vsub.s16        d5, d19, d20            ;d2 = temp1 - temp2
-    vsub.s16        d9, d29, d30            ;d2 = temp1 - temp2
-
-    vclt.s16        q5, q1, #0
-    vclt.s16        q6, q2, #0
-    vclt.s16        q7, q3, #0
-    vclt.s16        q8, q4, #0
-
-    vsub.s16        q1, q1, q5
-    vsub.s16        q2, q2, q6
-    vsub.s16        q3, q3, q7
-    vsub.s16        q4, q4, q8
-
-    vshr.s16        q1, q1, #1
-    vshr.s16        q2, q2, #1
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vst1.16         {q1, q2}, [r1]!
-    vst1.16         {q3, q4}, [r1]
-
-    bx              lr
-
-    ENDP
-
-;-----------------
-
-_ffdct8_coeff_
-    DCD     ffdct8_coeff
-ffdct8_coeff
-; 60547 = 0xEC83
-; 46341 = 0xB505
-; 25080 = 0x61F8
-    DCD     0xB505EC83, 0x000061F8
-
-    END
diff --git a/vp8/encoder/arm/neon/shortfdct_neon.asm b/vp8/encoder/arm/neon/shortfdct_neon.asm
index 1b7f36277..09dd011ec 100644
--- a/vp8/encoder/arm/neon/shortfdct_neon.asm
+++ b/vp8/encoder/arm/neon/shortfdct_neon.asm
@@ -11,134 +11,211 @@
     EXPORT  |vp8_short_fdct4x4_neon|
     EXPORT  |vp8_short_fdct8x4_neon|
+
     ARM
     REQUIRE8
     PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=4
+
-    AREA ||.text||, CODE, READONLY, ALIGN=2
+    ALIGN 16                ; enable use of @128 bit aligned loads
+coeff
+    DCW 5352,  5352,  5352,  5352
+    DCW 2217,  2217,  2217,  2217
+    DCD 14500, 14500, 14500, 14500
+    DCD 7500,  7500,  7500,  7500
+    DCD 12000, 12000, 12000, 12000
+    DCD 51000, 51000, 51000, 51000

-; r0    short *input
-; r1    short *output
-; r2    int pitch
-; Input has a pitch, output is contiguous
+;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
 |vp8_short_fdct4x4_neon| PROC
-    ldr             r12, _dct_matrix_
-    vld1.16         d0, [r0], r2
-    vld1.16         d1, [r0], r2
-    vld1.16         d2, [r0], r2
-    vld1.16         d3, [r0]
-    vld1.16         {q2, q3}, [r12]
-
-;first stage
-    vmull.s16       q11, d4, d0[0]          ;i=0
-    vmull.s16       q12, d4, d1[0]          ;i=1
-    vmull.s16       q13, d4, d2[0]          ;i=2
-    vmull.s16       q14, d4, d3[0]          ;i=3
-
-    vmlal.s16       q11, d5, d0[1]
-    vmlal.s16       q12, d5, d1[1]
-    vmlal.s16       q13, d5, d2[1]
-    vmlal.s16       q14, d5, d3[1]
-
-    vmlal.s16       q11, d6, d0[2]
-    vmlal.s16       q12, d6, d1[2]
-    vmlal.s16       q13, d6, d2[2]
-    vmlal.s16       q14, d6, d3[2]
-
-    vmlal.s16       q11, d7, d0[3]          ;sumtemp for i=0
-    vmlal.s16       q12, d7, d1[3]          ;sumtemp for i=1
-    vmlal.s16       q13, d7, d2[3]          ;sumtemp for i=2
-    vmlal.s16       q14, d7, d3[3]          ;sumtemp for i=3
-
-    ; rounding
-    vrshrn.i32      d22, q11, #14
-    vrshrn.i32      d24, q12, #14
-    vrshrn.i32      d26, q13, #14
-    vrshrn.i32      d28, q14, #14
-
-;second stage
-    vmull.s16       q4, d22, d4[0]          ;i=0
-    vmull.s16       q5, d22, d4[1]          ;i=1
-    vmull.s16       q6, d22, d4[2]          ;i=2
-    vmull.s16       q7, d22, d4[3]          ;i=3
-
-    vmlal.s16       q4, d24, d5[0]
-    vmlal.s16       q5, d24, d5[1]
-    vmlal.s16       q6, d24, d5[2]
-    vmlal.s16       q7, d24, d5[3]
-
-    vmlal.s16       q4, d26, d6[0]
-    vmlal.s16       q5, d26, d6[1]
-    vmlal.s16       q6, d26, d6[2]
-    vmlal.s16       q7, d26, d6[3]
-
-    vmlal.s16       q4, d28, d7[0]          ;sumtemp for i=0
-    vmlal.s16       q5, d28, d7[1]          ;sumtemp for i=1
-    vmlal.s16       q6, d28, d7[2]          ;sumtemp for i=2
-    vmlal.s16       q7, d28, d7[3]          ;sumtemp for i=3
-
-    vrshr.s32       q0, q4, #16
-    vrshr.s32       q1, q5, #16
-    vrshr.s32       q2, q6, #16
-    vrshr.s32       q3, q7, #16
-
-    vmovn.i32       d0, q0
-    vmovn.i32       d1, q1
-    vmovn.i32       d2, q2
-    vmovn.i32       d3, q3
-
-    vst1.16         {q0, q1}, [r1]
+
+    ; Part one
+    vld1.16         {d0}, [r0@64], r2
+    adr             r12, coeff
+    vld1.16         {d1}, [r0@64], r2
+    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
+    vld1.16         {d2}, [r0@64], r2
+    vld1.32         {q9, q10}, [r12@128]!   ; q9=14500, q10=7500
+    vld1.16         {d3}, [r0@64], r2
+
+    ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
+    vtrn.32         d0, d2
+    vtrn.32         d1, d3
+    vld1.32         {q11,q12}, [r12@128]    ; q11=12000, q12=51000
+    vtrn.16         d0, d1
+    vtrn.16         d2, d3
+
+    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[3]
+    vadd.s16        d5, d1, d2      ; b1 = ip[1] + ip[2]
+    vsub.s16        d6, d1, d2      ; c1 = ip[1] - ip[2]
+    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[3]
+
+    vshl.s16        q2, q2, #3      ; (a1, b1) << 3
+    vshl.s16        q3, q3, #3      ; (c1, d1) << 3
+
+    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1
+    vsub.s16        d2, d4, d5      ; op[2] = a1 - b1
+
+    vmlal.s16       q9, d7, d16     ; d1*5352 + 14500
+    vmlal.s16       q10, d7, d17    ; d1*2217 + 7500
+    vmlal.s16       q9, d6, d17     ; c1*2217 + d1*5352 + 14500
+    vmlsl.s16       q10, d6, d16    ; d1*2217 - c1*5352 + 7500
+
+    vshrn.s32       d1, q9, #12     ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
+    vshrn.s32       d3, q10, #12    ; op[3] = (d1*2217 - c1*5352 +  7500)>>12
+
+
+    ; Part two
+
+    ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+    vtrn.32         d0, d2
+    vtrn.32         d1, d3
+    vtrn.16         d0, d1
+    vtrn.16         d2, d3
+
+    vmov.s16        d26, #7
+
+    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[12]
+    vadd.s16        d5, d1, d2      ; b1 = ip[4] + ip[8]
+    vsub.s16        d6, d1, d2      ; c1 = ip[4] - ip[8]
+    vadd.s16        d4, d4, d26     ; a1 + 7
+    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[12]
+
+    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1 + 7
+    vsub.s16        d2, d4, d5      ; op[8] = a1 - b1 + 7
+
+    vmlal.s16       q11, d7, d16    ; d1*5352 + 12000
+    vmlal.s16       q12, d7, d17    ; d1*2217 + 51000
+
+    vceq.s16        d4, d7, #0
+
+    vshr.s16        d0, d0, #4
+    vshr.s16        d2, d2, #4
+
+    vmlal.s16       q11, d6, d17    ; c1*2217 + d1*5352 + 12000
+    vmlsl.s16       q12, d6, d16    ; d1*2217 - c1*5352 + 51000
+
+    vmvn.s16        d4, d4
+    vshrn.s32       d1, q11, #16    ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
+    vsub.s16        d1, d1, d4      ; op[4] += (d1!=0)
+    vshrn.s32       d3, q12, #16    ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
+
+    vst1.16         {q0, q1}, [r1@128]

     bx              lr

     ENDP

-; r0    short *input
-; r1    short *output
-; r2    int pitch
+;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
 |vp8_short_fdct8x4_neon| PROC
-    ; Store link register and input before calling
-    ; first 4x4 fdct. Do not need to worry about
-    ; output or pitch because those pointers are not
-    ; touched in the 4x4 fdct function
-    stmdb           sp!, {r0, lr}
-    bl              vp8_short_fdct4x4_neon
+    ; Part one
+
+    vld1.16         {q0}, [r0@128], r2
+    adr             r12, coeff
+    vld1.16         {q1}, [r0@128], r2
+    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
+    vld1.16         {q2}, [r0@128], r2
+    vld1.32         {q9, q10}, [r12@128]!   ; q9=14500, q10=7500
+    vld1.16         {q3}, [r0@128], r2
+
+    ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
+    vtrn.32         q0, q2          ; [A0|B0]
+    vtrn.32         q1, q3          ; [A1|B1]
+    vtrn.16         q0, q1          ; [A2|B2]
+    vtrn.16         q2, q3          ; [A3|B3]
-    ldmia           sp!, {r0, lr}
+    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[3]
+    vadd.s16        q12, q1, q2     ; b1 = ip[1] + ip[2]
+    vsub.s16        q13, q1, q2     ; c1 = ip[1] - ip[2]
+    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[3]
-    ; Move to the next block of data.
-    add             r0, r0, #8
-    add             r1, r1, #32
+    vshl.s16        q11, q11, #3    ; a1 << 3
+    vshl.s16        q12, q12, #3    ; b1 << 3
+    vshl.s16        q13, q13, #3    ; c1 << 3
+    vshl.s16        q14, q14, #3    ; d1 << 3
-    ; Second time through do not store off the
-    ; link register, just return from the 4x4 fdtc
-    b               vp8_short_fdct4x4_neon
+    vadd.s16        q0, q11, q12    ; [A0 | B0] = a1 + b1
+    vsub.s16        q2, q11, q12    ; [A2 | B2] = a1 - b1
+
+    vmov.s16        q11, q9         ; 14500
+    vmov.s16        q12, q10        ; 7500
+
+    vmlal.s16       q9, d28, d16    ; A[1] = d1*5352 + 14500
+    vmlal.s16       q10, d28, d17   ; A[3] = d1*2217 + 7500
+    vmlal.s16       q11, d29, d16   ; B[1] = d1*5352 + 14500
+    vmlal.s16       q12, d29, d17   ; B[3] = d1*2217 + 7500
+
+    vmlal.s16       q9, d26, d17    ; A[1] = c1*2217 + d1*5352 + 14500
+    vmlsl.s16       q10, d26, d16   ; A[3] = d1*2217 - c1*5352 + 7500
+    vmlal.s16       q11, d27, d17   ; B[1] = c1*2217 + d1*5352 + 14500
+    vmlsl.s16       q12, d27, d16   ; B[3] = d1*2217 - c1*5352 + 7500
+
+    vshrn.s32       d2, q9, #12     ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
+    vshrn.s32       d6, q10, #12    ; A[3] = (d1*2217 - c1*5352 + 7500)>>12
+    vshrn.s32       d3, q11, #12    ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
+    vshrn.s32       d7, q12, #12    ; B[3] = (d1*2217 - c1*5352 + 7500)>>12
+
+
+    ; Part two
+    vld1.32         {q9,q10}, [r12@128]     ; q9=12000, q10=51000
+
+    ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
+    vtrn.32         q0, q2          ; q0=[A0 | B0]
+    vtrn.32         q1, q3          ; q1=[A4 | B4]
+    vtrn.16         q0, q1          ; q2=[A8 | B8]
+    vtrn.16         q2, q3          ; q3=[A12|B12]
+
+    vmov.s16        q15, #7
+
+    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[12]
+    vadd.s16        q12, q1, q2     ; b1 = ip[4] + ip[8]
+    vadd.s16        q11, q11, q15   ; a1 + 7
+    vsub.s16        q13, q1, q2     ; c1 = ip[4] - ip[8]
+    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[12]
+
+    vadd.s16        q0, q11, q12    ; a1 + b1 + 7
+    vsub.s16        q1, q11, q12    ; a1 - b1 + 7
+
+    vmov.s16        q11, q9         ; 12000
+    vmov.s16        q12, q10        ; 51000
+
+    vshr.s16        d0, d0, #4      ; A[0] = (a1 + b1 + 7)>>4
+    vshr.s16        d4, d1, #4      ; B[0] = (a1 + b1 + 7)>>4
+    vshr.s16        d2, d2, #4      ; A[8] = (a1 + b1 + 7)>>4
+    vshr.s16        d6, d3, #4      ; B[8] = (a1 + b1 + 7)>>4
+
+
+    vmlal.s16       q9, d28, d16    ; A[4]  = d1*5352 + 12000
+    vmlal.s16       q10, d28, d17   ; A[12] = d1*2217 + 51000
+    vmlal.s16       q11, d29, d16   ; B[4]  = d1*5352 + 12000
+    vmlal.s16       q12, d29, d17   ; B[12] = d1*2217 + 51000
+
+    vceq.s16        q14, q14, #0
+
+    vmlal.s16       q9, d26, d17    ; A[4]  = c1*2217 + d1*5352 + 12000
+    vmlsl.s16       q10, d26, d16   ; A[12] = d1*2217 - c1*5352 + 51000
+    vmlal.s16       q11, d27, d17   ; B[4]  = c1*2217 + d1*5352 + 12000
+    vmlsl.s16       q12, d27, d16   ; B[12] = d1*2217 - c1*5352 + 51000
+
+    vmvn.s16        q14, q14
+
+    vshrn.s32       d1, q9, #16     ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
+    vshrn.s32       d3, q10, #16    ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
+    vsub.s16        d1, d1, d28     ; A[4] += (d1!=0)
+
+    vshrn.s32       d5, q11, #16    ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
+    vshrn.s32       d7, q12, #16    ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
+    vsub.s16        d5, d5, d29     ; B[4] += (d1!=0)
+
+    vst1.16         {q0, q1}, [r1@128]!     ; block A
+    vst1.16         {q2, q3}, [r1@128]!     ; block B

-    ; Should never get to this.
     bx              lr

     ENDP

-;-----------------
-
-_dct_matrix_
-    DCD     dct_matrix
-dct_matrix
-; DCW 23170, 30274, 23170, 12540
-; DCW 23170, 12540, -23170,-30274
-; DCW 23170, -12540, -23170, 30274
-; DCW 23170, -30274, 23170,-12540
-; 23170 = 0x5a82
-; -23170 = 0xa57e
-; 30274 = 0x7642
-; -30274 = 0x89be
-; 12540 = 0x30fc
-; -12540 = 0xcf04
-    DCD     0x76425a82, 0x30fc5a82
-    DCD     0x30fc5a82, 0x89bea57e
-    DCD     0xcf045a82, 0x7642a57e
-    DCD     0x89be5a82, 0xcf045a82
-
     END
+
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 4a860f494..99b2688ff 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -49,8 +49,6 @@ VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/walsh_v6$(ASM)

 #File list for neon
 # encoder
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/fastfdct4x4_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/fastfdct8x4_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/fastquantizeb_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/sad8_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/sad16_neon$(ASM)
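[Editor's note] One detail worth calling out in the new column pass: the C expression op[4] += (d1 != 0) is implemented branchlessly. vceq.s16 sets a lane to all ones where d1 == 0, vmvn inverts the mask so it is -1 exactly in the lanes where d1 != 0, and vsub subtracts that mask, i.e. adds 1 where the predicate holds. A minimal single-lane sketch of the same trick in C (the function name is illustrative, not from the tree):

    #include <stdint.h>

    /* Equivalent of the vceq.s16 / vmvn.s16 / vsub.s16 sequence above. */
    static int16_t add_nonzero_bias(int16_t op4, int16_t d1)
    {
        int16_t mask = (int16_t)((d1 == 0) ? -1 : 0);  /* vceq.s16: all ones where d1 == 0 */
        mask = (int16_t)~mask;                         /* vmvn.s16: -1 where d1 != 0 */
        return (int16_t)(op4 - mask);                  /* vsub.s16: op4 + (d1 != 0) */
    }

Separately, the diff shows why the two fastfdct files could be deleted outright: the rewritten short FDCT kernels replace the old approximation, and the rtcd fast4x4/fast8x4 hooks are simply aliased to vp8_short_fdct4x4_neon and vp8_short_fdct8x4_neon in arm_csystemdependent.c and dct_arm.h.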