diff options
Diffstat (limited to 'vp8/common/arm/neon/iwalsh_neon.asm')
-rw-r--r-- | vp8/common/arm/neon/iwalsh_neon.asm | 95 |
1 files changed, 95 insertions, 0 deletions
diff --git a/vp8/common/arm/neon/iwalsh_neon.asm b/vp8/common/arm/neon/iwalsh_neon.asm new file mode 100644 index 000000000..4fc744c96 --- /dev/null +++ b/vp8/common/arm/neon/iwalsh_neon.asm @@ -0,0 +1,95 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + EXPORT |vp8_short_inv_walsh4x4_neon| + EXPORT |vp8_short_inv_walsh4x4_1_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY ; name this block of code + +;short vp8_short_inv_walsh4x4_neon(short *input, short *output) +|vp8_short_inv_walsh4x4_neon| PROC + + ; read in all four lines of values: d0->d3 + vldm.64 r0, {q0, q1} + + ; first for loop + + vadd.s16 d4, d0, d3 ;a = [0] + [12] + vadd.s16 d5, d1, d2 ;b = [4] + [8] + vsub.s16 d6, d1, d2 ;c = [4] - [8] + vsub.s16 d7, d0, d3 ;d = [0] - [12] + + vadd.s16 d0, d4, d5 ;a + b + vadd.s16 d1, d6, d7 ;c + d + vsub.s16 d2, d4, d5 ;a - b + vsub.s16 d3, d7, d6 ;d - c + + vtrn.32 d0, d2 ;d0: 0 1 8 9 + ;d2: 2 3 10 11 + vtrn.32 d1, d3 ;d1: 4 5 12 13 + ;d3: 6 7 14 15 + + vtrn.16 d0, d1 ;d0: 0 4 8 12 + ;d1: 1 5 9 13 + vtrn.16 d2, d3 ;d2: 2 6 10 14 + ;d3: 3 7 11 15 + + ; second for loop + + vadd.s16 d4, d0, d3 ;a = [0] + [3] + vadd.s16 d5, d1, d2 ;b = [1] + [2] + vsub.s16 d6, d1, d2 ;c = [1] - [2] + vsub.s16 d7, d0, d3 ;d = [0] - [3] + + vadd.s16 d0, d4, d5 ;e = a + b + vadd.s16 d1, d6, d7 ;f = c + d + vsub.s16 d2, d4, d5 ;g = a - b + vsub.s16 d3, d7, d6 ;h = d - c + + vmov.i16 q2, #3 + vadd.i16 q0, q0, q2 ;e/f += 3 + vadd.i16 q1, q1, q2 ;g/h += 3 + + vshr.s16 q0, q0, #3 ;e/f >> 3 + vshr.s16 q1, q1, #3 ;g/h >> 3 + + vtrn.32 d0, d2 + vtrn.32 d1, d3 + vtrn.16 d0, d1 + vtrn.16 d2, d3 + + vstmia.16 r1!, {q0} + vstmia.16 r1!, {q1} + + bx lr + ENDP ; |vp8_short_inv_walsh4x4_neon| + + +;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output) +|vp8_short_inv_walsh4x4_1_neon| PROC + ; load a full line into a neon register + vld1.16 {q0}, [r0] + ; extract first element and replicate + vdup.16 q1, d0[0] + ; add 3 to all values + vmov.i16 q2, #3 + vadd.i16 q3, q1, q2 + ; right shift + vshr.s16 q3, q3, #3 + ; write it back + vstmia.16 r1!, {q3} + vstmia.16 r1!, {q3} + + bx lr + ENDP ; |vp8_short_inv_walsh4x4_1_neon| + + END |