diff options
author | Johann <johannkoenig@google.com> | 2010-07-23 13:42:30 -0400 |
---|---|---|
committer | Fritz Koenig <frkoenig@google.com> | 2010-07-26 08:55:19 -0400 |
commit | 56f5a9a060d4c89a71616a90207327e6c544f543 (patch) | |
tree | 173d002c3bde5a79e1b5e893b4a778712efd8f65 /vp8/common/arm | |
parent | 98fcccfe9751894ace9693a39ba0609fe5ea904d (diff) | |
download | libvpx-56f5a9a060d4c89a71616a90207327e6c544f543.tar libvpx-56f5a9a060d4c89a71616a90207327e6c544f543.tar.gz libvpx-56f5a9a060d4c89a71616a90207327e6c544f543.tar.bz2 libvpx-56f5a9a060d4c89a71616a90207327e6c544f543.zip |
update arm idct functions
Jeff Muizelaar posted some changes to the IDCT/reconstruction C code.
This is the equivalent update for the ARM assembly.
This shows a good boost on ARMv6 and a minor boost on NEON.
Here are some numbers for highway in QCIF (2641 frames):
HEAD neon: ~161 fps
new neon: ~162 fps
HEAD v6: ~102 fps
new v6: ~106 fps
The following functions have been updated for ARMv6 and NEON:
vp8_dc_only_idct_add
vp8_dequant_idct_add
vp8_dequant_dc_idct_add
Conflicts:
vp8/decoder/arm/armv6/dequantdcidct_v6.asm
vp8/decoder/arm/armv6/dequantidct_v6.asm
Resolved by removing these files. When I rewrote the functions, I also
moved them to dequant_dc_idct_v6.asm and dequant_idct_v6.asm, respectively.
Change-Id: Ie3300df824d52474eca1a5134cf22d8b7809a5d4
Diffstat (limited to 'vp8/common/arm')
-rw-r--r-- | vp8/common/arm/armv6/dc_only_idct_add_v6.asm | 67 | ||||
-rw-r--r-- | vp8/common/arm/armv6/idct_v6.asm | 32 | ||||
-rw-r--r-- | vp8/common/arm/armv6/iwalsh_v6.asm | 16 | ||||
-rw-r--r-- | vp8/common/arm/idct_arm.h | 16 | ||||
-rw-r--r-- | vp8/common/arm/neon/dc_only_idct_add_neon.asm | 49 |
5 files changed, 136 insertions, 44 deletions
diff --git a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm b/vp8/common/arm/armv6/dc_only_idct_add_v6.asm new file mode 100644 index 000000000..19227282e --- /dev/null +++ b/vp8/common/arm/armv6/dc_only_idct_add_v6.asm @@ -0,0 +1,67 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + EXPORT |vp8_dc_only_idct_add_v6| + + AREA |.text|, CODE, READONLY + +;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr, +; unsigned char *dst_ptr, int pitch, int stride) +; r0 input_dc +; r1 pred_ptr +; r2 dest_ptr +; r3 pitch +; sp stride + +|vp8_dc_only_idct_add_v6| PROC + stmdb sp!, {r4 - r7, lr} + + add r0, r0, #4 ; input_dc += 4 + ldr r12, c0x0000FFFF + ldr r4, [r1], r3 + ldr r6, [r1], r3 + and r0, r12, r0, asr #3 ; input_dc >> 3 + mask + ldr lr, [sp, #20] + orr r0, r0, r0, lsl #16 ; a1 | a1 + + uxtab16 r5, r0, r4 ; a1+2 | a1+0 + uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1 + uxtab16 r7, r0, r6 + uxtab16 r6, r0, r6, ror #8 + usat16 r5, #8, r5 + usat16 r4, #8, r4 + usat16 r7, #8, r7 + usat16 r6, #8, r6 + orr r5, r5, r4, lsl #8 + orr r7, r7, r6, lsl #8 + ldr r4, [r1], r3 + ldr r6, [r1] + str r5, [r2], lr + str r7, [r2], lr + + uxtab16 r5, r0, r4 + uxtab16 r4, r0, r4, ror #8 + uxtab16 r7, r0, r6 + uxtab16 r6, r0, r6, ror #8 + usat16 r5, #8, r5 + usat16 r4, #8, r4 + usat16 r7, #8, r7 + usat16 r6, #8, r6 + orr r5, r5, r4, lsl #8 + orr r7, r7, r6, lsl #8 + str r5, [r2], lr + str r7, [r2] + + ldmia sp!, {r4 - r7, pc} + + ENDP ; |vp8_dc_only_idct_add_v6| + +; Constant Pool +c0x0000FFFF DCD 0x0000FFFF + END diff --git a/vp8/common/arm/armv6/idct_v6.asm b/vp8/common/arm/armv6/idct_v6.asm index d9913c75e..d96908cc6 100644 --- a/vp8/common/arm/armv6/idct_v6.asm +++ 
b/vp8/common/arm/armv6/idct_v6.asm @@ -15,8 +15,6 @@ EXPORT |vp8_short_idct4x4llm_v6_scott| EXPORT |vp8_short_idct4x4llm_v6_dual| - EXPORT |vp8_dc_only_idct_armv6| - AREA |.text|, CODE, READONLY ;******************************************************************************** @@ -344,34 +342,4 @@ loop2_dual ldmia sp!, {r4 - r11, pc} ; replace vars, return restore ENDP - -; sjl added 10/17/08 -;void dc_only_idct_armv6(short input_dc, short *output, int pitch) -|vp8_dc_only_idct_armv6| PROC - stmdb sp!, {r4 - r6, lr} - - add r0, r0, #0x4 - add r4, r1, r2 ; output + shortpitch - mov r0, r0, ASR #0x3 ;aka a1 - add r5, r1, r2, LSL #1 ; output + shortpitch * 2 - pkhbt r0, r0, r0, lsl #16 ; a1 | a1 - add r6, r5, r2 ; output + shortpitch * 3 - - str r0, [r1, #0] - str r0, [r1, #4] - - str r0, [r4, #0] - str r0, [r4, #4] - - str r0, [r5, #0] - str r0, [r5, #4] - - str r0, [r6, #0] - str r0, [r6, #4] - - - ldmia sp!, {r4 - r6, pc} - - ENDP ; |vp8_dc_only_idct_armv6| - END diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm index f4002b2ce..cab6bc916 100644 --- a/vp8/common/arm/armv6/iwalsh_v6.asm +++ b/vp8/common/arm/armv6/iwalsh_v6.asm @@ -8,8 +8,8 @@ ; be found in the AUTHORS file in the root of the source tree. 
; - EXPORT |vp8_short_inv_walsh4x4_armv6| - EXPORT |vp8_short_inv_walsh4x4_1_armv6| + EXPORT |vp8_short_inv_walsh4x4_v6| + EXPORT |vp8_short_inv_walsh4x4_1_v6| ARM REQUIRE8 @@ -17,8 +17,8 @@ AREA |.text|, CODE, READONLY ; name this block of code -;short vp8_short_inv_walsh4x4_armv6(short *input, short *output) -|vp8_short_inv_walsh4x4_armv6| PROC +;short vp8_short_inv_walsh4x4_v6(short *input, short *output) +|vp8_short_inv_walsh4x4_v6| PROC stmdb sp!, {r4 - r11, lr} @@ -123,11 +123,11 @@ str r5, [r1] ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_short_inv_walsh4x4_armv6| + ENDP ; |vp8_short_inv_walsh4x4_v6| -;short vp8_short_inv_walsh4x4_1_armv6(short *input, short *output) -|vp8_short_inv_walsh4x4_1_armv6| PROC +;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output) +|vp8_short_inv_walsh4x4_1_v6| PROC ldrsh r2, [r0] ; [0] add r2, r2, #3 ; [0] + 3 @@ -145,7 +145,7 @@ str r2, [r1] bx lr - ENDP ; |vp8_short_inv_walsh4x4_1_armv6| + ENDP ; |vp8_short_inv_walsh4x4_1_v6| ; Constant Pool c0x00030003 DCD 0x00030003 diff --git a/vp8/common/arm/idct_arm.h b/vp8/common/arm/idct_arm.h index 97af32e69..6d917c445 100644 --- a/vp8/common/arm/idct_arm.h +++ b/vp8/common/arm/idct_arm.h @@ -15,8 +15,9 @@ #if HAVE_ARMV6 extern prototype_idct(vp8_short_idct4x4llm_1_v6); extern prototype_idct(vp8_short_idct4x4llm_v6_dual); -extern prototype_second_order(vp8_short_inv_walsh4x4_1_armv6); -extern prototype_second_order(vp8_short_inv_walsh4x4_armv6); +extern prototype_idct_scalar_add(vp8_dc_only_idct_add_v6); +extern prototype_second_order(vp8_short_inv_walsh4x4_1_v6); +extern prototype_second_order(vp8_short_inv_walsh4x4_v6); #undef vp8_idct_idct1 #define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6 @@ -24,16 +25,20 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_armv6); #undef vp8_idct_idct16 #define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual +#undef vp8_idct_idct1_scalar_add +#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6 + #undef vp8_idct_iwalsh1 -#define 
vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_armv6 +#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6 #undef vp8_idct_iwalsh16 -#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_armv6 +#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6 #endif #if HAVE_ARMV7 extern prototype_idct(vp8_short_idct4x4llm_1_neon); extern prototype_idct(vp8_short_idct4x4llm_neon); +extern prototype_idct_scalar_add(vp8_dc_only_idct_add_neon); extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon); extern prototype_second_order(vp8_short_inv_walsh4x4_neon); @@ -43,6 +48,9 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon); #undef vp8_idct_idct16 #define vp8_idct_idct16 vp8_short_idct4x4llm_neon +#undef vp8_idct_idct1_scalar_add +#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon + #undef vp8_idct_iwalsh1 #define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon diff --git a/vp8/common/arm/neon/dc_only_idct_add_neon.asm b/vp8/common/arm/neon/dc_only_idct_add_neon.asm new file mode 100644 index 000000000..e6f141fda --- /dev/null +++ b/vp8/common/arm/neon/dc_only_idct_add_neon.asm @@ -0,0 +1,49 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_dc_only_idct_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr, +; unsigned char *dst_ptr, int pitch, int stride) +; r0 input_dc +; r1 pred_ptr +; r2 dst_ptr +; r3 pitch +; sp stride +|vp8_dc_only_idct_add_neon| PROC + add r0, r0, #4 + asr r0, r0, #3 + ldr r12, [sp] + vdup.16 q0, r0 + + vld1.32 {d2[0]}, [r1], r3 + vld1.32 {d2[1]}, [r1], r3 + vld1.32 {d4[0]}, [r1], r3 + vld1.32 {d4[1]}, [r1] + + vaddw.u8 q1, q0, d2 + vaddw.u8 q2, q0, d4 + + vqmovun.s16 d2, q1 + vqmovun.s16 d4, q2 + + vst1.32 {d2[0]}, [r2], r12 + vst1.32 {d2[1]}, [r2], r12 + vst1.32 {d4[0]}, [r2], r12 + vst1.32 {d4[1]}, [r2] + + bx lr + + ENDP + END |