diff options
Diffstat (limited to 'vp8/common/arm')
-rw-r--r-- | vp8/common/arm/arm_systemdependent.c | 2 | ||||
-rw-r--r-- | vp8/common/arm/armv6/iwalsh_v6.asm | 124 | ||||
-rw-r--r-- | vp8/common/arm/idct_arm.h | 6 | ||||
-rw-r--r-- | vp8/common/arm/neon/iwalsh_neon.asm | 37 |
4 files changed, 76 insertions, 93 deletions
diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c index b5f194d3d..cd55a6377 100644 --- a/vp8/common/arm/arm_systemdependent.c +++ b/vp8/common/arm/arm_systemdependent.c @@ -46,7 +46,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6; rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual; - rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_v6; rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_v6; rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6; @@ -80,7 +79,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon; rtcd->idct.idct16 = vp8_short_idct4x4llm_neon; - rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon; rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon; rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon; diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm index 463bff0f5..31ef09cad 100644 --- a/vp8/common/arm/armv6/iwalsh_v6.asm +++ b/vp8/common/arm/armv6/iwalsh_v6.asm @@ -9,7 +9,6 @@ ; EXPORT |vp8_short_inv_walsh4x4_v6| - EXPORT |vp8_short_inv_walsh4x4_1_v6| ARM REQUIRE8 @@ -17,19 +16,19 @@ AREA |.text|, CODE, READONLY ; name this block of code -;short vp8_short_inv_walsh4x4_v6(short *input, short *output) +;short vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff) |vp8_short_inv_walsh4x4_v6| PROC - stmdb sp!, {r4 - r11, lr} + stmdb sp!, {r4 - r12, lr} - ldr r2, [r0], #4 ; [1 | 0] - ldr r3, [r0], #4 ; [3 | 2] - ldr r4, [r0], #4 ; [5 | 4] - ldr r5, [r0], #4 ; [7 | 6] - ldr r6, [r0], #4 ; [9 | 8] - ldr r7, [r0], #4 ; [11 | 10] - ldr r8, [r0], #4 ; [13 | 12] - ldr r9, [r0] ; [15 | 14] + ldr r2, [r0, #0] ; [1 | 0] + ldr r3, [r0, #4] ; [3 | 2] + ldr r4, [r0, #8] ; [5 | 4] + ldr r5, [r0, #12] ; [7 | 6] + ldr r6, [r0, #16] ; [9 | 8] + ldr r7, [r0, #20] ; [11 | 10] + ldr r8, [r0, #24] ; [13 | 12] + ldr r9, [r0, #28] ; [15 | 14] qadd16 r10, r2, r8 ; a1 [1+13 | 0+12] qadd16 r11, r4, r6 ; b1 [5+9 | 4+8] @@ -69,24 +68,27 @@ qadd16 r4, r4, r10 ; [b2+3|c2+3] qadd16 r5, r5, r10 ; [a2+3|d2+3] - asr r12, r2, #3 ; [1 | x] - pkhtb r12, r12, r3, asr #19; [1 | 0] - lsl lr, r3, #16 ; [~3 | x] - lsl r2, r2, #16 ; [~2 | x] - asr lr, lr, #3 ; [3 | x] - pkhtb lr, lr, r2, asr #19 ; [3 | 2] - - asr r2, r4, #3 ; [5 | x] - pkhtb r2, r2, r5, asr #19 ; [5 | 4] - lsl r3, r5, #16 ; [~7 | x] - lsl r4, r4, #16 ; [~6 | x] - asr r3, r3, #3 ; [7 | x] - pkhtb r3, r3, r4, asr #19 ; [7 | 6] - - str r12, [r1], #4 - str lr, [r1], #4 - str r2, [r1], #4 - str r3, [r1], #4 + asr r12, r3, #19 ; [0] + strh r12, [r1], #32 + asr lr, r2, #19 ; [1] + strh lr, [r1], #32 + sxth r2, r2 + sxth r3, r3 + asr r2, r2, #3 ; [2] + strh r2, [r1], #32 + asr r3, r3, #3 ; [3] + strh r3, [r1], #32 + + asr r12, r5, #19 ; [4] + strh r12, [r1], #32 + asr lr, r4, #19 ; [5] + strh lr, [r1], #32 + sxth r4, r4 + sxth r5, r5 + asr r4, r4, #3 ; [6] + strh r4, [r1], #32 + asr r5, r5, #3 ; [7] + strh r5, [r1], #32 qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11] qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11] @@ -103,50 +105,32 @@ qadd16 r8, r8, r10 ; [b2+3|c2+3] qadd16 r9, r9, r10 ; [a2+3|d2+3] - asr r2, r6, #3 ; [9 | x] - pkhtb r2, r2, r7, asr #19 ; [9 | 8] - lsl r3, r7, #16 ; [~11| x] - lsl r4, r6, #16 ; [~10| x] - asr r3, r3, #3 ; [11 | x] - pkhtb r3, r3, r4, asr #19 ; [11 | 10] - - asr r4, r8, #3 ; [13 | x] - pkhtb r4, r4, r9, asr #19 ; [13 | 12] - lsl r5, r9, #16 ; [~15| x] - lsl r6, r8, #16 ; [~14| x] - asr r5, r5, #3 ; [15 | x] - pkhtb r5, r5, r6, asr #19 ; [15 | 14] - - str r2, [r1], #4 - str r3, [r1], #4 - str r4, [r1], #4 - str r5, [r1] - - ldmia sp!, {r4 - r11, pc} + asr r12, r7, #19 ; [8] + strh r12, [r1], #32 + asr lr, r6, #19 ; [9] + strh lr, [r1], #32 + sxth r6, r6 + sxth r7, r7 + asr r6, r6, #3 ; [10] + strh r6, [r1], #32 + asr r7, r7, #3 ; [11] + strh r7, [r1], #32 + + asr r12, r9, #19 ; [12] + strh r12, [r1], #32 + asr lr, r8, #19 ; [13] + strh lr, [r1], #32 + sxth r8, r8 + sxth r9, r9 + asr r8, r8, #3 ; [14] + strh r8, [r1], #32 + asr r9, r9, #3 ; [15] + strh r9, [r1], #32 + + ldmia sp!, {r4 - r12, pc} ENDP ; |vp8_short_inv_walsh4x4_v6| -;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output) -|vp8_short_inv_walsh4x4_1_v6| PROC - - ldrsh r2, [r0] ; [0] - add r2, r2, #3 ; [0] + 3 - asr r2, r2, #3 ; a1 ([0]+3) >> 3 - lsl r2, r2, #16 ; [a1 | x] - orr r2, r2, r2, lsr #16 ; [a1 | a1] - - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1] - - bx lr - ENDP ; |vp8_short_inv_walsh4x4_1_v6| - ; Constant Pool c0x00030003 DCD 0x00030003 END diff --git a/vp8/common/arm/idct_arm.h b/vp8/common/arm/idct_arm.h index c710c2eb0..68c0cad11 100644 --- a/vp8/common/arm/idct_arm.h +++ b/vp8/common/arm/idct_arm.h @@ -25,9 +25,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_v6); #undef vp8_idct_idct1_scalar_add #define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6 -#undef vp8_idct_iwalsh1 -#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6 - #undef vp8_idct_iwalsh16 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6 #endif @@ -46,9 +43,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon); #undef vp8_idct_idct1_scalar_add #define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon -#undef vp8_idct_iwalsh1 -#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon - #undef vp8_idct_iwalsh16 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon #endif diff --git a/vp8/common/arm/neon/iwalsh_neon.asm b/vp8/common/arm/neon/iwalsh_neon.asm index 01c79d937..e8ea2a619 100644 --- a/vp8/common/arm/neon/iwalsh_neon.asm +++ b/vp8/common/arm/neon/iwalsh_neon.asm @@ -8,7 +8,6 @@ ; be found in the AUTHORS file in the root of the source tree. ; EXPORT |vp8_short_inv_walsh4x4_neon| - EXPORT |vp8_short_inv_walsh4x4_1_neon| ARM REQUIRE8 @@ -16,7 +15,7 @@ AREA |.text|, CODE, READONLY ; name this block of code -;short vp8_short_inv_walsh4x4_neon(short *input, short *output) +;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff) |vp8_short_inv_walsh4x4_neon| PROC ; read in all four lines of values: d0->d3 @@ -59,22 +58,30 @@ vshr.s16 q0, q0, #3 ;e/f >> 3 vshr.s16 q1, q1, #3 ;g/h >> 3 - vst4.i16 {d0,d1,d2,d3}, [r1@128] + mov r2, #64 + add r3, r1, #32 - bx lr - ENDP ; |vp8_short_inv_walsh4x4_neon| + vst1.i16 d0[0], [r1],r2 + vst1.i16 d1[0], [r3],r2 + vst1.i16 d2[0], [r1],r2 + vst1.i16 d3[0], [r3],r2 + + vst1.i16 d0[1], [r1],r2 + vst1.i16 d1[1], [r3],r2 + vst1.i16 d2[1], [r1],r2 + vst1.i16 d3[1], [r3],r2 + vst1.i16 d0[2], [r1],r2 + vst1.i16 d1[2], [r3],r2 + vst1.i16 d2[2], [r1],r2 + vst1.i16 d3[2], [r3],r2 + + vst1.i16 d0[3], [r1],r2 + vst1.i16 d1[3], [r3],r2 + vst1.i16 d2[3], [r1] + vst1.i16 d3[3], [r3] -;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output) -|vp8_short_inv_walsh4x4_1_neon| PROC - ldrsh r2, [r0] ; load input[0] - add r3, r2, #3 ; add 3 - add r2, r1, #16 ; base for last 8 output - asr r0, r3, #3 ; right shift 3 - vdup.16 q0, r0 ; load and duplicate - vst1.16 {q0}, [r1@128] ; write back 8 - vst1.16 {q0}, [r2@128] ; write back last 8 bx lr - ENDP ; |vp8_short_inv_walsh4x4_1_neon| + ENDP ; |vp8_short_inv_walsh4x4_neon| END |