summary | refs | log | tree | commit | diff
path: root/vp8/common/arm
diff options
context:
space:
mode:
Diffstat (limited to 'vp8/common/arm')
-rw-r--r-- vp8/common/arm/arm_systemdependent.c 2
-rw-r--r-- vp8/common/arm/armv6/iwalsh_v6.asm 124
-rw-r--r-- vp8/common/arm/idct_arm.h 6
-rw-r--r-- vp8/common/arm/neon/iwalsh_neon.asm 37
4 files changed, 76 insertions, 93 deletions
diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c
index b5f194d3d..cd55a6377 100644
--- a/vp8/common/arm/arm_systemdependent.c
+++ b/vp8/common/arm/arm_systemdependent.c
@@ -46,7 +46,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6;
rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual;
- rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_v6;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_v6;
rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6;
@@ -80,7 +79,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon;
rtcd->idct.idct16 = vp8_short_idct4x4llm_neon;
- rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon;
rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon;
diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm
index 463bff0f5..31ef09cad 100644
--- a/vp8/common/arm/armv6/iwalsh_v6.asm
+++ b/vp8/common/arm/armv6/iwalsh_v6.asm
@@ -9,7 +9,6 @@
;
EXPORT |vp8_short_inv_walsh4x4_v6|
- EXPORT |vp8_short_inv_walsh4x4_1_v6|
ARM
REQUIRE8
@@ -17,19 +16,19 @@
AREA |.text|, CODE, READONLY ; name this block of code
-;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
+;short vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff)
|vp8_short_inv_walsh4x4_v6| PROC
- stmdb sp!, {r4 - r11, lr}
+ stmdb sp!, {r4 - r12, lr}
- ldr r2, [r0], #4 ; [1 | 0]
- ldr r3, [r0], #4 ; [3 | 2]
- ldr r4, [r0], #4 ; [5 | 4]
- ldr r5, [r0], #4 ; [7 | 6]
- ldr r6, [r0], #4 ; [9 | 8]
- ldr r7, [r0], #4 ; [11 | 10]
- ldr r8, [r0], #4 ; [13 | 12]
- ldr r9, [r0] ; [15 | 14]
+ ldr r2, [r0, #0] ; [1 | 0]
+ ldr r3, [r0, #4] ; [3 | 2]
+ ldr r4, [r0, #8] ; [5 | 4]
+ ldr r5, [r0, #12] ; [7 | 6]
+ ldr r6, [r0, #16] ; [9 | 8]
+ ldr r7, [r0, #20] ; [11 | 10]
+ ldr r8, [r0, #24] ; [13 | 12]
+ ldr r9, [r0, #28] ; [15 | 14]
qadd16 r10, r2, r8 ; a1 [1+13 | 0+12]
qadd16 r11, r4, r6 ; b1 [5+9 | 4+8]
@@ -69,24 +68,27 @@
qadd16 r4, r4, r10 ; [b2+3|c2+3]
qadd16 r5, r5, r10 ; [a2+3|d2+3]
- asr r12, r2, #3 ; [1 | x]
- pkhtb r12, r12, r3, asr #19; [1 | 0]
- lsl lr, r3, #16 ; [~3 | x]
- lsl r2, r2, #16 ; [~2 | x]
- asr lr, lr, #3 ; [3 | x]
- pkhtb lr, lr, r2, asr #19 ; [3 | 2]
-
- asr r2, r4, #3 ; [5 | x]
- pkhtb r2, r2, r5, asr #19 ; [5 | 4]
- lsl r3, r5, #16 ; [~7 | x]
- lsl r4, r4, #16 ; [~6 | x]
- asr r3, r3, #3 ; [7 | x]
- pkhtb r3, r3, r4, asr #19 ; [7 | 6]
-
- str r12, [r1], #4
- str lr, [r1], #4
- str r2, [r1], #4
- str r3, [r1], #4
+ asr r12, r3, #19 ; [0]
+ strh r12, [r1], #32
+ asr lr, r2, #19 ; [1]
+ strh lr, [r1], #32
+ sxth r2, r2
+ sxth r3, r3
+ asr r2, r2, #3 ; [2]
+ strh r2, [r1], #32
+ asr r3, r3, #3 ; [3]
+ strh r3, [r1], #32
+
+ asr r12, r5, #19 ; [4]
+ strh r12, [r1], #32
+ asr lr, r4, #19 ; [5]
+ strh lr, [r1], #32
+ sxth r4, r4
+ sxth r5, r5
+ asr r4, r4, #3 ; [6]
+ strh r4, [r1], #32
+ asr r5, r5, #3 ; [7]
+ strh r5, [r1], #32
qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11]
qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11]
@@ -103,50 +105,32 @@
qadd16 r8, r8, r10 ; [b2+3|c2+3]
qadd16 r9, r9, r10 ; [a2+3|d2+3]
- asr r2, r6, #3 ; [9 | x]
- pkhtb r2, r2, r7, asr #19 ; [9 | 8]
- lsl r3, r7, #16 ; [~11| x]
- lsl r4, r6, #16 ; [~10| x]
- asr r3, r3, #3 ; [11 | x]
- pkhtb r3, r3, r4, asr #19 ; [11 | 10]
-
- asr r4, r8, #3 ; [13 | x]
- pkhtb r4, r4, r9, asr #19 ; [13 | 12]
- lsl r5, r9, #16 ; [~15| x]
- lsl r6, r8, #16 ; [~14| x]
- asr r5, r5, #3 ; [15 | x]
- pkhtb r5, r5, r6, asr #19 ; [15 | 14]
-
- str r2, [r1], #4
- str r3, [r1], #4
- str r4, [r1], #4
- str r5, [r1]
-
- ldmia sp!, {r4 - r11, pc}
+ asr r12, r7, #19 ; [8]
+ strh r12, [r1], #32
+ asr lr, r6, #19 ; [9]
+ strh lr, [r1], #32
+ sxth r6, r6
+ sxth r7, r7
+ asr r6, r6, #3 ; [10]
+ strh r6, [r1], #32
+ asr r7, r7, #3 ; [11]
+ strh r7, [r1], #32
+
+ asr r12, r9, #19 ; [12]
+ strh r12, [r1], #32
+ asr lr, r8, #19 ; [13]
+ strh lr, [r1], #32
+ sxth r8, r8
+ sxth r9, r9
+ asr r8, r8, #3 ; [14]
+ strh r8, [r1], #32
+ asr r9, r9, #3 ; [15]
+ strh r9, [r1], #32
+
+ ldmia sp!, {r4 - r12, pc}
ENDP ; |vp8_short_inv_walsh4x4_v6|
-;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
-|vp8_short_inv_walsh4x4_1_v6| PROC
-
- ldrsh r2, [r0] ; [0]
- add r2, r2, #3 ; [0] + 3
- asr r2, r2, #3 ; a1 ([0]+3) >> 3
- lsl r2, r2, #16 ; [a1 | x]
- orr r2, r2, r2, lsr #16 ; [a1 | a1]
-
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1]
-
- bx lr
- ENDP ; |vp8_short_inv_walsh4x4_1_v6|
-
; Constant Pool
c0x00030003 DCD 0x00030003
END
diff --git a/vp8/common/arm/idct_arm.h b/vp8/common/arm/idct_arm.h
index c710c2eb0..68c0cad11 100644
--- a/vp8/common/arm/idct_arm.h
+++ b/vp8/common/arm/idct_arm.h
@@ -25,9 +25,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
#undef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6
-#undef vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6
-
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6
#endif
@@ -46,9 +43,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
#undef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon
-#undef vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon
-
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon
#endif
diff --git a/vp8/common/arm/neon/iwalsh_neon.asm b/vp8/common/arm/neon/iwalsh_neon.asm
index 01c79d937..e8ea2a619 100644
--- a/vp8/common/arm/neon/iwalsh_neon.asm
+++ b/vp8/common/arm/neon/iwalsh_neon.asm
@@ -8,7 +8,6 @@
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_inv_walsh4x4_neon|
- EXPORT |vp8_short_inv_walsh4x4_1_neon|
ARM
REQUIRE8
@@ -16,7 +15,7 @@
AREA |.text|, CODE, READONLY ; name this block of code
-;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
+;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)
|vp8_short_inv_walsh4x4_neon| PROC
; read in all four lines of values: d0->d3
@@ -59,22 +58,30 @@
vshr.s16 q0, q0, #3 ;e/f >> 3
vshr.s16 q1, q1, #3 ;g/h >> 3
- vst4.i16 {d0,d1,d2,d3}, [r1@128]
+ mov r2, #64
+ add r3, r1, #32
- bx lr
- ENDP ; |vp8_short_inv_walsh4x4_neon|
+ vst1.i16 d0[0], [r1],r2
+ vst1.i16 d1[0], [r3],r2
+ vst1.i16 d2[0], [r1],r2
+ vst1.i16 d3[0], [r3],r2
+
+ vst1.i16 d0[1], [r1],r2
+ vst1.i16 d1[1], [r3],r2
+ vst1.i16 d2[1], [r1],r2
+ vst1.i16 d3[1], [r3],r2
+ vst1.i16 d0[2], [r1],r2
+ vst1.i16 d1[2], [r3],r2
+ vst1.i16 d2[2], [r1],r2
+ vst1.i16 d3[2], [r3],r2
+
+ vst1.i16 d0[3], [r1],r2
+ vst1.i16 d1[3], [r3],r2
+ vst1.i16 d2[3], [r1]
+ vst1.i16 d3[3], [r3]
-;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
-|vp8_short_inv_walsh4x4_1_neon| PROC
- ldrsh r2, [r0] ; load input[0]
- add r3, r2, #3 ; add 3
- add r2, r1, #16 ; base for last 8 output
- asr r0, r3, #3 ; right shift 3
- vdup.16 q0, r0 ; load and duplicate
- vst1.16 {q0}, [r1@128] ; write back 8
- vst1.16 {q0}, [r2@128] ; write back last 8
bx lr
- ENDP ; |vp8_short_inv_walsh4x4_1_neon|
+ ENDP ; |vp8_short_inv_walsh4x4_neon|
END