summary refs log tree commit diff
path: root/vp8/encoder/arm/neon
diff options
context:
space:
mode:
author    Yunqing Wang <yunqingwang@google.com>  2014-04-28 14:42:23 -0700
committer Yunqing Wang <yunqingwang@google.com>  2014-04-28 14:51:53 -0700
commit   33df6d1fc1d268b4901b74b4141f83594266f041 (patch)
tree     87031fef033550336f5d32c3cfe55ae4cf6d2e09 /vp8/encoder/arm/neon
parent   5ba44e37a470be7ec74f717c293cfcb864c84a0d (diff)
download libvpx-33df6d1fc1d268b4901b74b4141f83594266f041.tar
libvpx-33df6d1fc1d268b4901b74b4141f83594266f041.tar.gz
libvpx-33df6d1fc1d268b4901b74b4141f83594266f041.tar.bz2
libvpx-33df6d1fc1d268b4901b74b4141f83594266f041.zip
Save NEON registers in VP8 NEON functions
Recent compilers can generate optimized code that uses NEON registers for various operations besides floating-point operations. Therefore, saving callee-saved registers d8 - d15 only at the beginning of the encoder/decoder is not enough anymore. This patch adds register-saving code to the VP8 NEON functions that use those registers.

Change-Id: Ie9e44f5188cf410990c8aaaac68faceee9dffd31
Diffstat (limited to 'vp8/encoder/arm/neon')
-rw-r--r--  vp8/encoder/arm/neon/subtract_neon.asm    | 14
-rw-r--r--  vp8/encoder/arm/neon/vp8_memcpy_neon.asm  |  2
-rw-r--r--  vp8/encoder/arm/neon/vp8_mse16x16_neon.asm |  9
3 files changed, 20 insertions, 5 deletions
diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm
index 5bda78678..840cb33d9 100644
--- a/vp8/encoder/arm/neon/subtract_neon.asm
+++ b/vp8/encoder/arm/neon/subtract_neon.asm
@@ -65,8 +65,10 @@
; unsigned char *pred, int pred_stride)
|vp8_subtract_mby_neon| PROC
push {r4-r7}
+ vpush {d8-d15}
+
mov r12, #4
- ldr r4, [sp, #16] ; pred_stride
+ ldr r4, [sp, #80] ; pred_stride
mov r6, #32 ; "diff" stride x2
add r5, r0, #16 ; second diff pointer
@@ -101,6 +103,7 @@ subtract_mby_loop
subs r12, r12, #1
bne subtract_mby_loop
+ vpop {d8-d15}
pop {r4-r7}
bx lr
ENDP
@@ -112,9 +115,11 @@ subtract_mby_loop
|vp8_subtract_mbuv_neon| PROC
push {r4-r7}
- ldr r4, [sp, #16] ; upred
- ldr r5, [sp, #20] ; vpred
- ldr r6, [sp, #24] ; pred_stride
+ vpush {d8-d15}
+
+ ldr r4, [sp, #80] ; upred
+ ldr r5, [sp, #84] ; vpred
+ ldr r6, [sp, #88] ; pred_stride
add r0, r0, #512 ; short *udiff = diff + 256;
mov r12, #32 ; "diff" stride x2
add r7, r0, #16 ; second diff pointer
@@ -191,6 +196,7 @@ subtract_mby_loop
vst1.16 {q14}, [r0], r12
vst1.16 {q15}, [r7], r12
+ vpop {d8-d15}
pop {r4-r7}
bx lr
diff --git a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
index 5b9f11e59..d219e2d14 100644
--- a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
@@ -21,6 +21,7 @@
;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr,
; int sz);
|vp8_memcpy_partial_neon| PROC
+ vpush {d8-d15}
;pld [r1] ;preload pred data
;pld [r1, #128]
;pld [r1, #256]
@@ -64,6 +65,7 @@ extra_copy_neon_loop
bne extra_copy_neon_loop
done_copy_neon_loop
+ vpop {d8-d15}
bx lr
ENDP
diff --git a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
index 55edbf512..f82af3ee3 100644
--- a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
@@ -27,6 +27,8 @@
;from vp8_variance().
|vp8_mse16x16_neon| PROC
+ vpush {q7}
+
vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse
vmov.i8 q8, #0
vmov.i8 q9, #0
@@ -62,7 +64,7 @@ mse16x16_neon_loop
vadd.u32 q7, q7, q8
vadd.u32 q9, q9, q10
- ldr r12, [sp] ;load *sse from stack
+ ldr r12, [sp, #16] ;load *sse from stack
vadd.u32 q10, q7, q9
vpaddl.u32 q1, q10
@@ -71,6 +73,7 @@ mse16x16_neon_loop
vst1.32 {d0[0]}, [r12]
vmov.32 r0, d0[0]
+ vpop {q7}
bx lr
ENDP
@@ -82,6 +85,8 @@ mse16x16_neon_loop
; r2 unsigned char *ref_ptr,
; r3 int recon_stride
|vp8_get4x4sse_cs_neon| PROC
+ vpush {q7}
+
vld1.8 {d0}, [r0], r1 ;Load up source and reference
vld1.8 {d4}, [r2], r3
vld1.8 {d1}, [r0], r1
@@ -109,6 +114,8 @@ mse16x16_neon_loop
vadd.u64 d0, d2, d3
vmov.32 r0, d0[0]
+
+ vpop {q7}
bx lr
ENDP