summary | refs | log | tree | commit | diff
path: root/vp8/common/arm/neon
diff options
context:
space:
mode:
author: Scott LaVarnway <slavarnway@google.com> 2011-11-17 12:54:42 -0500
committer: Tero Rintaluoma <teror@google.com> 2011-11-25 09:24:04 +0200
commit: 4a91541c946c1fc2655a942ec79033618f03c4ca (patch)
tree: 70093355ebd25dd2c79515f7950c8490f6937355 /vp8/common/arm/neon
parent: 7b0feac4a4386eef3e1ea851e52e4f30935e255d (diff)
download: libvpx-4a91541c946c1fc2655a942ec79033618f03c4ca.tar
libvpx-4a91541c946c1fc2655a942ec79033618f03c4ca.tar.gz
libvpx-4a91541c946c1fc2655a942ec79033618f03c4ca.tar.bz2
libvpx-4a91541c946c1fc2655a942ec79033618f03c4ca.zip
Modified the inverse Walsh transform to output directly
to the dqcoeff or qcoeff buffer. Previously, the encoder populated the DC coefficients of the Y blocks as a separate stage (recon_dcblock), and the decoder used a special version of the IDCT. This change eliminates the extra copy and reduces the code footprint. [Tero] Added the needed changes to the ARMv6 and NEON assembly. Change-Id: I83202ffdbaf83f6e5dd69f4ba2519fcf0b13b3ba
Diffstat (limited to 'vp8/common/arm/neon')
-rw-r--r-- vp8/common/arm/neon/iwalsh_neon.asm | 37
1 files changed, 22 insertions, 15 deletions
diff --git a/vp8/common/arm/neon/iwalsh_neon.asm b/vp8/common/arm/neon/iwalsh_neon.asm
index 01c79d937..e8ea2a619 100644
--- a/vp8/common/arm/neon/iwalsh_neon.asm
+++ b/vp8/common/arm/neon/iwalsh_neon.asm
@@ -8,7 +8,6 @@
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_inv_walsh4x4_neon|
- EXPORT |vp8_short_inv_walsh4x4_1_neon|
ARM
REQUIRE8
@@ -16,7 +15,7 @@
AREA |.text|, CODE, READONLY ; name this block of code
-;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
+;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)
|vp8_short_inv_walsh4x4_neon| PROC
; read in all four lines of values: d0->d3
@@ -59,22 +58,30 @@
vshr.s16 q0, q0, #3 ;e/f >> 3
vshr.s16 q1, q1, #3 ;g/h >> 3
- vst4.i16 {d0,d1,d2,d3}, [r1@128]
+ mov r2, #64
+ add r3, r1, #32
- bx lr
- ENDP ; |vp8_short_inv_walsh4x4_neon|
+ vst1.i16 d0[0], [r1],r2
+ vst1.i16 d1[0], [r3],r2
+ vst1.i16 d2[0], [r1],r2
+ vst1.i16 d3[0], [r3],r2
+
+ vst1.i16 d0[1], [r1],r2
+ vst1.i16 d1[1], [r3],r2
+ vst1.i16 d2[1], [r1],r2
+ vst1.i16 d3[1], [r3],r2
+ vst1.i16 d0[2], [r1],r2
+ vst1.i16 d1[2], [r3],r2
+ vst1.i16 d2[2], [r1],r2
+ vst1.i16 d3[2], [r3],r2
+
+ vst1.i16 d0[3], [r1],r2
+ vst1.i16 d1[3], [r3],r2
+ vst1.i16 d2[3], [r1]
+ vst1.i16 d3[3], [r3]
-;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
-|vp8_short_inv_walsh4x4_1_neon| PROC
- ldrsh r2, [r0] ; load input[0]
- add r3, r2, #3 ; add 3
- add r2, r1, #16 ; base for last 8 output
- asr r0, r3, #3 ; right shift 3
- vdup.16 q0, r0 ; load and duplicate
- vst1.16 {q0}, [r1@128] ; write back 8
- vst1.16 {q0}, [r2@128] ; write back last 8
bx lr
- ENDP ; |vp8_short_inv_walsh4x4_1_neon|
+ ENDP ; |vp8_short_inv_walsh4x4_neon|
END