summaryrefslogtreecommitdiff
path: root/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
diff options
context:
space:
mode:
authorJohn Koleszar <jkoleszar@google.com>2010-05-18 11:58:33 -0400
committerJohn Koleszar <jkoleszar@google.com>2010-05-18 11:58:33 -0400
commit0ea50ce9cb4b65eee6afa1d041fe8beb5abda667 (patch)
tree1f3b9019f28bc56fd3156f96e5a9653a983ee61b /vp8/common/arm/armv6/sixtappredict8x4_v6.asm
downloadlibvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar
libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.gz
libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.bz2
libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.zip
Initial WebM release
Diffstat (limited to 'vp8/common/arm/armv6/sixtappredict8x4_v6.asm')
-rw-r--r--vp8/common/arm/armv6/sixtappredict8x4_v6.asm277
1 files changed, 277 insertions, 0 deletions
diff --git a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
new file mode 100644
index 000000000..551d863e9
--- /dev/null
+++ b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -0,0 +1,277 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict8x4_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+;-------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack unsigned char *dst_ptr,
+; stack int dst_pitch
+;-------------------------------------
+;note: In first pass, store the result in transpose(8linesx9columns) on stack. Temporary stack size is 184.
+;Line width is 20 that is 9 short data plus 2 to make it 4bytes aligned. In second pass, load data from stack,
+;and the result is stored in transpose.
+|vp8_sixtap_predict8x4_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+ sub sp, sp, #184 ;reserve space on stack for temporary storage: 20x(8+1) +4
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ str r3, [sp], #4 ;store yoffset
+ beq skip_firstpass_filter
+
+;first-pass filter
+ ldr r12, _filter8_coeff_
+ sub r0, r0, r1, lsl #1
+
+ add r2, r12, r2, lsl #4 ;calculate filter location
+ add r0, r0, #3 ;adjust src only for loading convinience
+
+ ldr r3, [r2] ; load up packed filter coefficients
+ ldr r4, [r2, #4]
+ ldr r5, [r2, #8]
+
+ mov r2, #0x90000 ; height=9 is top part of counter
+
+ sub r1, r1, #8
+ mov lr, #20
+
+|first_pass_hloop_v6|
+ ldrb r6, [r0, #-5] ; load source data
+ ldrb r7, [r0, #-4]
+ ldrb r8, [r0, #-3]
+ ldrb r9, [r0, #-2]
+ ldrb r10, [r0, #-1]
+
+ orr r2, r2, #0x4 ; construct loop counter. width=8=4x2
+
+ pkhbt r6, r6, r7, lsl #16 ; r7 | r6
+ pkhbt r7, r7, r8, lsl #16 ; r8 | r7
+
+ pkhbt r8, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+
+|first_pass_wloop_v6|
+ smuad r11, r6, r3 ; vp8_filter[0], vp8_filter[1]
+ smuad r12, r7, r3
+
+ ldrb r6, [r0], #1
+
+ smlad r11, r8, r4, r11 ; vp8_filter[2], vp8_filter[3]
+ ldrb r7, [r0], #1
+ smlad r12, r9, r4, r12
+
+ pkhbt r10, r10, r6, lsl #16 ; r10 | r9
+ pkhbt r6, r6, r7, lsl #16 ; r11 | r10
+ smlad r11, r10, r5, r11 ; vp8_filter[4], vp8_filter[5]
+ smlad r12, r6, r5, r12
+
+ sub r2, r2, #1
+
+ add r11, r11, #0x40 ; round_shift_and_clamp
+ tst r2, #0xff ; test loop counter
+ usat r11, #8, r11, asr #7
+ add r12, r12, #0x40
+ strh r11, [sp], lr ; result is transposed and stored, which
+ usat r12, #8, r12, asr #7
+
+ strh r12, [sp], lr
+
+ movne r11, r6
+ movne r12, r7
+
+ movne r6, r8
+ movne r7, r9
+ movne r8, r10
+ movne r9, r11
+ movne r10, r12
+
+ bne first_pass_wloop_v6
+
+ ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines
+ ;;IF ARCHITECTURE=6
+ ;pld [src, ppl]
+ ;;pld [src, r9]
+ ;;ENDIF
+
+ subs r2, r2, #0x10000
+
+ mov r6, #158
+ sub sp, sp, r6
+
+ add r0, r0, r1 ; move to next input line
+
+ bne first_pass_hloop_v6
+
+;second pass filter
+secondpass_filter
+ mov r1, #18
+ sub sp, sp, r1 ; 18+4
+
+ ldr r3, [sp, #-4] ; load back yoffset
+ ldr r0, [sp, #216] ; load dst address from stack 180+36
+ ldr r1, [sp, #220] ; load dst stride from stack 180+40
+
+ cmp r3, #0
+ beq skip_secondpass_filter
+
+ ldr r12, _filter8_coeff_
+ add lr, r12, r3, lsl #4 ;calculate filter location
+
+ mov r2, #0x00080000
+
+ ldr r3, [lr] ; load up packed filter coefficients
+ ldr r4, [lr, #4]
+ ldr r5, [lr, #8]
+
+ pkhbt r12, r4, r3 ; pack the filter differently
+ pkhbt r11, r5, r4
+
+second_pass_hloop_v6
+ ldr r6, [sp] ; load the data
+ ldr r7, [sp, #4]
+
+ orr r2, r2, #2 ; loop counter
+
+second_pass_wloop_v6
+ smuad lr, r3, r6 ; apply filter
+ smulbt r10, r3, r6
+
+ ldr r8, [sp, #8]
+
+ smlad lr, r4, r7, lr
+ smladx r10, r12, r7, r10
+
+ ldrh r9, [sp, #12]
+
+ smlad lr, r5, r8, lr
+ smladx r10, r11, r8, r10
+
+ add sp, sp, #4
+ smlatb r10, r5, r9, r10
+
+ sub r2, r2, #1
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ tst r2, #0xff
+ usat lr, #8, lr, asr #7
+ add r10, r10, #0x40
+ strb lr, [r0], r1 ; the result is transposed back and stored
+ usat r10, #8, r10, asr #7
+
+ strb r10, [r0],r1
+
+ movne r6, r7
+ movne r7, r8
+
+ bne second_pass_wloop_v6
+
+ subs r2, r2, #0x10000
+ add sp, sp, #12 ; updata src for next loop (20-8)
+ sub r0, r0, r1, lsl #2
+ add r0, r0, #1
+
+ bne second_pass_hloop_v6
+
+ add sp, sp, #20
+ ldmia sp!, {r4 - r11, pc}
+
+;--------------------
+skip_firstpass_filter
+ sub r0, r0, r1, lsl #1
+ sub r1, r1, #8
+ mov r2, #9
+ mov r3, #20
+
+skip_firstpass_hloop
+ ldrb r4, [r0], #1 ; load data
+ subs r2, r2, #1
+ ldrb r5, [r0], #1
+ strh r4, [sp], r3 ; store it to immediate buffer
+ ldrb r6, [r0], #1 ; load data
+ strh r5, [sp], r3
+ ldrb r7, [r0], #1
+ strh r6, [sp], r3
+ ldrb r8, [r0], #1
+ strh r7, [sp], r3
+ ldrb r9, [r0], #1
+ strh r8, [sp], r3
+ ldrb r10, [r0], #1
+ strh r9, [sp], r3
+ ldrb r11, [r0], #1
+ strh r10, [sp], r3
+ add r0, r0, r1 ; move to next input line
+ strh r11, [sp], r3
+
+ mov r4, #158
+ sub sp, sp, r4 ; move over to next column
+ bne skip_firstpass_hloop
+
+ b secondpass_filter
+
+;--------------------
+skip_secondpass_filter
+ mov r2, #8
+ add sp, sp, #4 ;start from src[0] instead of src[-2]
+
+skip_secondpass_hloop
+ ldr r6, [sp], #4
+ subs r2, r2, #1
+ ldr r8, [sp], #4
+
+ mov r7, r6, lsr #16 ; unpack
+ strb r6, [r0], r1
+ mov r9, r8, lsr #16
+ strb r7, [r0], r1
+ add sp, sp, #12 ; 20-8
+ strb r8, [r0], r1
+ strb r9, [r0], r1
+
+ sub r0, r0, r1, lsl #2
+ add r0, r0, #1
+
+ bne skip_secondpass_hloop
+
+ add sp, sp, #16 ; 180 - (160 +4)
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;-----------------
+ AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_filter8_coeff_
+ DCD filter8_coeff
+filter8_coeff
+ DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000
+ DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000
+ DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000
+ DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000
+ DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000
+ DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000
+ DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000
+ DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000
+
+ ;DCD 0, 0, 128, 0, 0, 0
+ ;DCD 0, -6, 123, 12, -1, 0
+ ;DCD 2, -11, 108, 36, -8, 1
+ ;DCD 0, -9, 93, 50, -6, 0
+ ;DCD 3, -16, 77, 77, -16, 3
+ ;DCD 0, -6, 50, 93, -9, 0
+ ;DCD 1, -8, 36, 108, -11, 2
+ ;DCD 0, -1, 12, 123, -6, 0
+
+ END