;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_sixtap_predict8x4_armv6|

    AREA    |.text|, CODE, READONLY  ; name this block of code
;-------------------------------------
; vp8_sixtap_predict8x4_armv6
;
; Two-pass 6-tap filter that writes an 8-wide x 4-high block of bytes
; to dst_ptr. The first pass filters along each source row, the second
; pass filters down each column of the first-pass output.
;
; r0    unsigned char *src_ptr,
; r1    int  src_pixels_per_line,
; r2    int  xoffset,
; r3    int  yoffset,
; stack unsigned char *dst_ptr,
; stack int  dst_pitch
;
; xoffset / yoffset select a 16-byte entry of filter8_coeff (offset << 4).
; The table has 8 entries, so both are presumably in 0..7; this routine
; does not range-check them. An offset of 0 skips the corresponding pass.
;
; Saves r4-r11 and lr on entry; returns by popping the saved lr into pc.
;-------------------------------------
;note: In first pass, store the result in transpose(8linesx9columns) on stack. Temporary stack size is 184.
;Line width is 20 that is 9 short data plus 2 to make it 4bytes aligned. In second pass, load data from stack,
;and the result is stored in transpose.
;
; Stack frame after the prologue (S = sp on entry):
;   [S - 220]          saved yoffset (1 word)
;   [S - 216, S - 56)  temporary buffer: 8 lines of 20 bytes, one line per
;                      output column; each line holds 9 halfwords (source
;                      rows -2 .. +6) followed by 2 bytes of padding
;   [S - 56,  S - 36)  reserved but not used
;   [S - 36,  S)       saved r4-r11, lr
;   [S], [S + 4]       dst_ptr, dst_pitch
|vp8_sixtap_predict8x4_armv6| PROC
    stmdb       sp!, {r4 - r11, lr}
    str         r3, [sp, #-184]!            ;reserve space on stack for temporary storage, store yoffset

    cmp         r2, #0                      ;skip first_pass filter if xoffset=0
    add         lr, sp, #4                  ;point to temporary buffer
    beq         skip_firstpass_filter

;first-pass filter
    adr         r12, filter8_coeff
    sub         r0, r0, r1, lsl #1          ; start two rows above src_ptr

    add         r3, r1, #10                 ; preload next row
    pld         [r0, r3]

    add         r2, r12, r2, lsl #4         ;calculate filter location (16 bytes per entry)
    add         r0, r0, #3                  ;adjust src only for loading convenience

    ldr         r3, [r2]                    ; load up packed filter coefficients: tap0 | tap1 << 16
    ldr         r4, [r2, #4]                ; tap2 | tap3 << 16
    ldr         r5, [r2, #8]                ; tap4 | tap5 << 16

    mov         r2, #0x90000                ; height=9 is top part of counter

    sub         r1, r1, #8                  ; r1 = stride - 8; r0 advances by 8 bytes per row below

|first_pass_hloop_v6|
    ; With p[] = bytes of the current row starting 2 to the left of the
    ; first output pixel, load p[0..4].
    ldrb        r6, [r0, #-5]               ; load source data
    ldrb        r7, [r0, #-4]
    ldrb        r8, [r0, #-3]
    ldrb        r9, [r0, #-2]
    ldrb        r10, [r0, #-1]

    orr         r2, r2, #0x4                ; construct loop counter. width=8=4x2

    pkhbt       r6, r6, r7, lsl #16         ; r6 = p[0] | p[1] << 16
    pkhbt       r7, r7, r8, lsl #16         ; r7 = p[1] | p[2] << 16

    pkhbt       r8, r8, r9, lsl #16         ; r8 = p[2] | p[3] << 16
    pkhbt       r9, r9, r10, lsl #16        ; r9 = p[3] | p[4] << 16 (r10 still holds p[4] alone)

|first_pass_wloop_v6|
    ; Each iteration produces two adjacent outputs:
    ;   r11 = sum of tap[i] * p[i]     (i = 0..5)
    ;   r12 = sum of tap[i] * p[i + 1] (i = 0..5)
    smuad       r11, r6, r3                 ; vp8_filter[0], vp8_filter[1]
    smuad       r12, r7, r3

    ldrb        r6, [r0], #1                ; r6 = p[5]

    smlad       r11, r8, r4, r11            ; vp8_filter[2], vp8_filter[3]
    ldrb        r7, [r0], #1                ; r7 = p[6]
    smlad       r12, r9, r4, r12

    pkhbt       r10, r10, r6, lsl #16       ; r10 = p[4] | p[5] << 16
    pkhbt       r6, r6, r7, lsl #16         ; r6  = p[5] | p[6] << 16
    smlad       r11, r10, r5, r11           ; vp8_filter[4], vp8_filter[5]
    smlad       r12, r6, r5, r12

    sub         r2, r2, #1

    add         r11, r11, #0x40             ; round_shift_and_clamp: add 64, >> 7, saturate to 0..255
    tst         r2, #0xff                   ; test loop counter (flags are consumed by movne/bne below)
    usat        r11, #8, r11, asr #7
    add         r12, r12, #0x40
    strh        r11, [lr], #20              ; result is stored transposed: one 20-byte buffer line per output column
    usat        r12, #8, r12, asr #7
    strh        r12, [lr], #20

    ; If more outputs remain in this row, slide the window two pixels to
    ; the right; r11/r12 are only temporaries for the rotation.
    movne       r11, r6
    movne       r12, r7

    movne       r6, r8
    movne       r7, r9
    movne       r8, r10
    movne       r9, r11
    movne       r10, r12

    bne         first_pass_wloop_v6

    ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines
    ;;IF ARCHITECTURE=6
    ;pld        [src, ppl]
    ;;pld       [src, r9]
    ;;ENDIF

    subs        r2, r2, #0x10000            ; one source row done; Z set after the 9th row

    sub         lr, lr, #158                ; 8 stores advanced lr by 160: net +2 = next halfword slot of buffer line 0

    add         r0, r0, r1                  ; move to next input line

    add         r11, r1, #18                ; preload next row. adding back block width(=8), which is subtracted earlier
    pld         [r0, r11]

    bne         first_pass_hloop_v6

;second pass filter
secondpass_filter
    ldr         r3, [sp], #4                ; load back yoffset; sp now points at the temporary buffer
    ldr         r0, [sp, #216]              ; load dst address from stack 180+36
    ldr         r1, [sp, #220]              ; load dst stride from stack 180+40

    cmp         r3, #0
    beq         skip_secondpass_filter

    adr         r12, filter8_coeff
    add         lr, r12, r3, lsl #4         ;calculate filter location (16 bytes per entry)

    mov         r2, #0x00080000             ; 8 buffer lines (output columns) in the top part of the counter

    ldr         r3, [lr]                    ; load up packed filter coefficients: tap0 | tap1 << 16
    ldr         r4, [lr, #4]                ; tap2 | tap3 << 16
    ldr         r5, [lr, #8]                ; tap4 | tap5 << 16

    pkhbt       r12, r4, r3                 ; pack the filter differently: r12 = tap2 | tap1 << 16
    pkhbt       r11, r5, r4                 ; r11 = tap4 | tap3 << 16

second_pass_hloop_v6
    ; With s[] = the 9 halfwords of the current buffer line:
    ldr         r6, [sp]                    ; load the data: r6 = s[0] | s[1] << 16
    ldr         r7, [sp, #4]                ; r7 = s[2] | s[3] << 16

    orr         r2, r2, #2                  ; loop counter: 2 iterations x 2 outputs = 4 output rows

second_pass_wloop_v6
    ; lr  = filter applied at s[0..5] (even output row)
    ; r10 = filter applied at s[1..6] (odd output row); the samples stay
    ;       packed as for the even row, so the cross-multiplying smladx is
    ;       used with the re-packed taps in r12 / r11.
    smuad       lr, r3, r6                  ; apply filter: tap0*s[0] + tap1*s[1]
    smulbt      r10, r3, r6                 ; tap0*s[1]

    ldr         r8, [sp, #8]                ; r8 = s[4] | s[5] << 16

    smlad       lr, r4, r7, lr              ; + tap2*s[2] + tap3*s[3]
    smladx      r10, r12, r7, r10           ; + tap2*s[3] + tap1*s[2]

    ldrh        r9, [sp, #12]               ; r9 = s[6]

    smlad       lr, r5, r8, lr              ; + tap4*s[4] + tap5*s[5]
    smladx      r10, r11, r8, r10           ; + tap4*s[5] + tap3*s[4]

    add         sp, sp, #4                  ; advance two halfwords within the buffer line
    smlatb      r10, r5, r9, r10            ; + tap5*s[6]

    sub         r2, r2, #1

    add         lr, lr, #0x40               ; round_shift_and_clamp: add 64, >> 7, saturate to 0..255
    tst         r2, #0xff                   ; flags are consumed by movne/bne below
    usat        lr, #8, lr, asr #7
    add         r10, r10, #0x40
    strb        lr, [r0], r1                ; the result is transposed back and stored
    usat        r10, #8, r10, asr #7

    strb        r10, [r0],r1

    movne       r6, r7                      ; slide the window down by two rows
    movne       r7, r8

    bne         second_pass_wloop_v6

    subs        r2, r2, #0x10000            ; one output column done; Z set after the 8th
    add         sp, sp, #12                 ; update src for next loop (20-8)
    sub         r0, r0, r1, lsl #2          ; back up the 4 rows just written ...
    add         r0, r0, #1                  ; ... and step to the next output column

    bne         second_pass_hloop_v6

    add         sp, sp, #20                 ; release the unused tail of the reservation (180 - 160)
    ldmia       sp!, {r4 - r11, pc}

;--------------------
; xoffset == 0: copy the source bytes unfiltered into the transposed
; temporary buffer (9 rows x 8 bytes, starting two rows above src_ptr),
; widening each byte to a halfword.
skip_firstpass_filter
    sub         r0, r0, r1, lsl #1          ; start two rows above src_ptr
    sub         r1, r1, #8                  ; r1 = stride - 8; r0 advances by 8 bytes per row below
    mov         r2, #9                      ; 9 source rows

skip_firstpass_hloop
    ldrb        r4, [r0], #1                ; load data
    subs        r2, r2, #1                  ; flags are consumed by the bne at the end of the loop
    ldrb        r5, [r0], #1
    strh        r4, [lr], #20               ; store it to intermediate buffer
    ldrb        r6, [r0], #1                ; load data
    strh        r5, [lr], #20
    ldrb        r7, [r0], #1
    strh        r6, [lr], #20
    ldrb        r8, [r0], #1
    strh        r7, [lr], #20
    ldrb        r9, [r0], #1
    strh        r8, [lr], #20
    ldrb        r10, [r0], #1
    strh        r9, [lr], #20
    ldrb        r11, [r0], #1
    strh        r10, [lr], #20
    add         r0, r0, r1                  ; move to next input line
    strh        r11, [lr], #20

    sub         lr, lr, #158                ; move over to next column (net +2 after 8 x 20)
    bne         skip_firstpass_hloop

    b           secondpass_filter

;--------------------
; yoffset == 0: copy rows 0..3 of each buffer line straight to dst,
; narrowing each halfword back to a byte.
skip_secondpass_filter
    mov         r2, #8                      ; 8 buffer lines (output columns)
    add         sp, sp, #4                  ;start from src[0] instead of src[-2]

skip_secondpass_hloop
    ldr         r6, [sp], #4                ; rows 0 and 1 of this column
    subs        r2, r2, #1                  ; flags are consumed by the bne at the end of the loop
    ldr         r8, [sp], #4                ; rows 2 and 3 of this column

    mov         r7, r6, lsr #16             ; unpack
    strb        r6, [r0], r1
    mov         r9, r8, lsr #16
    strb        r7, [r0], r1
    add         sp, sp, #12                 ; 20-8
    strb        r8, [r0], r1
    strb        r9, [r0], r1

    sub         r0, r0, r1, lsl #2          ; back up the 4 rows just written ...
    add         r0, r0, #1                  ; ... and step to the next output column

    bne         skip_secondpass_hloop

    add         sp, sp, #16                 ; 180 - (160 +4)

    ldmia       sp!, {r4 - r11, pc}

    ENDP

;-----------------
;One word each is reserved. Label filter8_coeff can be used to access the data.
;Data address: filter8_coeff, filter8_coeff+4, filter8_coeff+8 ...
;
;Each entry is 4 words (16 bytes). The first three words pack the six taps
;as signed halfwords, lower-numbered tap in the low half:
;  word0 = tap0 | tap1 << 16, word1 = tap2 | tap3 << 16, word2 = tap4 | tap5 << 16
;The fourth word is zero and is never loaded by this routine; it pads the
;entry so that it can be addressed as filter8_coeff + (offset << 4).
filter8_coeff
    DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000
    DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000
    DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000
    DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000
    DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000
    DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000
    DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000
    DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000

;The same taps in unpacked form (tap0 .. tap5), for reference:
    ;DCD        0,  0,  128,    0,   0,  0
    ;DCD        0, -6,  123,   12,  -1,  0
    ;DCD        2, -11, 108,   36,  -8,  1
    ;DCD        0, -9,   93,   50,  -6,  0
    ;DCD        3, -16,  77,   77, -16,  3
    ;DCD        0, -6,   50,   93,  -9,  0
    ;DCD        1, -8,   36,  108, -11,  2
    ;DCD        0, -1,   12,  123,  -6,  0

    END