Diffstat (limited to 'vp8/common/arm')
53 files changed, 11846 insertions, 0 deletions
diff --git a/vp8/common/arm/armv6/bilinearfilter_v6.asm b/vp8/common/arm/armv6/bilinearfilter_v6.asm new file mode 100644 index 000000000..4428cf8ff --- /dev/null +++ b/vp8/common/arm/armv6/bilinearfilter_v6.asm @@ -0,0 +1,237 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_filter_block2d_bil_first_pass_armv6| + EXPORT |vp8_filter_block2d_bil_second_pass_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code + +;------------------------------------- +; r0 unsigned char *src_ptr, +; r1 unsigned short *output_ptr, +; r2 unsigned int src_pixels_per_line, +; r3 unsigned int output_height, +; stack unsigned int output_width, +; stack const short *vp8_filter +;------------------------------------- +; The output is transposed stroed in output array to make it easy for second pass filtering. +|vp8_filter_block2d_bil_first_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r4, [sp, #36] ; output width + + mov r12, r3 ; outer-loop counter + sub r2, r2, r4 ; src increment for height loop + + ;;IF ARCHITECTURE=6 + pld [r0] + ;;ENDIF + + ldr r5, [r11] ; load up filter coefficients + + mov r3, r3, lsl #1 ; output_height*2 + add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1) + + mov r11, r1 ; save output_ptr for each row + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_1st_filter + +|bil_height_loop_1st_v6| + ldrb r6, [r0] ; load source data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + mov lr, r4, lsr #2 ; 4-in-parellel loop counter + +|bil_width_loop_1st_v6| + ldrb r9, [r0, #3] + ldrb r10, [r0, #4] + + pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0] + pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1] + + smuad r6, r6, r5 ; apply the filter + pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2] + smuad r7, r7, r5 + pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3] + + smuad r8, r8, r5 + smuad r9, r9, r5 + + add r0, r0, #4 + subs lr, lr, #1 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #16, r6, asr #7 + usat r7, #16, r7, asr #7 + + strh r6, [r1], r3 ; result is transposed and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strh r7, [r1], r3 + add r9, r9, #0x40 + usat r8, #16, r8, asr #7 + usat r9, #16, r9, asr #7 + + strh r8, [r1], r3 ; result is transposed and stored + + ldrneb r6, [r0] ; load source data + strh r9, [r1], r3 + + ldrneb r7, [r0, #1] + ldrneb r8, [r0, #2] + + bne bil_width_loop_1st_v6 + + add r0, r0, r2 ; move to next input row + subs r12, r12, #1 + + ;;IF ARCHITECTURE=6 + pld [r0] + ;;ENDIF + + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_1st_v6 + + ldmia sp!, {r4 - r11, pc} + +|bil_null_1st_filter| +|bil_height_loop_null_1st| + mov lr, r4, lsr #2 ; loop counter + +|bil_width_loop_null_1st| + ldrb r6, [r0] ; load data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + ldrb r9, [r0, #3] + + strh r6, [r1], r3 ; store it to immediate buffer + add r0, r0, #4 + strh r7, [r1], r3 + subs lr, lr, #1 + strh r8, [r1], r3 + strh r9, [r1], r3 + + bne bil_width_loop_null_1st + + subs r12, r12, #1 + add r0, r0, r2 ; move to next input line + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_null_1st + + 
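; A packed coefficient word equal to 128 means the filter pair is
; (128, 0), i.e. an exact copy, so the null paths here skip the
; multiplies and simply widen each source byte into the transposed
; 16-bit buffer with the same striding stores.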
ldmia sp!, {r4 - r11, pc} + + ENDP ; |vp8_filter_block2d_bil_first_pass_armv6| + + +;--------------------------------- +; r0 unsigned short *src_ptr, +; r1 unsigned char *output_ptr, +; r2 int output_pitch, +; r3 unsigned int output_height, +; stack unsigned int output_width, +; stack const short *vp8_filter +;--------------------------------- +|vp8_filter_block2d_bil_second_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r4, [sp, #36] ; output width + + ldr r5, [r11] ; load up filter coefficients + mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix + mov r11, r1 + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_2nd_filter + +|bil_height_loop_2nd| + ldr r6, [r0] ; load the data + ldr r8, [r0, #4] + ldrh r10, [r0, #8] + mov lr, r3, lsr #2 ; loop counter + +|bil_width_loop_2nd| + pkhtb r7, r6, r8 ; src[1] | src[2] + pkhtb r9, r8, r10 ; src[3] | src[4] + + smuad r6, r6, r5 ; apply filter + smuad r8, r8, r5 ; apply filter + + subs lr, lr, #1 + + smuadx r7, r7, r5 ; apply filter + smuadx r9, r9, r5 ; apply filter + + add r0, r0, #8 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #8, r6, asr #7 + usat r7, #8, r7, asr #7 + strb r6, [r1], r2 ; the result is transposed back and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strb r7, [r1], r2 + add r9, r9, #0x40 + usat r8, #8, r8, asr #7 + usat r9, #8, r9, asr #7 + strb r8, [r1], r2 ; the result is transposed back and stored + + ldrne r6, [r0] ; load data + strb r9, [r1], r2 + ldrne r8, [r0, #4] + ldrneh r10, [r0, #8] + + bne bil_width_loop_2nd + + subs r12, r12, #1 + add r0, r0, #4 ; update src for next row + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_2nd + ldmia sp!, {r4 - r11, pc} + +|bil_null_2nd_filter| +|bil_height_loop_null_2nd| + mov lr, r3, lsr #2 + +|bil_width_loop_null_2nd| + ldr r6, [r0], #4 ; load data + subs lr, lr, #1 + ldr r8, [r0], #4 + + strb r6, [r1], r2 ; store data + mov r7, r6, lsr #16 + strb r7, [r1], r2 + mov r9, r8, lsr #16 + strb r8, [r1], r2 + strb r9, [r1], r2 + + bne bil_width_loop_null_2nd + + subs r12, r12, #1 + add r0, r0, #4 + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_null_2nd + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_filter_block2d_second_pass_armv6| + + END diff --git a/vp8/common/arm/armv6/copymem16x16_v6.asm b/vp8/common/arm/armv6/copymem16x16_v6.asm new file mode 100644 index 000000000..00e97397c --- /dev/null +++ b/vp8/common/arm/armv6/copymem16x16_v6.asm @@ -0,0 +1,181 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
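Taken together, the two passes of the bilinear file above form a separable two-tap interpolator: the first pass filters horizontally into a transposed 16-bit scratch buffer (with one extra row so the second pass has its look-ahead tap), and the second pass filters along the transposed rows, which is vertical in the original orientation, narrowing back to bytes. A rough C model of that arithmetic follows; the function name, parameter list, and fixed scratch size are illustrative, not the library's own.

    /* Rough C model of the two-pass bilinear filter above. */
    static void bilinear_predict_sketch(const unsigned char *src, int src_pitch,
                                        unsigned char *dst, int dst_pitch,
                                        int w, int h,
                                        const short *hf, const short *vf)
    {
        unsigned short tmp[17 * 16];    /* transposed: w columns of h+1 entries */
        int i, j, v;

        /* First pass: horizontal 2-tap, rounded, stored transposed and widened. */
        for (i = 0; i < h + 1; i++)
            for (j = 0; j < w; j++)
                tmp[j * (h + 1) + i] =
                    (unsigned short)((src[i * src_pitch + j] * hf[0] +
                                      src[i * src_pitch + j + 1] * hf[1] + 64) >> 7);

        /* Second pass: 2-tap along the transposed rows (vertical in the
         * original image), rounded and clamped back to 8 bits. */
        for (i = 0; i < h; i++)
            for (j = 0; j < w; j++) {
                v = (tmp[j * (h + 1) + i] * vf[0] +
                     tmp[j * (h + 1) + i + 1] * vf[1] + 64) >> 7;
                dst[i * dst_pitch + j] = (unsigned char)(v > 255 ? 255 : v);
            }
    }

When a coefficient pair is (128, 0) the assembly takes its null-filter paths and degenerates to the straight copy this model would also produce.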
+; + + + EXPORT |vp8_copy_mem16x16_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem16x16_v6| PROC + stmdb sp!, {r4 - r7} + ;push {r4-r7} + + ;preload + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + ands r4, r0, #15 + beq copy_mem16x16_fast + + ands r4, r0, #7 + beq copy_mem16x16_8 + + ands r4, r0, #3 + beq copy_mem16x16_4 + + ;copy one byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + ldrb r6, [r0, #2] + ldrb r7, [r0, #3] + + mov r12, #16 + +copy_mem16x16_1_loop + strb r4, [r2] + strb r5, [r2, #1] + strb r6, [r2, #2] + strb r7, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + ldrb r6, [r0, #6] + ldrb r7, [r0, #7] + + subs r12, r12, #1 + + strb r4, [r2, #4] + strb r5, [r2, #5] + strb r6, [r2, #6] + strb r7, [r2, #7] + + ldrb r4, [r0, #8] + ldrb r5, [r0, #9] + ldrb r6, [r0, #10] + ldrb r7, [r0, #11] + + strb r4, [r2, #8] + strb r5, [r2, #9] + strb r6, [r2, #10] + strb r7, [r2, #11] + + ldrb r4, [r0, #12] + ldrb r5, [r0, #13] + ldrb r6, [r0, #14] + ldrb r7, [r0, #15] + + add r0, r0, r1 + + strb r4, [r2, #12] + strb r5, [r2, #13] + strb r6, [r2, #14] + strb r7, [r2, #15] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + ldrneb r6, [r0, #2] + ldrneb r7, [r0, #3] + + bne copy_mem16x16_1_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 4 bytes each time +copy_mem16x16_4 + ldr r4, [r0] + ldr r5, [r0, #4] + ldr r6, [r0, #8] + ldr r7, [r0, #12] + + mov r12, #16 + +copy_mem16x16_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + str r6, [r2, #8] + str r7, [r2, #12] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + ldrne r6, [r0, #8] + ldrne r7, [r0, #12] + + bne copy_mem16x16_4_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 8 bytes each time +copy_mem16x16_8 + sub r1, r1, #16 + sub r3, r3, #16 + + mov r12, #16 + +copy_mem16x16_8_loop + ldmia r0!, {r4-r5} + ;ldm r0, {r4-r5} + ldmia r0!, {r6-r7} + + add r0, r0, r1 + + stmia r2!, {r4-r5} + subs r12, r12, #1 + ;stm r2, {r4-r5} + stmia r2!, {r6-r7} + + add r2, r2, r3 + + bne copy_mem16x16_8_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 16 bytes each time +copy_mem16x16_fast + ;sub r1, r1, #16 + ;sub r3, r3, #16 + + mov r12, #16 + +copy_mem16x16_fast_loop + ldmia r0, {r4-r7} + ;ldm r0, {r4-r7} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r7} + ;stm r2, {r4-r7} + add r2, r2, r3 + + bne copy_mem16x16_fast_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + + ENDP ; |vp8_copy_mem16x16_v6| + + END diff --git a/vp8/common/arm/armv6/copymem8x4_v6.asm b/vp8/common/arm/armv6/copymem8x4_v6.asm new file mode 100644 index 000000000..94473ca65 --- /dev/null +++ b/vp8/common/arm/armv6/copymem8x4_v6.asm @@ -0,0 +1,127 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
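The copy routines in this group (vp8_copy_mem16x16_v6 above, vp8_copy_mem8x4_v6 and vp8_copy_mem8x8_v6 below) are plain block copies; the only subtlety is the dispatch on source alignment (ands r4, r0, #15/#7/#3), which selects the widest safe transfer, from ldmia/stmia of whole rows down to single bytes. Functionally each routine reduces to this C loop, with (w, h) of (16, 16), (8, 4) or (8, 8):

    #include <string.h>

    /* Functional equivalent of the copy routines; the assembly differs
     * only in how it widens the loads based on the source alignment. */
    static void copy_mem_sketch(const unsigned char *src, int src_stride,
                                unsigned char *dst, int dst_stride,
                                int w, int h)
    {
        int r;
        for (r = 0; r < h; r++) {
            memcpy(dst, src, (size_t)w);
            src += src_stride;
            dst += dst_stride;
        }
    }

Note that only the source pointer's alignment is tested; the destination is written with the same width regardless.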
+; + + + EXPORT |vp8_copy_mem8x4_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void vp8_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem8x4_v6| PROC + ;push {r4-r5} + stmdb sp!, {r4-r5} + + ;preload + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + ands r4, r0, #7 + beq copy_mem8x4_fast + + ands r4, r0, #3 + beq copy_mem8x4_4 + + ;copy 1 byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + + mov r12, #4 + +copy_mem8x4_1_loop + strb r4, [r2] + strb r5, [r2, #1] + + ldrb r4, [r0, #2] + ldrb r5, [r0, #3] + + subs r12, r12, #1 + + strb r4, [r2, #2] + strb r5, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + + strb r4, [r2, #4] + strb r5, [r2, #5] + + ldrb r4, [r0, #6] + ldrb r5, [r0, #7] + + add r0, r0, r1 + + strb r4, [r2, #6] + strb r5, [r2, #7] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + + bne copy_mem8x4_1_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 4 bytes each time +copy_mem8x4_4 + ldr r4, [r0] + ldr r5, [r0, #4] + + mov r12, #4 + +copy_mem8x4_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + + bne copy_mem8x4_4_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + +;copy 8 bytes each time +copy_mem8x4_fast + ;sub r1, r1, #8 + ;sub r3, r3, #8 + + mov r12, #4 + +copy_mem8x4_fast_loop + ldmia r0, {r4-r5} + ;ldm r0, {r4-r5} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r5} + ;stm r2, {r4-r5} + add r2, r2, r3 + + bne copy_mem8x4_fast_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + + ENDP ; |vp8_copy_mem8x4_v6| + + END diff --git a/vp8/common/arm/armv6/copymem8x8_v6.asm b/vp8/common/arm/armv6/copymem8x8_v6.asm new file mode 100644 index 000000000..7cfa53389 --- /dev/null +++ b/vp8/common/arm/armv6/copymem8x8_v6.asm @@ -0,0 +1,127 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_copy_mem8x8_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem8x8_v6| PROC + ;push {r4-r5} + stmdb sp!, {r4-r5} + + ;preload + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + ands r4, r0, #7 + beq copy_mem8x8_fast + + ands r4, r0, #3 + beq copy_mem8x8_4 + + ;copy 1 byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + + mov r12, #8 + +copy_mem8x8_1_loop + strb r4, [r2] + strb r5, [r2, #1] + + ldrb r4, [r0, #2] + ldrb r5, [r0, #3] + + subs r12, r12, #1 + + strb r4, [r2, #2] + strb r5, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + + strb r4, [r2, #4] + strb r5, [r2, #5] + + ldrb r4, [r0, #6] + ldrb r5, [r0, #7] + + add r0, r0, r1 + + strb r4, [r2, #6] + strb r5, [r2, #7] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + + bne copy_mem8x8_1_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 4 bytes each time +copy_mem8x8_4 + ldr r4, [r0] + ldr r5, [r0, #4] + + mov r12, #8 + +copy_mem8x8_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + + bne copy_mem8x8_4_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 8 bytes each time +copy_mem8x8_fast + ;sub r1, r1, #8 + ;sub r3, r3, #8 + + mov r12, #8 + +copy_mem8x8_fast_loop + ldmia r0, {r4-r5} + ;ldm r0, {r4-r5} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r5} + ;stm r2, {r4-r5} + add r2, r2, r3 + + bne copy_mem8x8_fast_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + + ENDP ; |vp8_copy_mem8x8_v6| + + END diff --git a/vp8/common/arm/armv6/filter_v6.asm b/vp8/common/arm/armv6/filter_v6.asm new file mode 100644 index 000000000..a7863fc94 --- /dev/null +++ b/vp8/common/arm/armv6/filter_v6.asm @@ -0,0 +1,383 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_filter_block2d_first_pass_armv6| + EXPORT |vp8_filter_block2d_second_pass_armv6| + EXPORT |vp8_filter_block2d_first_pass_only_armv6| + EXPORT |vp8_filter_block2d_second_pass_only_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code +;------------------------------------- +; r0 unsigned char *src_ptr +; r1 short *output_ptr +; r2 unsigned int src_pixels_per_line +; r3 unsigned int output_width +; stack unsigned int output_height +; stack const short *vp8_filter +;------------------------------------- +; vp8_filter the input and put in the output array. Apply the 6 tap FIR filter with +; the output being a 2 byte value and the intput being a 1 byte value. 
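In C terms, the first pass that follows computes a six-tap FIR over src[-2..+3] for every output position, adds the rounding constant 64, shifts right by 7, saturates to 8 bits, and stores the result transposed so the second pass can read it with unit stride. A sketch with illustrative names; dst_stride stands for the "squared off" transposed pitch the code builds in r12:

    /* Sketch of the first-pass 6-tap filter; f holds the six signed taps. */
    static void sixtap_first_pass_sketch(const unsigned char *src, short *dst,
                                         int src_pitch, int dst_stride,
                                         int w, int h, const short *f)
    {
        int i, j, k, sum;
        for (i = 0; i < h; i++)
            for (j = 0; j < w; j++) {
                sum = 64;                               /* rounding constant */
                for (k = 0; k < 6; k++)
                    sum += src[i * src_pitch + j + k - 2] * f[k];
                sum >>= 7;
                if (sum < 0)   sum = 0;                 /* usat ..., #8 */
                if (sum > 255) sum = 255;
                dst[j * dst_stride + i] = (short)sum;   /* transposed store */
            }
    }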
+|vp8_filter_block2d_first_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r7, [sp, #36] ; output height + + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + mov r3, r3, lsl #1 ; multiply width by 2 because using shorts + add r12, r3, #16 ; square off the output + sub sp, sp, #4 + + ;;IF ARCHITECTURE=6 + ;pld [r0, #-2] + ;;pld [r0, #30] + ;;ENDIF + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + str r1, [sp] ; push destination to stack + mov r7, r7, lsl #16 ; height is top part of counter + +; six tap filter +|height_loop_1st_6| + ldrb r8, [r0, #-2] ; load source data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + orr r7, r7, r3, lsr #2 ; construct loop counter + +|width_loop_1st_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + + smuad lr, lr, r4 ; apply the filter + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + smuad r8, r8, r4 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + sub r7, r7, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r11, r10, r6, r8 + + ands r10, r7, #0xff ; test loop counter + + add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] ; load data for next loop + usat lr, #8, lr, asr #7 + add r11, r11, #0x40 + ldrneb r9, [r0, #-1] + usat r11, #8, r11, asr #7 + + strh lr, [r1], r12 ; result is transposed and stored, which + ; will make second pass filtering easier. + ldrneb r10, [r0], #2 + strh r11, [r1], r12 + + bne width_loop_1st_6 + + ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines + ;;IF ARCHITECTURE=6 + ;pld [r0, r2] + ;;pld [r0, r9] + ;;ENDIF + + ldr r1, [sp] ; load and update dst address + subs r7, r7, #0x10000 + add r0, r0, r2 ; move to next input line + add r1, r1, #2 ; move over to next column + str r1, [sp] + + bne height_loop_1st_6 + + add sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +;--------------------------------- +; r0 short *src_ptr, +; r1 unsigned char *output_ptr, +; r2 unsigned int output_pitch, +; r3 unsigned int cnt, +; stack const short *vp8_filter +;--------------------------------- +|vp8_filter_block2d_second_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #36] ; vp8_filter address + sub sp, sp, #4 + mov r7, r3, lsl #16 ; height is top part of counter + str r1, [sp] ; push destination to stack + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + pkhbt r12, r5, r4 ; pack the filter differently + pkhbt r11, r6, r5 + + sub r0, r0, #4 ; offset input buffer + +|height_loop_2nd| + ldr r8, [r0] ; load the data + ldr r9, [r0, #4] + orr r7, r7, r3, lsr #1 ; loop counter + +|width_loop_2nd| + smuad lr, r4, r8 ; apply filter + sub r7, r7, #1 + smulbt r8, r4, r8 + + ldr r10, [r0, #8] + + smlad lr, r5, r9, lr + smladx r8, r12, r9, r8 + + ldrh r9, [r0, #12] + + smlad lr, r6, r10, lr + smladx r8, r11, r10, r8 + + add r0, r0, #4 + smlatb r10, r6, r9, r8 + + add lr, lr, #0x40 ; round_shift_and_clamp + ands r8, r7, #0xff + usat lr, #8, lr, asr #7 + add r10, r10, #0x40 + strb lr, [r1], r2 ; the result is transposed back and stored + usat r10, #8, r10, asr #7 + + ldrne r8, [r0] ; load data for next loop + ldrne r9, [r0, #4] + strb r10, [r1], r2 + + bne width_loop_2nd + + ldr 
r1, [sp] ; update dst for next loop + subs r7, r7, #0x10000 + add r0, r0, #16 ; updata src for next loop + add r1, r1, #1 + str r1, [sp] + + bne height_loop_2nd + + add sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +;------------------------------------ +; r0 unsigned char *src_ptr +; r1 unsigned char *output_ptr, +; r2 unsigned int src_pixels_per_line +; r3 unsigned int cnt, +; stack unsigned int output_pitch, +; stack const short *vp8_filter +;------------------------------------ +|vp8_filter_block2d_first_pass_only_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r4, [sp, #36] ; output pitch + ldr r11, [sp, #40] ; HFilter address + sub sp, sp, #8 + + mov r7, r3 + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + sub r4, r4, r3 + str r4, [sp] ; save modified output pitch + str r2, [sp, #4] + + mov r2, #0x40 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + +; six tap filter +|height_loop_1st_only_6| + ldrb r8, [r0, #-2] ; load data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + + mov r12, r3, lsr #1 ; loop counter + +|width_loop_1st_only_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + +;; smuad lr, lr, r4 + smlad lr, lr, r4, r2 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 +;; smuad r8, r8, r4 + smlad r8, r8, r4, r2 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + subs r12, r12, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r10, r10, r6, r8 + +;; add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] ; load data for next loop + usat lr, #8, lr, asr #7 +;; add r10, r10, #0x40 + strb lr, [r1], #1 ; store the result + usat r10, #8, r10, asr #7 + + ldrneb r9, [r0, #-1] + strb r10, [r1], #1 + ldrneb r10, [r0], #2 + + bne width_loop_1st_only_6 + + ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines + ;;IF ARCHITECTURE=6 + ;pld [r0, r2] + ;;pld [r0, r9] + ;;ENDIF + + ldr lr, [sp] ; load back output pitch + ldr r12, [sp, #4] ; load back output pitch + subs r7, r7, #1 + add r0, r0, r12 ; updata src for next loop + add r1, r1, lr ; update dst for next loop + + bne height_loop_1st_only_6 + + add sp, sp, #8 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_filter_block2d_first_pass_only_armv6| + + +;------------------------------------ +; r0 unsigned char *src_ptr, +; r1 unsigned char *output_ptr, +; r2 unsigned int src_pixels_per_line +; r3 unsigned int cnt, +; stack unsigned int output_pitch, +; stack const short *vp8_filter +;------------------------------------ +|vp8_filter_block2d_second_pass_only_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; VFilter address + ldr r12, [sp, #36] ; output pitch + + mov r7, r3, lsl #16 ; height is top part of counter + sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after + + sub sp, sp, #8 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + str r0, [sp] ; save r0 to stack + str r1, [sp, #4] ; save dst to stack + +; six tap filter +|width_loop_2nd_only_6| + ldrb r8, [r0], r2 ; load data + orr r7, r7, r3 ; loop counter + ldrb r9, [r0], r2 + ldrb r10, [r0], r2 + +|height_loop_2nd_only_6| + ; filter first column in this inner loop, than, move to next colum. 
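; Each trip through this inner loop filters one output column two rows
; at a time: six source rows are gathered with post-indexed loads
; (ldrb ..., [r0], r2 steps down one line), two results are stored,
; and r0 is pulled back by 4*pitch so the six-tap window advances by
; two rows. The ldrneb loads at the bottom prefetch the next pair of
; rows only while the height counter in the low byte of r7 is nonzero.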
+ ldrb r11, [r0], r2 + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0], r2 + + smuad lr, lr, r4 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + smuad r8, r8, r4 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0], r2 + smlad r8, r11, r5, r8 + ldrb r11, [r0] + + sub r7, r7, #2 + sub r0, r0, r2, lsl #2 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r10, r10, r6, r8 + + ands r9, r7, #0xff + + add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0], r2 ; load data for next loop + usat lr, #8, lr, asr #7 + add r10, r10, #0x40 + strb lr, [r1], r12 ; store the result for the column + usat r10, #8, r10, asr #7 + + ldrneb r9, [r0], r2 + strb r10, [r1], r12 + ldrneb r10, [r0], r2 + + bne height_loop_2nd_only_6 + + ldr r0, [sp] + ldr r1, [sp, #4] + subs r7, r7, #0x10000 + add r0, r0, #1 ; move to filter next column + str r0, [sp] + add r1, r1, #1 + str r1, [sp, #4] + + bne width_loop_2nd_only_6 + + add sp, sp, #8 + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_filter_block2d_second_pass_only_armv6| + + END diff --git a/vp8/common/arm/armv6/idct_v6.asm b/vp8/common/arm/armv6/idct_v6.asm new file mode 100644 index 000000000..25c5165ec --- /dev/null +++ b/vp8/common/arm/armv6/idct_v6.asm @@ -0,0 +1,376 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14 + EXPORT |vp8_short_idct4x4llm_1_v6| + EXPORT |vp8_short_idct4x4llm_v6| + EXPORT |vp8_short_idct4x4llm_v6_scott| + EXPORT |vp8_short_idct4x4llm_v6_dual| + + EXPORT |vp8_dc_only_idct_armv6| + + AREA |.text|, CODE, READONLY + +;******************************************************************************** +;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch) +;* r0 INT16 * input +;* r1 INT16 * output +;* r2 INT32 pitch +;* bench: 3/5 +;******************************************************************************** + +|vp8_short_idct4x4llm_1_v6| PROC ; cycles in out pit + ; + ldrsh r0, [r0] ; load input[0] 1, r0 un 2 + add r0, r0, #4 ; 1 +4 + stmdb sp!, {r4, r5, lr} ; make room for wide writes 1 backup + mov r0, r0, asr #3 ; (input[0] + 4) >> 3 1, r0 req`d ^1 >> 3 + pkhbt r4, r0, r0, lsl #16 ; pack r0 into r4 1, r0 req`d ^1 pack + mov r5, r4 ; expand expand + + strd r4, [r1], r2 ; *output = r0, post inc 1 + strd r4, [r1], r2 ; 1 + strd r4, [r1], r2 ; 1 + strd r4, [r1] ; 1 + ; + ldmia sp!, {r4, r5, pc} ; replace vars, return restore + ENDP ; |vp8_short_idct4x4llm_1_v6| +;******************************************************************************** +;******************************************************************************** +;******************************************************************************** + +;******************************************************************************** +;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch) +;* r0 INT16 * input +;* r1 INT16 * output +;* r2 INT32 pitch +;* bench: +;******************************************************************************** + +|vp8_short_idct4x4llm_v6| PROC ; cycles in out pit + ; + stmdb sp!, {r4-r11, lr} ; backup registers 1 backup + ; + mov r4, #0x00004E00 ; 
1 cst + orr r4, r4, #0x0000007B ; cospi8sqrt2minus1 + mov r5, #0x00008A00 ; 1 cst + orr r5, r5, #0x0000008C ; sinpi8sqrt2 + ; + mov r6, #4 ; i=4 1 i +loop1 ; + ldrsh r12, [r0, #8] ; input[4] 1, r12 unavail 2 [4] + ldrsh r3, [r0, #24] ; input[12] 1, r3 unavail 2 [12] + ldrsh r8, [r0, #16] ; input[8] 1, r8 unavail 2 [8] + ldrsh r7, [r0], #0x2 ; input[0] 1, r7 unavail 2 ++ [0] + smulwb r10, r5, r12 ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1 t1 + smulwb r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16 1, r11 un 2, r3/r4 ^1 t2 + add r9, r7, r8 ; a1 = [0] + [8] 1 a1 + sub r7, r7, r8 ; b1 = [0] - [8] 1 b1 + add r11, r3, r11 ; temp2 1 + rsb r11, r11, r10 ; c1 = temp1 - temp2 1 c1 + smulwb r3, r5, r3 ; ([12] * sinpi8sqrt2) >> 16 1, r3 un 2, r3/r5 ^ 1 t2 + smulwb r10, r4, r12 ; ([4] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r12/r4 ^1 t1 + add r8, r7, r11 ; b1 + c1 1 b+c + strh r8, [r1, r2] ; out[pitch] = b1+c1 1 + sub r7, r7, r11 ; b1 - c1 1 b-c + add r10, r12, r10 ; temp1 1 + add r3, r10, r3 ; d1 = temp1 + temp2 1 d1 + add r10, r9, r3 ; a1 + d1 1 a+d + sub r3, r9, r3 ; a1 - d1 1 a-d + add r8, r2, r2 ; pitch * 2 1 p*2 + strh r7, [r1, r8] ; out[pitch*2] = b1-c1 1 + add r7, r2, r2, lsl #1 ; pitch * 3 1 p*3 + strh r3, [r1, r7] ; out[pitch*3] = a1-d1 1 + subs r6, r6, #1 ; i-- 1 -- + strh r10, [r1], #0x2 ; out[0] = a1+d1 1 ++ + bne loop1 ; if i>0, continue + ; + sub r1, r1, #8 ; set up out for next loop 1 -4 + ; for this iteration, input=prev output + mov r6, #4 ; i=4 1 i +; b returnfull +loop2 ; + ldrsh r11, [r1, #2] ; input[1] 1, r11 un 2 [1] + ldrsh r8, [r1, #6] ; input[3] 1, r8 un 2 [3] + ldrsh r3, [r1, #4] ; input[2] 1, r3 un 2 [2] + ldrsh r0, [r1] ; input[0] 1, r0 un 2 [0] + smulwb r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1 t1 + smulwb r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r8 ^1 t2 + add r7, r0, r3 ; a1 = [0] + [2] 1 a1 + sub r0, r0, r3 ; b1 = [0] - [2] 1 b1 + add r10, r8, r10 ; temp2 1 + rsb r9, r10, r9 ; c1 = temp1 - temp2 1 c1 + smulwb r8, r5, r8 ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1 t2 + smulwb r10, r4, r11 ; ([1] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r11 ^1 t1 + add r3, r0, r9 ; b1+c1 1 b+c + add r3, r3, #4 ; b1+c1+4 1 +4 + add r10, r11, r10 ; temp1 1 + mov r3, r3, asr #3 ; b1+c1+4 >> 3 1, r3 ^1 >>3 + strh r3, [r1, #2] ; out[1] = b1+c1 1 + add r10, r10, r8 ; d1 = temp1 + temp2 1 d1 + add r3, r7, r10 ; a1+d1 1 a+d + add r3, r3, #4 ; a1+d1+4 1 +4 + sub r7, r7, r10 ; a1-d1 1 a-d + add r7, r7, #4 ; a1-d1+4 1 +4 + mov r3, r3, asr #3 ; a1+d1+4 >> 3 1, r3 ^1 >>3 + mov r7, r7, asr #3 ; a1-d1+4 >> 3 1, r7 ^1 >>3 + strh r7, [r1, #6] ; out[3] = a1-d1 1 + sub r0, r0, r9 ; b1-c1 1 b-c + add r0, r0, #4 ; b1-c1+4 1 +4 + subs r6, r6, #1 ; i-- 1 -- + mov r0, r0, asr #3 ; b1-c1+4 >> 3 1, r0 ^1 >>3 + strh r0, [r1, #4] ; out[2] = b1-c1 1 + strh r3, [r1], r2 ; out[0] = a1+d1 1 +; add r1, r1, r2 ; out += pitch 1 ++ + bne loop2 ; if i>0, continue +returnfull ; + ldmia sp!, {r4 - r11, pc} ; replace vars, return restore + ENDP + +;******************************************************************************** +;******************************************************************************** +;******************************************************************************** + +;******************************************************************************** +;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch) +;* r0 INT16 * input +;* r1 INT16 * output +;* r2 INT32 pitch +;* bench: 
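All of the idct4x4llm variants in this file implement the same arithmetic. In scalar C it is roughly the following sketch: the fixed-point constants 0x4E7B = 20091 and 0x8A8C = 35468 are the Q16 values of cos(pi/8)*sqrt(2) - 1 and sin(pi/8)*sqrt(2) that the code builds in two mov/orr steps, and the assembly runs its second pass in place on the output rows rather than through a temporary.

    #define cospi8sqrt2minus1 20091     /* 0x4E7B */
    #define sinpi8sqrt2       35468     /* 0x8A8C */

    static void short_idct4x4llm_sketch(const short *input, short *output, int pitch)
    {
        const short *ip = input;
        short tmp[16], *op = tmp;
        int shortpitch = pitch >> 1;    /* pitch is in bytes */
        int i, a1, b1, c1, d1;

        for (i = 0; i < 4; i++) {       /* column pass */
            a1 = ip[0] + ip[8];
            b1 = ip[0] - ip[8];
            c1 = ((ip[4] * sinpi8sqrt2) >> 16)
               - (ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16));
            d1 = (ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16))
               + ((ip[12] * sinpi8sqrt2) >> 16);
            op[0]  = (short)(a1 + d1);
            op[4]  = (short)(b1 + c1);
            op[8]  = (short)(b1 - c1);
            op[12] = (short)(a1 - d1);
            ip++; op++;
        }
        ip = tmp; op = output;
        for (i = 0; i < 4; i++) {       /* row pass, with (x + 4) >> 3 rounding */
            a1 = ip[0] + ip[2];
            b1 = ip[0] - ip[2];
            c1 = ((ip[1] * sinpi8sqrt2) >> 16)
               - (ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16));
            d1 = (ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16))
               + ((ip[3] * sinpi8sqrt2) >> 16);
            op[0] = (short)((a1 + d1 + 4) >> 3);
            op[1] = (short)((b1 + c1 + 4) >> 3);
            op[2] = (short)((b1 - c1 + 4) >> 3);
            op[3] = (short)((a1 - d1 + 4) >> 3);
            ip += 4; op += shortpitch;
        }
    }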
+;******************************************************************************** + +|vp8_short_idct4x4llm_v6_scott| PROC ; cycles in out pit +; mov r0, #0 ; +; ldr r0, [r0] ; + stmdb sp!, {r4 - r11, lr} ; backup registers 1 backup + ; + mov r3, #0x00004E00 ; cos + orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 + mov r4, #0x00008A00 ; sin + orr r4, r4, #0x0000008C ; sinpi8sqrt2 + ; + mov r5, #0x2 ; i i + ; +short_idct4x4llm_v6_scott_loop1 ; + ldr r10, [r0, #(4*2)] ; i5 | i4 5,4 + ldr r11, [r0, #(12*2)] ; i13 | i12 13,12 + ; + smulwb r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16) lt1 + smulwb r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16) lt2 + ; + smulwb r12, r3, r10 ; ((ip[4] * cospi8sqrt2misu1) >> 16) l2t2 + smulwb r14, r4, r11 ; ((ip[12] * sinpi8sqrt2) >> 16) l2t1 + ; + add r6, r6, r7 ; partial c1 lt1-lt2 + add r12, r12, r14 ; partial d1 l2t2+l2t1 + ; + smulwt r14, r4, r10 ; ((ip[5] * sinpi8sqrt2) >> 16) ht1 + smulwt r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16) ht2 + ; + smulwt r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16) h2t1 + smulwt r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16) h2t2 + ; + add r7, r14, r7 ; partial c1_2 ht1+ht2 + sub r8, r8, r9 ; partial d1_2 h2t1-h2t2 + ; + pkhbt r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1 pack + pkhbt r12, r12, r8, lsl #16 ; partial d1_2 | partial d1_1 pack + ; + usub16 r6, r6, r10 ; c1_2 | c1_1 c + uadd16 r12, r12, r11 ; d1_2 | d1_1 d + ; + ldr r10, [r0, #0] ; i1 | i0 1,0 + ldr r11, [r0, #(8*2)] ; i9 | i10 9,10 + ; +;;;;;; add r0, r0, #0x4 ; +4 +;;;;;; add r1, r1, #0x4 ; +4 + ; + uadd16 r8, r10, r11 ; i1 + i9 | i0 + i8 aka a1 a + usub16 r9, r10, r11 ; i1 - i9 | i0 - i8 aka b1 b + ; + uadd16 r7, r8, r12 ; a1 + d1 pair a+d + usub16 r14, r8, r12 ; a1 - d1 pair a-d + ; + str r7, [r1] ; op[0] = a1 + d1 + str r14, [r1, r2] ; op[pitch*3] = a1 - d1 + ; + add r0, r0, #0x4 ; op[pitch] = b1 + c1 ++ + add r1, r1, #0x4 ; op[pitch*2] = b1 - c1 ++ + ; + subs r5, r5, #0x1 ; -- + bne short_idct4x4llm_v6_scott_loop1 ; + ; + sub r1, r1, #16 ; reset output ptr + mov r5, #0x4 ; + mov r0, r1 ; input = output + ; +short_idct4x4llm_v6_scott_loop2 ; + ; + subs r5, r5, #0x1 ; + bne short_idct4x4llm_v6_scott_loop2 ; + ; + ldmia sp!, {r4 - r11, pc} ; + ENDP ; + ; +;******************************************************************************** +;******************************************************************************** +;******************************************************************************** + +;******************************************************************************** +;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch) +;* r0 INT16 * input +;* r1 INT16 * output +;* r2 INT32 pitch +;* bench: +;******************************************************************************** + +|vp8_short_idct4x4llm_v6_dual| PROC ; cycles in out pit + ; + stmdb sp!, {r4-r11, lr} ; backup registers 1 backup + mov r3, #0x00004E00 ; cos + orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 + mov r4, #0x00008A00 ; sin + orr r4, r4, #0x0000008C ; sinpi8sqrt2 + mov r5, #0x2 ; i=2 i +loop1_dual + ldr r6, [r0, #(4*2)] ; i5 | i4 5|4 + ldr r12, [r0, #(12*2)] ; i13 | i12 13|12 + ldr r14, [r0, #(8*2)] ; i9 | i8 9|8 + + smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c + smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c + smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s + smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s + pkhbt r7, r7, r9, lsl #16 ; 5c | 4c + smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c + pkhbt r8, r8, 
r10, lsl #16 ; 5s | 4s + uadd16 r6, r6, r7 ; 5c+5 | 4c+4 + smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s + smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c + smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s + subs r5, r5, #0x1 ; i-- -- + pkhbt r9, r9, r11, lsl #16 ; 13c | 12c + ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0 + pkhbt r10, r10, r7, lsl #16 ; 13s | 12s + uadd16 r7, r12, r9 ; 13c+13 | 12c+12 + usub16 r7, r8, r7 ; c c + uadd16 r6, r6, r10 ; d d + uadd16 r10, r11, r14 ; a a + usub16 r8, r11, r14 ; b b + uadd16 r9, r10, r6 ; a+d a+d + usub16 r10, r10, r6 ; a-d a-d + uadd16 r6, r8, r7 ; b+c b+c + usub16 r7, r8, r7 ; b-c b-c + str r6, [r1, r2] ; o5 | o4 + add r6, r2, r2 ; pitch * 2 p2 + str r7, [r1, r6] ; o9 | o8 + add r6, r6, r2 ; pitch * 3 p3 + str r10, [r1, r6] ; o13 | o12 + str r9, [r1], #0x4 ; o1 | o0 ++ + bne loop1_dual ; + mov r5, #0x2 ; i=2 i + sub r0, r1, #8 ; reset input/output i/o +loop2_dual + ldr r6, [r0, r2] ; i5 | i4 5|4 + ldr r1, [r0] ; i1 | i0 1|0 + ldr r12, [r0, #0x4] ; i3 | i2 3|2 + add r14, r2, #0x4 ; pitch + 2 p+2 + ldr r14, [r0, r14] ; i7 | i6 7|6 + smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c + smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c + smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s + smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s + pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4 + pkhbt r7, r9, r7, lsl #16 ; 1c | 5c + pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 © tc1 + pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5 + uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2 + pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6 + uadd16 r10, r11, r9 ; a a + usub16 r9, r11, r9 ; b b + pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7 + subs r5, r5, #0x1 ; i-- -- + smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c + smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s + smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c + smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s + + pkhbt r7, r12, r7, lsl #16 ; 3c | 7c + pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1 + uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2 + usub16 r12, r8, r6 ; c (o1 | o5) c + uadd16 r6, r11, r1 ; d (o3 | o7) d + uadd16 r7, r10, r6 ; a+d a+d + mov r8, #0x4 ; set up 4's 4 + orr r8, r8, #0x40000 ; 4|4 + usub16 r6, r10, r6 ; a-d a-d + uadd16 r6, r6, r8 ; a-d+4 3|7 + uadd16 r7, r7, r8 ; a+d+4 0|4 + uadd16 r10, r9, r12 ; b+c b+c + usub16 r1, r9, r12 ; b-c b-c + uadd16 r10, r10, r8 ; b+c+4 1|5 + uadd16 r1, r1, r8 ; b-c+4 2|6 + mov r8, r10, asr #19 ; o1 >> 3 + strh r8, [r0, #2] ; o1 + mov r8, r1, asr #19 ; o2 >> 3 + strh r8, [r0, #4] ; o2 + mov r8, r6, asr #19 ; o3 >> 3 + strh r8, [r0, #6] ; o3 + mov r8, r7, asr #19 ; o0 >> 3 + strh r8, [r0], r2 ; o0 +p + sxth r10, r10 ; + mov r8, r10, asr #3 ; o5 >> 3 + strh r8, [r0, #2] ; o5 + sxth r1, r1 ; + mov r8, r1, asr #3 ; o6 >> 3 + strh r8, [r0, #4] ; o6 + sxth r6, r6 ; + mov r8, r6, asr #3 ; o7 >> 3 + strh r8, [r0, #6] ; o7 + sxth r7, r7 ; + mov r8, r7, asr #3 ; o4 >> 3 + strh r8, [r0], r2 ; o4 +p +;;;;; subs r5, r5, #0x1 ; i-- -- + bne loop2_dual ; + ; + ldmia sp!, {r4 - r11, pc} ; replace vars, return restore + ENDP + + +; sjl added 10/17/08 +;void dc_only_idct_armv6(short input_dc, short *output, int pitch) +|vp8_dc_only_idct_armv6| PROC + stmdb sp!, {r4 - r6, lr} + + add r0, r0, #0x4 + add r4, r1, r2 ; output + shortpitch + mov r0, r0, ASR #0x3 ;aka a1 + add r5, r1, r2, LSL #1 ; output + shortpitch * 2 + pkhbt r0, r0, r0, lsl #16 ; a1 | a1 + add r6, r5, r2 ; output + shortpitch * 3 + + str r0, [r1, #0] + str r0, [r1, 
#4] + + str r0, [r4, #0] + str r0, [r4, #4] + + str r0, [r5, #0] + str r0, [r5, #4] + + str r0, [r6, #0] + str r0, [r6, #4] + + + ldmia sp!, {r4 - r6, pc} + + ENDP ; |vp8_dc_only_idct_armv6| + + END diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm new file mode 100644 index 000000000..87475681f --- /dev/null +++ b/vp8/common/arm/armv6/iwalsh_v6.asm @@ -0,0 +1,151 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + EXPORT |vp8_short_inv_walsh4x4_armv6| + EXPORT |vp8_short_inv_walsh4x4_1_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY ; name this block of code + +;short vp8_short_inv_walsh4x4_armv6(short *input, short *output) +|vp8_short_inv_walsh4x4_armv6| PROC + + stmdb sp!, {r4 - r11, lr} + + ldr r2, [r0], #4 ; [1 | 0] + ldr r3, [r0], #4 ; [3 | 2] + ldr r4, [r0], #4 ; [5 | 4] + ldr r5, [r0], #4 ; [7 | 6] + ldr r6, [r0], #4 ; [9 | 8] + ldr r7, [r0], #4 ; [11 | 10] + ldr r8, [r0], #4 ; [13 | 12] + ldr r9, [r0] ; [15 | 14] + + qadd16 r10, r2, r8 ; a1 [1+13 | 0+12] + qadd16 r11, r4, r6 ; b1 [5+9 | 4+8] + qsub16 r12, r4, r6 ; c1 [5-9 | 4-8] + qsub16 lr, r2, r8 ; d1 [1-13 | 0-12] + + qadd16 r2, r10, r11 ; a1 + b1 [1 | 0] + qadd16 r4, r12, lr ; c1 + d1 [5 | 4] + qsub16 r6, r10, r11 ; a1 - b1 [9 | 8] + qsub16 r8, lr, r12 ; d1 - c1 [13 | 12] + + qadd16 r10, r3, r9 ; a1 [3+15 | 2+14] + qadd16 r11, r5, r7 ; b1 [7+11 | 6+10] + qsub16 r12, r5, r7 ; c1 [7-11 | 6-10] + qsub16 lr, r3, r9 ; d1 [3-15 | 2-14] + + qadd16 r3, r10, r11 ; a1 + b1 [3 | 2] + qadd16 r5, r12, lr ; c1 + d1 [7 | 6] + qsub16 r7, r10, r11 ; a1 - b1 [11 | 10] + qsub16 r9, lr, r12 ; d1 - c1 [15 | 14] + + ; first transform complete + + qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3] + qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3] + qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7] + qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7] + + qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1] + ldr r10, c0x00030003 + qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1] + + qadd16 r2, r2, r10 ; [b2+3|c2+3] + qadd16 r3, r3, r10 ; [a2+3|d2+3] + qadd16 r4, r4, r10 ; [b2+3|c2+3] + qadd16 r5, r5, r10 ; [a2+3|d2+3] + + asr r12, r2, #3 ; [1 | x] + pkhtb r12, r12, r3, asr #19; [1 | 0] + lsl lr, r3, #16 ; [~3 | x] + lsl r2, r2, #16 ; [~2 | x] + asr lr, lr, #3 ; [3 | x] + pkhtb lr, lr, r2, asr #19 ; [3 | 2] + + asr r2, r4, #3 ; [5 | x] + pkhtb r2, r2, r5, asr #19 ; [5 | 4] + lsl r3, r5, #16 ; [~7 | x] + lsl r4, r4, #16 ; [~6 | x] + asr r3, r3, #3 ; [7 | x] + pkhtb r3, r3, r4, asr #19 ; [7 | 6] + + str r12, [r1], #4 + str lr, [r1], #4 + str r2, [r1], #4 + str r3, [r1], #4 + + qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11] + qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11] + qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15] + qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15] + + qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1] + qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1] + + qadd16 r6, r6, r10 ; [b2+3|c2+3] + qadd16 r7, r7, r10 ; [a2+3|d2+3] + qadd16 r8, r8, r10 ; [b2+3|c2+3] + qadd16 r9, r9, r10 ; [a2+3|d2+3] + + asr r2, r6, #3 ; [9 | x] + pkhtb r2, r2, r7, asr #19 ; [9 | 
8] + lsl r3, r7, #16 ; [~11| x] + lsl r4, r6, #16 ; [~10| x] + asr r3, r3, #3 ; [11 | x] + pkhtb r3, r3, r4, asr #19 ; [11 | 10] + + asr r4, r8, #3 ; [13 | x] + pkhtb r4, r4, r9, asr #19 ; [13 | 12] + lsl r5, r9, #16 ; [~15| x] + lsl r6, r8, #16 ; [~14| x] + asr r5, r5, #3 ; [15 | x] + pkhtb r5, r5, r6, asr #19 ; [15 | 14] + + str r2, [r1], #4 + str r3, [r1], #4 + str r4, [r1], #4 + str r5, [r1] + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_short_inv_walsh4x4_armv6| + + +;short vp8_short_inv_walsh4x4_1_armv6(short *input, short *output) +|vp8_short_inv_walsh4x4_1_armv6| PROC + + ldrsh r2, [r0] ; [0] + add r2, r2, #3 ; [0] + 3 + asr r2, r2, #3 ; a1 ([0]+3) >> 3 + lsl r2, r2, #16 ; [a1 | x] + orr r2, r2, r2, lsr #16 ; [a1 | a1] + + str r2, [r1], #4 + str r2, [r1], #4 + str r2, [r1], #4 + str r2, [r1], #4 + str r2, [r1], #4 + str r2, [r1], #4 + str r2, [r1], #4 + str r2, [r1] + + bx lr + ENDP ; |vp8_short_inv_walsh4x4_1_armv6| + +; Constant Pool +c0x00030003 DCD 0x00030003 + END diff --git a/vp8/common/arm/armv6/loopfilter_v6.asm b/vp8/common/arm/armv6/loopfilter_v6.asm new file mode 100644 index 000000000..c2b02dc0a --- /dev/null +++ b/vp8/common/arm/armv6/loopfilter_v6.asm @@ -0,0 +1,1263 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_loop_filter_horizontal_edge_armv6| + EXPORT |vp8_mbloop_filter_horizontal_edge_armv6| + EXPORT |vp8_loop_filter_vertical_edge_armv6| + EXPORT |vp8_mbloop_filter_vertical_edge_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code + + MACRO + TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 + ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 + ; a0: 03 02 01 00 + ; a1: 13 12 11 10 + ; a2: 23 22 21 20 + ; a3: 33 32 31 30 + ; b3 b2 b1 b0 + + uxtb16 $b1, $a1 ; xx 12 xx 10 + uxtb16 $b0, $a0 ; xx 02 xx 00 + uxtb16 $b3, $a3 ; xx 32 xx 30 + uxtb16 $b2, $a2 ; xx 22 xx 20 + orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 + orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 + + uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 + uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 + uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 + uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 + orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 + orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 + + pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 + pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 + + pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 + pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 + MEND + + +src RN r0 +pstep RN r1 +count RN r5 + +;r0 unsigned char *src_ptr, +;r1 int src_pixel_step, +;r2 const char *flimit, +;r3 const char *limit, +;stack const char *thresh, +;stack int count + +;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. 
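The note above about flimit/limit/thresh describes the breakout test that the uqsub8/orr chains below accumulate four pixels at a time; per pixel it is equivalent to the following C, with flimit * 2 + limit precomputed exactly as the uadd8 pair at the top of each PROC does:

    #include <stdlib.h>     /* abs */

    /* Per-pixel form of the packed filter mask; returns 0xFF when the
     * edge should be filtered, 0 when it should be skipped. */
    static unsigned char filter_mask_sketch(int limit, int flimit,
                                            int p3, int p2, int p1, int p0,
                                            int q0, int q1, int q2, int q3)
    {
        unsigned char fail = 0;
        fail |= (abs(p3 - p2) > limit);
        fail |= (abs(p2 - p1) > limit);
        fail |= (abs(p1 - p0) > limit);
        fail |= (abs(q1 - q0) > limit);
        fail |= (abs(q2 - q1) > limit);
        fail |= (abs(q3 - q2) > limit);
        fail |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit);
        return (unsigned char)(fail - 1);   /* 0 -> 0xFF, 1 -> 0x00 */
    }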
+ +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_loop_filter_horizontal_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r6, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r9, [src], pstep ; p3 + ldr r4, [r2], #4 ; flimit + ldr r10, [src], pstep ; p2 + ldr r2, [r3], #4 ; limit + ldr r11, [src], pstep ; p1 + uadd8 r4, r4, r4 ; flimit * 2 + ldr r3, [r6], #4 ; thresh + mov count, count, lsl #1 ; 4-in-parallel + uadd8 r4, r4, r2 ; flimit * 2 + limit + +|Hnext8| + ; vp8_filter_mask() function + ; calculate breakout conditions + ldr r12, [src], pstep ; p0 + + uqsub8 r6, r9, r10 ; p3 - p2 + uqsub8 r7, r10, r9 ; p2 - p3 + uqsub8 r8, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + + orr r6, r6, r7 ; abs (p3-p2) + orr r8, r8, r10 ; abs (p2-p1) + uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask + uqsub8 r8, r8, r2 ; compare to limit + uqsub8 r6, r11, r12 ; p1 - p0 + orr lr, lr, r8 + uqsub8 r7, r12, r11 ; p0 - p1 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + orr r6, r6, r7 ; abs (p1-p0) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later + orr lr, lr, r7 + + uqsub8 r6, r11, r10 ; p1 - q1 + uqsub8 r7, r10, r11 ; q1 - p1 + uqsub8 r11, r12, r9 ; p0 - q0 + uqsub8 r12, r9, r12 ; q0 - p0 + orr r6, r6, r7 ; abs (p1-q1) + ldr r7, c0x7F7F7F7F + orr r12, r11, r12 ; abs (p0-q0) + ldr r11, [src], pstep ; q2 + uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 + and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r7, r9, r10 ; q0 - q1 + uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r6, r10, r9 ; q1 - q0 + uqsub8 r12, r12, r4 ; compare to flimit + uqsub8 r9, r11, r10 ; q2 - q1 + + orr lr, lr, r12 + + ldr r12, [src], pstep ; q3 + uqsub8 r10, r10, r11 ; q1 - q2 + orr r6, r7, r6 ; abs (q1-q0) + orr r10, r9, r10 ; abs (q2-q1) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r10, r10, r2 ; compare to limit + uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later + orr lr, lr, r7 + orr lr, lr, r10 + + uqsub8 r10, r12, r11 ; q3 - q2 + uqsub8 r9, r11, r12 ; q2 - q3 + + mvn r11, #0 ; r11 == -1 + + orr r10, r10, r9 ; abs (q3-q2) + uqsub8 r10, r10, r2 ; compare to limit + + mov r12, #0 + orr lr, lr, r10 + sub src, src, pstep, lsl #2 + + usub8 lr, r12, lr ; use usub8 instead of ssub8 + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq hskip_filter ; skip filtering + + sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines + + ;vp8_hevmask() function + ;calculate high edge variance + orr r10, r6, r8 ; calculate vp8_hevmask + + ldr r7, [src], pstep ; p1 + + usub8 r10, r12, r10 ; use usub8 instead of ssub8 + sel r6, r12, r11 ; obtain vp8_hevmask: r6 + + ;vp8_filter() function + ldr r8, [src], pstep ; p0 + ldr r12, c0x80808080 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + + eor r7, r7, r12 ; p1 offset to convert to a signed value + eor r8, r8, r12 ; p0 offset to convert to a signed value + eor r9, r9, r12 ; q0 offset to convert to a signed value + eor r10, r10, r12 ; q1 offset to convert to a signed value + + str r9, [sp] ; store qs0 temporarily + str r8, [sp, #4] ; store ps0 temporarily + str r10, [sp, #8] ; store qs1 temporarily + str r7, [sp, #12] ; store ps1 temporarily + + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + qsub8 r8, r9, r8 ; 
vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + + and r7, r7, r6 ; vp8_filter (r7) &= hev + + qadd8 r7, r7, r8 + ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 + + qadd8 r7, r7, r8 + ldr r10, c0x04040404 + + qadd8 r7, r7, r8 + and r7, r7, lr ; vp8_filter &= mask; + + ;modify code for vp8 -- Filter1 = vp8_filter (r7) + qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) + qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) + + mov r9, #0 + shadd8 r8 , r8 , r9 ; Filter2 >>= 3 + shadd8 r7 , r7 , r9 ; vp8_filter >>= 3 + shadd8 r8 , r8 , r9 + shadd8 r7 , r7 , r9 + shadd8 lr , r8 , r9 ; lr: Filter2 + shadd8 r7 , r7 , r9 ; r7: filter + + ;usub8 lr, r8, r10 ; s = (s==4)*-1 + ;sel lr, r11, r9 + ;usub8 r8, r10, r8 + ;sel r8, r11, r9 + ;and r8, r8, lr ; -1 for each element that equals 4 + + ;calculate output + ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter) + + ldr r8, [sp] ; load qs0 + ldr r9, [sp, #4] ; load ps0 + + ldr r10, c0x01010101 + + qsub8 r8 ,r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) + qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2) + + ;end of modification for vp8 + + mov lr, #0 + sadd8 r7, r7 , r10 ; vp8_filter += 1 + shadd8 r7, r7, lr ; vp8_filter >>= 1 + + ldr r11, [sp, #12] ; load ps1 + ldr r10, [sp, #8] ; load qs1 + + bic r7, r7, r6 ; vp8_filter &= ~hev + sub src, src, pstep, lsl #2 + + qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) + qsub8 r10, r10,r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) + + eor r11, r11, r12 ; *op1 = u^0x80 + str r11, [src], pstep ; store op1 + eor r9, r9, r12 ; *op0 = u^0x80 + str r9, [src], pstep ; store op0 result + eor r8, r8, r12 ; *oq0 = u^0x80 + str r8, [src], pstep ; store oq0 result + eor r10, r10, r12 ; *oq1 = u^0x80 + str r10, [src], pstep ; store oq1 + + sub src, src, pstep, lsl #1 + +|hskip_filter| + add src, src, #4 + sub src, src, pstep, lsl #2 + + subs count, count, #1 + + ;pld [src] + ;pld [src, pstep] + ;pld [src, pstep, lsl #1] + ;pld [src, pstep, lsl #2] + ;pld [src, pstep, lsl #3] + + ldrne r9, [src], pstep ; p3 + ldrne r10, [src], pstep ; p2 + ldrne r11, [src], pstep ; p1 + + bne Hnext8 + + add sp, sp, #16 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_loop_filter_horizontal_edge_armv6| + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_mbloop_filter_horizontal_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r6, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r9, [src], pstep ; p3 + ldr r4, [r2], #4 ; flimit + ldr r10, [src], pstep ; p2 + ldr r2, [r3], #4 ; limit + ldr r11, [src], pstep ; p1 + uadd8 r4, r4, r4 ; flimit * 2 + ldr r3, [r6], #4 ; thresh + mov count, count, lsl #1 ; 4-in-parallel + uadd8 r4, r4, r2 ; flimit * 2 + limit + +|MBHnext8| + + ; vp8_filter_mask() function + ; calculate breakout conditions + ldr r12, [src], pstep ; p0 + + uqsub8 r6, r9, r10 ; p3 - p2 + uqsub8 r7, r10, r9 ; p2 - p3 + uqsub8 r8, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + + orr r6, r6, r7 ; abs (p3-p2) + orr r8, r8, r10 ; abs (p2-p1) + uqsub8 lr, r6, r2 ; compare to limit. 
lr: vp8_filter_mask + uqsub8 r8, r8, r2 ; compare to limit + + uqsub8 r6, r11, r12 ; p1 - p0 + orr lr, lr, r8 + uqsub8 r7, r12, r11 ; p0 - p1 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + orr r6, r6, r7 ; abs (p1-p0) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later + orr lr, lr, r7 + + uqsub8 r6, r11, r10 ; p1 - q1 + uqsub8 r7, r10, r11 ; q1 - p1 + uqsub8 r11, r12, r9 ; p0 - q0 + uqsub8 r12, r9, r12 ; q0 - p0 + orr r6, r6, r7 ; abs (p1-q1) + ldr r7, c0x7F7F7F7F + orr r12, r11, r12 ; abs (p0-q0) + ldr r11, [src], pstep ; q2 + uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 + and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r7, r9, r10 ; q0 - q1 + uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r6, r10, r9 ; q1 - q0 + uqsub8 r12, r12, r4 ; compare to flimit + uqsub8 r9, r11, r10 ; q2 - q1 + + orr lr, lr, r12 + + ldr r12, [src], pstep ; q3 + + uqsub8 r10, r10, r11 ; q1 - q2 + orr r6, r7, r6 ; abs (q1-q0) + orr r10, r9, r10 ; abs (q2-q1) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r10, r10, r2 ; compare to limit + uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later + orr lr, lr, r7 + orr lr, lr, r10 + + uqsub8 r10, r12, r11 ; q3 - q2 + uqsub8 r9, r11, r12 ; q2 - q3 + + mvn r11, #0 ; r11 == -1 + + orr r10, r10, r9 ; abs (q3-q2) + uqsub8 r10, r10, r2 ; compare to limit + + mov r12, #0 + + orr lr, lr, r10 + + usub8 lr, r12, lr ; use usub8 instead of ssub8 + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq mbhskip_filter ; skip filtering + + ;vp8_hevmask() function + ;calculate high edge variance + sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines + sub src, src, pstep, lsl #1 + + orr r10, r6, r8 + ldr r7, [src], pstep ; p1 + + usub8 r10, r12, r10 + sel r6, r12, r11 ; hev mask: r6 + + ;vp8_mbfilter() function + ;p2, q2 are only needed at the end. Don't need to load them in now. 
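For orientation, the three "roughly n/7th difference" sections further down compute one common update value per pixel pair and apply it with tapering weights 27, 18 and 9. A scalar sketch follows; s8() stands in for vp8_signed_char_clamp, w is the masked filter value, and the real code also re-biases pixels by XOR 0x80 on the way in and out so signed saturating instructions can operate on unsigned data:

    /* Saturate to [-128, 127], like vp8_signed_char_clamp. */
    static signed char s8(int v)
    {
        return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
    }

    /* Sketch of the tapered updates below; w is vp8_filter & ~hev. */
    static void mb_apply_taps_sketch(signed char *op2, signed char *op1,
                                     signed char *op0, signed char *oq0,
                                     signed char *oq1, signed char *oq2, int w)
    {
        int u;
        u = s8((63 + w * 27) >> 7);     /* roughly 3/7 across the edge */
        *oq0 = s8(*oq0 - u); *op0 = s8(*op0 + u);
        u = s8((63 + w * 18) >> 7);     /* roughly 2/7 */
        *oq1 = s8(*oq1 - u); *op1 = s8(*op1 + u);
        u = s8((63 + w *  9) >> 7);     /* roughly 1/7 */
        *oq2 = s8(*oq2 - u); *op2 = s8(*op2 + u);
    }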
+ ldr r8, [src], pstep ; p0 + ldr r12, c0x80808080 + ldr r9, [src], pstep ; q0 + ldr r10, [src] ; q1 + + eor r7, r7, r12 ; ps1 + eor r8, r8, r12 ; ps0 + eor r9, r9, r12 ; qs0 + eor r10, r10, r12 ; qs1 + + qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + str r7, [sp, #12] ; store ps1 temporarily + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + str r10, [sp, #8] ; store qs1 temporarily + qadd8 r7, r7, r12 + str r9, [sp] ; store qs0 temporarily + qadd8 r7, r7, r12 + str r8, [sp, #4] ; store ps0 temporarily + qadd8 r7, r7, r12 ; vp8_filter: r7 + + ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 + ldr r9, c0x04040404 + + and r7, r7, lr ; vp8_filter &= mask (lr is free) + + mov r12, r7 ; Filter2: r12 + and r12, r12, r6 ; Filter2 &= hev + + ;modify code for vp8 + ;save bottom 3 bits so that we round one side +4 and the other +3 + qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) + qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) + + mov r10, #0 + shadd8 r8 , r8 , r10 ; Filter1 >>= 3 + shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + shadd8 r8 , r8 , r10 + shadd8 r12 , r12 , r10 + shadd8 r8 , r8 , r10 ; r8: Filter1 + shadd8 r12 , r12 , r10 ; r12: Filter2 + + ldr r9, [sp] ; load qs0 + ldr r11, [sp, #4] ; load ps0 + + qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) + qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) + + ;save bottom 3 bits so that we round one side +4 and the other +3 + ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) + ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4) + ;mov r10, #0 + ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + ;usub8 lr, r8, r9 ; s = (s==4)*-1 + ;sel lr, r11, r10 + ;shadd8 r12 , r12 , r10 + ;usub8 r8, r9, r8 + ;sel r8, r11, r10 + ;ldr r9, [sp] ; load qs0 + ;ldr r11, [sp, #4] ; load ps0 + ;shadd8 r12 , r12 , r10 + ;and r8, r8, lr ; -1 for each element that equals 4 + ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) + ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) + ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) + + ;end of modification for vp8 + + bic r12, r7, r6 ; vp8_filter &= ~hev ( r6 is free) + ;mov r12, r7 + + ;roughly 3/7th difference across boundary + mov lr, #0x1b ; 27 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r7, r10, lr, r7 + smultb r10, r10, lr + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + add r10, r10, #63 + ssat r7, #8, r7, asr #7 + ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r7, r10, lsl #16 + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) + + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) + qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) + eor r8, r8, lr ; *oq0 = s^0x80 + str r8, [src] ; store *oq0 + sub src, src, pstep + eor r10, r10, lr ; *op0 = s^0x80 + str r10, [src] ; store *op0 + + ;roughly 2/7th difference across boundary + mov lr, #0x12 ; 18 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r9, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r9, #8, r9, asr #7 + ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r9, r10, lsl #16 + + ldr r9, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load 
ps1 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + + qadd8 r11, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) + eor r11, r11, lr ; *op1 = s^0x80 + str r11, [src], pstep ; store *op1 + eor r8, r8, lr ; *oq1 = s^0x80 + add src, src, pstep, lsl #1 + + mov r7, #0x3f ; 63 + + str r8, [src], pstep ; store *oq1 + + ;roughly 1/7th difference across boundary + mov lr, #0x9 ; 9 + ldr r9, [src] ; load q2 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r12, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r12, #8, r12, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r12, r10, lsl #16 + + sub src, src, pstep + ldr lr, c0x80808080 + + ldr r11, [src] ; load p2 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + eor r9, r9, lr + eor r11, r11, lr + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + + qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) + qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) + eor r8, r8, lr ; *op2 = s^0x80 + str r8, [src], pstep, lsl #2 ; store *op2 + add src, src, pstep + eor r10, r10, lr ; *oq2 = s^0x80 + str r10, [src], pstep, lsl #1 ; store *oq2 + +|mbhskip_filter| + add src, src, #4 + sub src, src, pstep, lsl #3 + subs count, count, #1 + + ldrne r9, [src], pstep ; p3 + ldrne r10, [src], pstep ; p2 + ldrne r11, [src], pstep ; p1 + + bne MBHnext8 + + add sp, sp, #16 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_mbloop_filter_horizontal_edge_armv6| + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_loop_filter_vertical_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, #4 ; move src pointer down by 4 + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r12, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r6, [src], pstep ; load source data + ldr r4, [r2], #4 ; flimit + ldr r7, [src], pstep + ldr r2, [r3], #4 ; limit + ldr r8, [src], pstep + uadd8 r4, r4, r4 ; flimit * 2 + ldr r3, [r12], #4 ; thresh + ldr lr, [src], pstep + mov count, count, lsl #1 ; 4-in-parallel + uadd8 r4, r4, r2 ; flimit * 2 + limit + +|Vnext8| + + ; vp8_filter_mask() function + ; calculate breakout conditions + ; transpose the source data for 4-in-parallel operation + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + uqsub8 r7, r9, r10 ; p3 - p2 + uqsub8 r8, r10, r9 ; p2 - p3 + uqsub8 r9, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + orr r7, r7, r8 ; abs (p3-p2) + orr r10, r9, r10 ; abs (p2-p1) + uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask + uqsub8 r10, r10, r2 ; compare to limit + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr lr, lr, r10 + + uqsub8 r6, r11, r12 ; p1 - p0 + uqsub8 r7, r12, r11 ; p0 - p1 + add src, src, #4 ; move src pointer up by 4 + orr r6, r6, r7 ; abs (p1-p0) + str r11, [sp, #12] ; save p1 + uqsub8 r10, r6, r2 ; compare to limit + uqsub8 r11, r6, r3 ; compare to thresh + orr lr, lr, r10 + + ; transpose uses 8 regs(r6 - r12 and lr). 
Need to save reg value now + ; transpose the source data for 4-in-parallel operation + ldr r6, [src], pstep ; load source data + str r11, [sp] ; push r11 to stack + ldr r7, [src], pstep + str r12, [sp, #4] ; save current reg before load q0 - q3 data + ldr r8, [src], pstep + str lr, [sp, #8] + ldr lr, [src], pstep + + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + ldr lr, [sp, #8] ; load back (f)limit accumulator + + uqsub8 r6, r12, r11 ; q3 - q2 + uqsub8 r7, r11, r12 ; q2 - q3 + uqsub8 r12, r11, r10 ; q2 - q1 + uqsub8 r11, r10, r11 ; q1 - q2 + orr r6, r6, r7 ; abs (q3-q2) + orr r7, r12, r11 ; abs (q2-q1) + uqsub8 r6, r6, r2 ; compare to limit + uqsub8 r7, r7, r2 ; compare to limit + ldr r11, [sp, #4] ; load back p0 + ldr r12, [sp, #12] ; load back p1 + orr lr, lr, r6 + orr lr, lr, r7 + + uqsub8 r6, r11, r9 ; p0 - q0 + uqsub8 r7, r9, r11 ; q0 - p0 + uqsub8 r8, r12, r10 ; p1 - q1 + uqsub8 r11, r10, r12 ; q1 - p1 + orr r6, r6, r7 ; abs (p0-q0) + ldr r7, c0x7F7F7F7F + orr r8, r8, r11 ; abs (p1-q1) + uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 + and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r11, r10, r9 ; q1 - q0 + uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r12, r9, r10 ; q0 - q1 + uqsub8 r6, r6, r4 ; compare to flimit + + orr r9, r11, r12 ; abs (q1-q0) + uqsub8 r8, r9, r2 ; compare to limit + uqsub8 r10, r9, r3 ; compare to thresh + orr lr, lr, r6 + orr lr, lr, r8 + + mvn r11, #0 ; r11 == -1 + mov r12, #0 + + usub8 lr, r12, lr + ldr r9, [sp] ; load the compared result + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq vskip_filter ; skip filtering + + ;vp8_hevmask() function + ;calculate high edge variance + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r9, r9, r10 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + usub8 r9, r12, r9 + sel r6, r12, r11 ; hev mask: r6 + + ;vp8_filter() function + ; load soure data to r6, r11, r12, lr + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + pkhbt r12, r7, r8, lsl #16 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + pkhbt r11, r9, r10, lsl #16 + + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + ; Transpose needs 8 regs(r6 - r12, and lr). 
Save r6 and lr first + str r6, [sp] + str lr, [sp, #4] + + pkhbt r6, r7, r8, lsl #16 + pkhbt lr, r9, r10, lsl #16 + + ;transpose r12, r11, r6, lr to r7, r8, r9, r10 + TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 + + ;load back hev_mask r6 and filter_mask lr + ldr r12, c0x80808080 + ldr r6, [sp] + ldr lr, [sp, #4] + + eor r7, r7, r12 ; p1 offset to convert to a signed value + eor r8, r8, r12 ; p0 offset to convert to a signed value + eor r9, r9, r12 ; q0 offset to convert to a signed value + eor r10, r10, r12 ; q1 offset to convert to a signed value + + str r9, [sp] ; store qs0 temporarily + str r8, [sp, #4] ; store ps0 temporarily + str r10, [sp, #8] ; store qs1 temporarily + str r7, [sp, #12] ; store ps1 temporarily + + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + + and r7, r7, r6 ; vp8_filter (r7) &= hev (r7 : filter) + + qadd8 r7, r7, r8 + ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 + + qadd8 r7, r7, r8 + ldr r10, c0x04040404 + + qadd8 r7, r7, r8 + ;mvn r11, #0 ; r11 == -1 + + and r7, r7, lr ; vp8_filter &= mask + + ;modify code for vp8 -- Filter1 = vp8_filter (r7) + qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) + qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) + + mov r9, #0 + shadd8 r8 , r8 , r9 ; Filter2 >>= 3 + shadd8 r7 , r7 , r9 ; vp8_filter >>= 3 + shadd8 r8 , r8 , r9 + shadd8 r7 , r7 , r9 + shadd8 lr , r8 , r9 ; lr: filter2 + shadd8 r7 , r7 , r9 ; r7: filter + + ;usub8 lr, r8, r10 ; s = (s==4)*-1 + ;sel lr, r11, r9 + ;usub8 r8, r10, r8 + ;sel r8, r11, r9 + ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s + + ;calculate output + ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter) + + ldr r8, [sp] ; load qs0 + ldr r9, [sp, #4] ; load ps0 + + ldr r10, c0x01010101 + + qsub8 r8, r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) + qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2) + ;end of modification for vp8 + + eor r8, r8, r12 + eor r9, r9, r12 + + mov lr, #0 + + sadd8 r7, r7, r10 + shadd8 r7, r7, lr + + ldr r10, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load ps1 + + bic r7, r7, r6 ; r7: vp8_filter + + qsub8 r10 , r10, r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) + qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) + eor r10, r10, r12 + eor r11, r11, r12 + + sub src, src, pstep, lsl #2 + + ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1 + ;output is b0, b1, b2, b3 + ;b0: 03 02 01 00 + ;b1: 13 12 11 10 + ;b2: 23 22 21 20 + ;b3: 33 32 31 30 + ; p1 p0 q0 q1 + ; (a3 a2 a1 a0) + TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr + + strh r6, [src, #-2] ; store the result + mov r6, r6, lsr #16 + strh r6, [src], pstep + + strh r7, [src, #-2] + mov r7, r7, lsr #16 + strh r7, [src], pstep + + strh r12, [src, #-2] + mov r12, r12, lsr #16 + strh r12, [src], pstep + + strh lr, [src, #-2] + mov lr, lr, lsr #16 + strh lr, [src], pstep + +|vskip_filter| + sub src, src, #4 + subs count, count, #1 + + ldrne r6, [src], pstep ; load source data + ldrne r7, [src], pstep + ldrne r8, [src], pstep + ldrne lr, [src], pstep + + bne Vnext8 + + add sp, sp, #16 + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_loop_filter_vertical_edge_armv6| + + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_mbloop_filter_vertical_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, 
src, #4 ; move src pointer down by 4 + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r12, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r6, [src], pstep ; load source data + ldr r4, [r2], #4 ; flimit + ldr r7, [src], pstep + ldr r2, [r3], #4 ; limit + ldr r8, [src], pstep + uadd8 r4, r4, r4 ; flimit * 2 + ldr r3, [r12], #4 ; thresh + ldr lr, [src], pstep + mov count, count, lsl #1 ; 4-in-parallel + uadd8 r4, r4, r2 ; flimit * 2 + limit + +|MBVnext8| + ; vp8_filter_mask() function + ; calculate breakout conditions + ; transpose the source data for 4-in-parallel operation + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + uqsub8 r7, r9, r10 ; p3 - p2 + uqsub8 r8, r10, r9 ; p2 - p3 + uqsub8 r9, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + orr r7, r7, r8 ; abs (p3-p2) + orr r10, r9, r10 ; abs (p2-p1) + uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask + uqsub8 r10, r10, r2 ; compare to limit + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr lr, lr, r10 + + uqsub8 r6, r11, r12 ; p1 - p0 + uqsub8 r7, r12, r11 ; p0 - p1 + add src, src, #4 ; move src pointer up by 4 + orr r6, r6, r7 ; abs (p1-p0) + str r11, [sp, #12] ; save p1 + uqsub8 r10, r6, r2 ; compare to limit + uqsub8 r11, r6, r3 ; compare to thresh + orr lr, lr, r10 + + ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now + ; transpose the source data for 4-in-parallel operation + ldr r6, [src], pstep ; load source data + str r11, [sp] ; push r11 to stack + ldr r7, [src], pstep + str r12, [sp, #4] ; save current reg before load q0 - q3 data + ldr r8, [src], pstep + str lr, [sp, #8] + ldr lr, [src], pstep + + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + ldr lr, [sp, #8] ; load back (f)limit accumulator + + uqsub8 r6, r12, r11 ; q3 - q2 + uqsub8 r7, r11, r12 ; q2 - q3 + uqsub8 r12, r11, r10 ; q2 - q1 + uqsub8 r11, r10, r11 ; q1 - q2 + orr r6, r6, r7 ; abs (q3-q2) + orr r7, r12, r11 ; abs (q2-q1) + uqsub8 r6, r6, r2 ; compare to limit + uqsub8 r7, r7, r2 ; compare to limit + ldr r11, [sp, #4] ; load back p0 + ldr r12, [sp, #12] ; load back p1 + orr lr, lr, r6 + orr lr, lr, r7 + + uqsub8 r6, r11, r9 ; p0 - q0 + uqsub8 r7, r9, r11 ; q0 - p0 + uqsub8 r8, r12, r10 ; p1 - q1 + uqsub8 r11, r10, r12 ; q1 - p1 + orr r6, r6, r7 ; abs (p0-q0) + ldr r7, c0x7F7F7F7F + orr r8, r8, r11 ; abs (p1-q1) + uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 + and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r11, r10, r9 ; q1 - q0 + uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r12, r9, r10 ; q0 - q1 + uqsub8 r6, r6, r4 ; compare to flimit + + orr r9, r11, r12 ; abs (q1-q0) + uqsub8 r8, r9, r2 ; compare to limit + uqsub8 r10, r9, r3 ; compare to thresh + orr lr, lr, r6 + orr lr, lr, r8 + + mvn r11, #0 ; r11 == -1 + mov r12, #0 + + usub8 lr, r12, lr + ldr r9, [sp] ; load the compared result + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq mbvskip_filter ; skip filtering + + + ;vp8_hevmask() function + ;calculate high edge variance + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r9, r9, r10 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + usub8 r9, r12, r9 + sel r6, r12, r11 ; hev mask: r6 + + + ; vp8_mbfilter() function + ; p2, q2 are only needed at the end. Don't need to load them in now. + ; Transpose needs 8 regs(r6 - r12, and lr). 
Save r6 and lr first + ; load soure data to r6, r11, r12, lr + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + pkhbt r12, r7, r8, lsl #16 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + pkhbt r11, r9, r10, lsl #16 + + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + str r6, [sp] ; save r6 + str lr, [sp, #4] ; save lr + + pkhbt r6, r7, r8, lsl #16 + pkhbt lr, r9, r10, lsl #16 + + ;transpose r12, r11, r6, lr to p1, p0, q0, q1 + TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 + + ;load back hev_mask r6 and filter_mask lr + ldr r12, c0x80808080 + ldr r6, [sp] + ldr lr, [sp, #4] + + eor r7, r7, r12 ; ps1 + eor r8, r8, r12 ; ps0 + eor r9, r9, r12 ; qs0 + eor r10, r10, r12 ; qs1 + + qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + str r7, [sp, #12] ; store ps1 temporarily + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + str r10, [sp, #8] ; store qs1 temporarily + qadd8 r7, r7, r12 + str r9, [sp] ; store qs0 temporarily + qadd8 r7, r7, r12 + str r8, [sp, #4] ; store ps0 temporarily + qadd8 r7, r7, r12 ; vp8_filter: r7 + + ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 + ldr r9, c0x04040404 + ;mvn r11, #0 ; r11 == -1 + + and r7, r7, lr ; vp8_filter &= mask (lr is free) + + mov r12, r7 ; Filter2: r12 + and r12, r12, r6 ; Filter2 &= hev + + ;modify code for vp8 + ;save bottom 3 bits so that we round one side +4 and the other +3 + qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) + qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) + + mov r10, #0 + shadd8 r8 , r8 , r10 ; Filter1 >>= 3 + shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + shadd8 r8 , r8 , r10 + shadd8 r12 , r12 , r10 + shadd8 r8 , r8 , r10 ; r8: Filter1 + shadd8 r12 , r12 , r10 ; r12: Filter2 + + ldr r9, [sp] ; load qs0 + ldr r11, [sp, #4] ; load ps0 + + qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) + qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) + + ;save bottom 3 bits so that we round one side +4 and the other +3 + ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) + ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4) + ;mov r10, #0 + ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + ;usub8 lr, r8, r9 ; s = (s==4)*-1 + ;sel lr, r11, r10 + ;shadd8 r12 , r12 , r10 + ;usub8 r8, r9, r8 + ;sel r8, r11, r10 + ;ldr r9, [sp] ; load qs0 + ;ldr r11, [sp, #4] ; load ps0 + ;shadd8 r12 , r12 , r10 + ;and r8, r8, lr ; -1 for each element that equals 4 + ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) + ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) + ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) + + ;end of modification for vp8 + + bic r12, r7, r6 ;vp8_filter &= ~hev ( r6 is free) + ;mov r12, r7 + + ;roughly 3/7th difference across boundary + mov lr, #0x1b ; 27 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r7, r10, lr, r7 + smultb r10, r10, lr + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + add r10, r10, #63 + ssat r7, #8, r7, asr #7 + ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r7, r10, lsl #16 + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) + + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) + qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) + eor r8, r8, lr ; *oq0 = s^0x80 + eor r10, r10, lr ; *op0 = s^0x80 + + strb 
r10, [src, #-1] ; store op0 result + strb r8, [src], pstep ; store oq0 result + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + + ;roughly 2/7th difference across boundary + mov lr, #0x12 ; 18 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r9, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r9, #8, r9, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r9, r10, lsl #16 + + ldr r9, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load ps1 + ldr lr, c0x80808080 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + add src, src, #2 + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) + qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) + eor r8, r8, lr ; *oq1 = s^0x80 + eor r10, r10, lr ; *op1 = s^0x80 + + ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary + strb r10, [src, #-4] ; store op1 + strb r8, [src, #-1] ; store oq1 + ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + + ldrb r6, [src, #-5] + strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + orr r11, r11, r6, lsl #8 + orr r9, r9, r7, lsl #8 + + ldrb r6, [src, #-5] + strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + orr r11, r11, r6, lsl #16 + orr r9, r9, r7, lsl #16 + + ldrb r6, [src, #-5] + strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + orr r11, r11, r6, lsl #24 + orr r9, r9, r7, lsl #24 + + ;roughly 1/7th difference across boundary + eor r9, r9, lr + eor r11, r11, lr + + mov lr, #0x9 ; 9 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r12, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r12, #8, r12, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r12, r10, lsl #16 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + ldr lr, c0x80808080 + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + + qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) + qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) + eor r8, r8, lr ; *op2 = s^0x80 + eor r10, r10, lr ; *oq2 = s^0x80 + + strb r8, [src, #-5] ; store *op2 + strb r10, [src], pstep ; store *oq2 + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + + ;adjust src pointer for next loop + sub src, src, #2 + +|mbvskip_filter| + sub src, src, #4 + subs count, count, #1 + + ldrne r6, [src], pstep ; load source data + ldrne r7, [src], pstep + ldrne r8, [src], pstep + ldrne lr, [src], pstep + + bne MBVnext8 + + add sp, sp, #16 + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_mbloop_filter_vertical_edge_armv6| + +; Constant Pool +c0x80808080 DCD 
0x80808080 +c0x03030303 DCD 0x03030303 +c0x04040404 DCD 0x04040404 +c0x01010101 DCD 0x01010101 +c0x7F7F7F7F DCD 0x7F7F7F7F + + END diff --git a/vp8/common/arm/armv6/recon_v6.asm b/vp8/common/arm/armv6/recon_v6.asm new file mode 100644 index 000000000..085ff80c9 --- /dev/null +++ b/vp8/common/arm/armv6/recon_v6.asm @@ -0,0 +1,280 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_recon_b_armv6| + EXPORT |vp8_recon2b_armv6| + EXPORT |vp8_recon4b_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code +prd RN r0 +dif RN r1 +dst RN r2 +stride RN r3 + +;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride) +; R0 char* pred_ptr +; R1 short * dif_ptr +; R2 char * dst_ptr +; R3 int stride + +; Description: +; Loop through the block adding the Pred and Diff together. Clamp and then +; store back into the Dst. + +; Restrictions : +; all buffers are expected to be 4 byte aligned coming in and +; going out. +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +; +; +; +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_recon_b_armv6| PROC + stmdb sp!, {r4 - r9, lr} + + ;0, 1, 2, 3 + ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 + ldr r6, [dif, #0] ; 1 | 0 + ldr r7, [dif, #4] ; 3 | 2 + + pkhbt r8, r6, r7, lsl #16 ; 2 | 0 + pkhtb r9, r7, r6, asr #16 ; 3 | 1 + + uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 + uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 + + usat16 r8, #8, r8 + usat16 r9, #8, r9 + add dif, dif, #32 + orr r8, r8, r9, lsl #8 + + str r8, [dst], stride + + ;0, 1, 2, 3 + ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 +;; ldr r6, [dif, #8] ; 1 | 0 +;; ldr r7, [dif, #12] ; 3 | 2 + ldr r6, [dif, #0] ; 1 | 0 + ldr r7, [dif, #4] ; 3 | 2 + + pkhbt r8, r6, r7, lsl #16 ; 2 | 0 + pkhtb r9, r7, r6, asr #16 ; 3 | 1 + + uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 + uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 + + usat16 r8, #8, r8 + usat16 r9, #8, r9 + add dif, dif, #32 + orr r8, r8, r9, lsl #8 + + str r8, [dst], stride + + ;0, 1, 2, 3 + ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 +;; ldr r6, [dif, #16] ; 1 | 0 +;; ldr r7, [dif, #20] ; 3 | 2 + ldr r6, [dif, #0] ; 1 | 0 + ldr r7, [dif, #4] ; 3 | 2 + + pkhbt r8, r6, r7, lsl #16 ; 2 | 0 + pkhtb r9, r7, r6, asr #16 ; 3 | 1 + + uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 + uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 + + usat16 r8, #8, r8 + usat16 r9, #8, r9 + add dif, dif, #32 + orr r8, r8, r9, lsl #8 + + str r8, [dst], stride + + ;0, 1, 2, 3 + ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 +;; ldr r6, [dif, #24] ; 1 | 0 +;; ldr r7, [dif, #28] ; 3 | 2 + ldr r6, [dif, #0] ; 1 | 0 + ldr r7, [dif, #4] ; 3 | 2 + + pkhbt r8, r6, r7, lsl #16 ; 2 | 0 + pkhtb r9, r7, r6, asr #16 ; 3 | 1 + + uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 + uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 + + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst], stride + + ldmia sp!, {r4 - r9, pc} + + ENDP ; |recon_b| + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +; +; +; +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +; R0 char *pred_ptr +; R1 short *dif_ptr +; R2 char *dst_ptr +; R3 int stride +|vp8_recon4b_armv6| 
PROC + stmdb sp!, {r4 - r9, lr} + + mov lr, #4 + +recon4b_loop + ;0, 1, 2, 3 + ldr r4, [prd], #4 ; 3 | 2 | 1 | 0 + ldr r6, [dif, #0] ; 1 | 0 + ldr r7, [dif, #4] ; 3 | 2 + + pkhbt r8, r6, r7, lsl #16 ; 2 | 0 + pkhtb r9, r7, r6, asr #16 ; 3 | 1 + + uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 + uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 + + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst] + + ;4, 5, 6, 7 + ldr r4, [prd], #4 +;; ldr r6, [dif, #32] +;; ldr r7, [dif, #36] + ldr r6, [dif, #8] + ldr r7, [dif, #12] + + pkhbt r8, r6, r7, lsl #16 + pkhtb r9, r7, r6, asr #16 + + uxtab16 r8, r8, r4 + uxtab16 r9, r9, r4, ror #8 + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst, #4] + + ;8, 9, 10, 11 + ldr r4, [prd], #4 +;; ldr r6, [dif, #64] +;; ldr r7, [dif, #68] + ldr r6, [dif, #16] + ldr r7, [dif, #20] + + pkhbt r8, r6, r7, lsl #16 + pkhtb r9, r7, r6, asr #16 + + uxtab16 r8, r8, r4 + uxtab16 r9, r9, r4, ror #8 + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst, #8] + + ;12, 13, 14, 15 + ldr r4, [prd], #4 +;; ldr r6, [dif, #96] +;; ldr r7, [dif, #100] + ldr r6, [dif, #24] + ldr r7, [dif, #28] + + pkhbt r8, r6, r7, lsl #16 + pkhtb r9, r7, r6, asr #16 + + uxtab16 r8, r8, r4 + uxtab16 r9, r9, r4, ror #8 + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst, #12] + + add dst, dst, stride +;; add dif, dif, #8 + add dif, dif, #32 + + subs lr, lr, #1 + bne recon4b_loop + + ldmia sp!, {r4 - r9, pc} + + ENDP ; |Recon4B| + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +; +; +; +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +; R0 char *pred_ptr +; R1 short *dif_ptr +; R2 char *dst_ptr +; R3 int stride +|vp8_recon2b_armv6| PROC + stmdb sp!, {r4 - r9, lr} + + mov lr, #4 + +recon2b_loop + ;0, 1, 2, 3 + ldr r4, [prd], #4 + ldr r6, [dif, #0] + ldr r7, [dif, #4] + + pkhbt r8, r6, r7, lsl #16 + pkhtb r9, r7, r6, asr #16 + + uxtab16 r8, r8, r4 + uxtab16 r9, r9, r4, ror #8 + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst] + + ;4, 5, 6, 7 + ldr r4, [prd], #4 +;; ldr r6, [dif, #32] +;; ldr r7, [dif, #36] + ldr r6, [dif, #8] + ldr r7, [dif, #12] + + pkhbt r8, r6, r7, lsl #16 + pkhtb r9, r7, r6, asr #16 + + uxtab16 r8, r8, r4 + uxtab16 r9, r9, r4, ror #8 + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst, #4] + + add dst, dst, stride +;; add dif, dif, #8 + add dif, dif, #16 + + subs lr, lr, #1 + bne recon2b_loop + + ldmia sp!, {r4 - r9, pc} + + ENDP ; |Recon2B| + + END diff --git a/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/vp8/common/arm/armv6/simpleloopfilter_v6.asm new file mode 100644 index 000000000..15c6c7d16 --- /dev/null +++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm @@ -0,0 +1,321 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6| + EXPORT |vp8_loop_filter_simple_vertical_edge_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code + + MACRO + TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 + ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 + ; a0: 03 02 01 00 + ; a1: 13 12 11 10 + ; a2: 23 22 21 20 + ; a3: 33 32 31 30 + ; b3 b2 b1 b0 + + uxtb16 $b1, $a1 ; xx 12 xx 10 + uxtb16 $b0, $a0 ; xx 02 xx 00 + uxtb16 $b3, $a3 ; xx 32 xx 30 + uxtb16 $b2, $a2 ; xx 22 xx 20 + orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 + orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 + + uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 + uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 + uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 + uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 + orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 + orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 + + pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 + pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 + + pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 + pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 + MEND + + +src RN r0 +pstep RN r1 + +;r0 unsigned char *src_ptr, +;r1 int src_pixel_step, +;r2 const char *flimit, +;r3 const char *limit, +;stack const char *thresh, +;stack int count + +;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_loop_filter_simple_horizontal_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, pstep, lsl #1 ; move src pointer down by 2 lines + + ldr r12, [r3], #4 ; limit + ldr r3, [src], pstep ; p1 + + ldr r9, [sp, #36] ; count for 8-in-parallel + ldr r4, [src], pstep ; p0 + + ldr r7, [r2], #4 ; flimit + ldr r5, [src], pstep ; q0 + ldr r2, c0x80808080 + + ldr r6, [src] ; q1 + + uadd8 r7, r7, r7 ; flimit * 2 + mov r9, r9, lsl #1 ; 4-in-parallel + uadd8 r12, r7, r12 ; flimit * 2 + limit + +|simple_hnext8| + ; vp8_simple_filter_mask() function + + uqsub8 r7, r3, r6 ; p1 - q1 + uqsub8 r8, r6, r3 ; q1 - p1 + uqsub8 r10, r4, r5 ; p0 - q0 + uqsub8 r11, r5, r4 ; q0 - p0 + orr r8, r8, r7 ; abs(p1 - q1) + ldr lr, c0x7F7F7F7F ; 01111111 mask + orr r10, r10, r11 ; abs(p0 - q0) + and r8, lr, r8, lsr #1 ; abs(p1 - q1) / 2 + uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2 + mvn lr, #0 ; r10 == -1 + uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2 + ; STALL waiting on r10 :( + uqsub8 r10, r10, r12 ; compare to flimit + mov r8, #0 + + usub8 r10, r8, r10 ; use usub8 instead of ssub8 + ; STALL (maybe?) when are flags set? 
:/
+    sel         r10, lr, r8                 ; filter mask: r10
+
+    cmp         r10, #0
+    beq         simple_hskip_filter         ; skip filtering
+
+    ;vp8_simple_filter() function
+
+    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
+    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
+    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
+    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
+
+    qsub8       r3, r3, r6                  ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1)
+    qsub8       r6, r5, r4                  ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0))
+
+    qadd8       r3, r3, r6
+    ldr         r8, c0x03030303             ; r8 = 3
+
+    qadd8       r3, r3, r6
+    ldr         r7, c0x04040404
+
+    qadd8       r3, r3, r6
+    and         r3, r3, r10                 ; vp8_filter &= mask
+
+    ;save bottom 3 bits so that we round one side +4 and the other +3
+    qadd8       r8, r3, r8                  ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+    qadd8       r3, r3, r7                  ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)
+
+    mov         r7, #0
+    shadd8      r8, r8, r7                  ; Filter2 >>= 3
+    shadd8      r3, r3, r7                  ; Filter1 >>= 3
+    shadd8      r8, r8, r7
+    shadd8      r3, r3, r7
+    shadd8      r8, r8, r7                  ; r8: Filter2
+    shadd8      r3, r3, r7                  ; r3: Filter1
+
+    ;calculate output
+    sub         src, src, pstep, lsl #1
+
+    qadd8       r4, r4, r8                  ; u = vp8_signed_char_clamp(p0 + Filter2)
+    qsub8       r5, r5, r3                  ; u = vp8_signed_char_clamp(q0 - Filter1)
+    eor         r4, r4, r2                  ; *op0 = u^0x80
+    str         r4, [src], pstep            ; store op0 result
+    eor         r5, r5, r2                  ; *oq0 = u^0x80
+    str         r5, [src], pstep            ; store oq0 result
+
+|simple_hskip_filter|
+    add         src, src, #4
+    sub         src, src, pstep
+    sub         src, src, pstep, lsl #1
+
+    subs        r9, r9, #1
+
+    ;pld            [src]
+    ;pld            [src, pstep]
+    ;pld            [src, pstep, lsl #1]
+
+    ldrne       r3, [src], pstep            ; p1
+    ldrne       r4, [src], pstep            ; p0
+    ldrne       r5, [src], pstep            ; q0
+    ldrne       r6, [src]                   ; q1
+
+    bne         simple_hnext8
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_loop_filter_simple_horizontal_edge_armv6|
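+
+; For reference, a scalar sketch of the per-pixel operation that both simple
+; loop filter routines implement four pixels at a time (illustrative only, not
+; part of the build; sclamp() stands for the signed-char saturation performed
+; by qadd8/qsub8, and p1/p0/q0/q1 are the 0x80-offset signed values produced
+; by the eor instructions, converted back with ^0x80 before the stores):
+;
+;     mask = (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= 2 * flimit + limit) ? ~0 : 0;
+;     f = sclamp(p1 - q1);
+;     d = sclamp(q0 - p0);
+;     f = sclamp(f + d);  f = sclamp(f + d);  f = sclamp(f + d);
+;     f &= mask;
+;     q0 = sclamp(q0 - (sclamp(f + 4) >> 3));    /* Filter1 */
+;     p0 = sclamp(p0 + (sclamp(f + 3) >> 3));    /* Filter2 */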
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_simple_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    ldr         r12, [r2], #4               ; r12: flimit
+    ldr         r2, c0x80808080
+    ldr         r7, [r3], #4                ; limit
+
+    ; load source data to r7, r8, r9, r10
+    ldrh        r3, [src, #-2]
+    ldrh        r4, [src], pstep
+    uadd8       r12, r12, r12               ; flimit * 2
+
+    ldrh        r5, [src, #-2]
+    ldrh        r6, [src], pstep
+    uadd8       r12, r12, r7                ; flimit * 2 + limit
+
+    pkhbt       r7, r3, r4, lsl #16
+
+    ldrh        r3, [src, #-2]
+    ldrh        r4, [src], pstep
+    ldr         r11, [sp, #40]              ; count (r11) for 8-in-parallel
+
+    pkhbt       r8, r5, r6, lsl #16
+
+    ldrh        r5, [src, #-2]
+    ldrh        r6, [src], pstep
+    mov         r11, r11, lsl #1            ; 4-in-parallel
+
+|simple_vnext8|
+    ; vp8_simple_filter_mask() function
+    pkhbt       r9, r3, r4, lsl #16
+    pkhbt       r10, r5, r6, lsl #16
+
+    ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
+    TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
+
+    uqsub8      r7, r3, r6                  ; p1 - q1
+    uqsub8      r8, r6, r3                  ; q1 - p1
+    uqsub8      r9, r4, r5                  ; p0 - q0
+    uqsub8      r10, r5, r4                 ; q0 - p0
+    orr         r7, r7, r8                  ; abs(p1 - q1)
+    orr         r9, r9, r10                 ; abs(p0 - q0)
+    ldr         lr, c0x7F7F7F7F             ; 0111 1111 mask
+    uqadd8      r9, r9, r9                  ; abs(p0 - q0) * 2
+    and         r7, lr, r7, lsr #1          ; abs(p1 - q1) / 2
+    mov         r8, #0
+    uqadd8      r7, r7, r9                  ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+    mvn         r10, #0                     ; r10 == -1
+    uqsub8      r7, r7, r12                 ; compare to flimit
+
+    usub8       r7, r8, r7
+    sel         lr, r10, r8                 ; filter mask: lr
+
+    cmp         lr, #0
+    beq         simple_vskip_filter         ; skip filtering
+
+    ;vp8_simple_filter() function
+    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
+    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
+    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
+    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
+
+    qsub8       r3, r3, r6                  ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1)
+    qsub8       r6, r5, r4                  ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0))
+
+    qadd8       r3, r3, r6
+    ldr         r8, c0x03030303             ; r8 = 3
+
+    qadd8       r3, r3, r6
+    ldr         r7, c0x04040404
+
+    qadd8       r3, r3, r6
+    and         r3, r3, lr                  ; vp8_filter &= mask
+
+    ;save bottom 3 bits so that we round one side +4 and the other +3
+    qadd8       r8, r3, r8                  ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+    qadd8       r3, r3, r7                  ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)
+
+    mov         r7, #0
+    shadd8      r8, r8, r7                  ; Filter2 >>= 3
+    shadd8      r3, r3, r7                  ; Filter1 >>= 3
+    shadd8      r8, r8, r7
+    shadd8      r3, r3, r7
+    shadd8      r8, r8, r7                  ; r8: Filter2
+    shadd8      r3, r3, r7                  ; r3: Filter1
+
+    ;calculate output
+    sub         src, src, pstep, lsl #2
+
+    qadd8       r4, r4, r8                  ; u = vp8_signed_char_clamp(p0 + Filter2)
+    qsub8       r5, r5, r3                  ; u = vp8_signed_char_clamp(q0 - Filter1)
+    eor         r4, r4, r2                  ; *op0 = u^0x80
+    eor         r5, r5, r2                  ; *oq0 = u^0x80
+
+    strb        r4, [src, #-1]              ; store the result
+    mov         r4, r4, lsr #8
+    strb        r5, [src], pstep
+    mov         r5, r5, lsr #8
+
+    strb        r4, [src, #-1]
+    mov         r4, r4, lsr #8
+    strb        r5, [src], pstep
+    mov         r5, r5, lsr #8
+
+    strb        r4, [src, #-1]
+    mov         r4, r4, lsr #8
+    strb        r5, [src], pstep
+    mov         r5, r5, lsr #8
+
+    strb        r4, [src, #-1]
+    strb        r5, [src], pstep
+
+|simple_vskip_filter|
+    subs        r11, r11, #1
+
+    ;pld            [src]
+    ;pld            [src, pstep]
+    ;pld            [src, pstep, lsl #1]
+
+    ; load source data to r7, r8, r9, r10
+    ldrneh      r3, [src, #-2]
+    ldrneh      r4, [src], pstep
+
+    ldrneh      r5, [src, #-2]
+    ldrneh      r6, [src], pstep
+
+    pkhbt       r7, r3, r4, lsl #16
+
+    ldrneh      r3, [src, #-2]
+    ldrneh      r4, [src], pstep
+
+    pkhbt       r8, r5, r6, lsl #16
+
+    ldrneh      r5, [src, #-2]
+    ldrneh      r6, [src], pstep
+
+    bne         simple_vnext8
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_loop_filter_simple_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD     0x80808080
+c0x03030303 DCD     0x03030303
+c0x04040404 DCD     0x04040404
+c0x01010101 DCD     0x01010101
+c0x7F7F7F7F DCD     0x7F7F7F7F
+
+    END
diff --git a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
new file mode 100644
index 000000000..551d863e9
--- /dev/null
+++ b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -0,0 +1,277 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sixtap_predict8x4_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+;-------------------------------------
+; r0    unsigned char *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack unsigned char *dst_ptr,
+; stack int  dst_pitch
+;-------------------------------------
+;note: In the first pass, the result is stored transposed (8 lines x 9 columns) on the stack;
+;the temporary stack size is 184 bytes, with a line width of 20 bytes: 9 shorts of data plus
+;2 bytes of padding for 4-byte alignment. In the second pass, the data is loaded back from
+;the stack and the result is again stored transposed.
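+;
+; For reference, each pass computes the standard VP8 six-tap convolution; a
+; scalar sketch of one first-pass output is given below (illustrative only,
+; not part of the build; f[] is the selected row of the coefficient table,
+; whose packed words hold the same taps as the commented-out table at the end
+; of this file):
+;
+;     sum = src[c-2]*f[0] + src[c-1]*f[1] + src[c]*f[2]
+;         + src[c+1]*f[3] + src[c+2]*f[4] + src[c+3]*f[5];
+;     temp[c][r] = clamp8((sum + 64) >> 7);    ; stored transposed as shorts
+;
+; The second pass runs the same kernel down the transposed buffer, so both
+; passes can use the dual 16-bit multiply-accumulate instructions (smuad,
+; smlad, smladx) on horizontally adjacent data.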
+|vp8_sixtap_predict8x4_armv6| PROC
+    stmdb       sp!, {r4 - r11, lr}
+    sub         sp, sp, #184                ;reserve space on stack for temporary storage: 20x(8+1) +4
+
+    cmp         r2, #0                      ;skip first_pass filter if xoffset=0
+    str         r3, [sp], #4                ;store yoffset
+    beq         skip_firstpass_filter
+
+;first-pass filter
+    ldr         r12, _filter8_coeff_
+    sub         r0, r0, r1, lsl #1
+
+    add         r2, r12, r2, lsl #4         ;calculate filter location
+    add         r0, r0, #3                  ;adjust src only for loading convenience
+
+    ldr         r3, [r2]                    ; load up packed filter coefficients
+    ldr         r4, [r2, #4]
+    ldr         r5, [r2, #8]
+
+    mov         r2, #0x90000                ; height=9 is top part of counter
+
+    sub         r1, r1, #8
+    mov         lr, #20
+
+|first_pass_hloop_v6|
+    ldrb        r6, [r0, #-5]               ; load source data
+    ldrb        r7, [r0, #-4]
+    ldrb        r8, [r0, #-3]
+    ldrb        r9, [r0, #-2]
+    ldrb        r10, [r0, #-1]
+
+    orr         r2, r2, #0x4                ; construct loop counter. width=8=4x2
+
+    pkhbt       r6, r6, r7, lsl #16         ; r7 | r6
+    pkhbt       r7, r7, r8, lsl #16         ; r8 | r7
+
+    pkhbt       r8, r8, r9, lsl #16         ; r9 | r8
+    pkhbt       r9, r9, r10, lsl #16        ; r10 | r9
+
+|first_pass_wloop_v6|
+    smuad       r11, r6, r3                 ; vp8_filter[0], vp8_filter[1]
+    smuad       r12, r7, r3
+
+    ldrb        r6, [r0], #1
+
+    smlad       r11, r8, r4, r11            ; vp8_filter[2], vp8_filter[3]
+    ldrb        r7, [r0], #1
+    smlad       r12, r9, r4, r12
+
+    pkhbt       r10, r10, r6, lsl #16       ; r10 | r9
+    pkhbt       r6, r6, r7, lsl #16         ; r11 | r10
+    smlad       r11, r10, r5, r11           ; vp8_filter[4], vp8_filter[5]
+    smlad       r12, r6, r5, r12
+
+    sub         r2, r2, #1
+
+    add         r11, r11, #0x40             ; round_shift_and_clamp
+    tst         r2, #0xff                   ; test loop counter
+    usat        r11, #8, r11, asr #7
+    add         r12, r12, #0x40
+    strh        r11, [sp], lr               ; result is transposed and stored
+    usat        r12, #8, r12, asr #7
+
+    strh        r12, [sp], lr
+
+    movne       r11, r6
+    movne       r12, r7
+
+    movne       r6, r8
+    movne       r7, r9
+    movne       r8, r10
+    movne       r9, r11
+    movne       r10, r12
+
+    bne         first_pass_wloop_v6
+
+    ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines
+    ;;IF ARCHITECTURE=6
+    ;pld        [src, ppl]
+    ;;pld       [src, r9]
+    ;;ENDIF
+
+    subs        r2, r2, #0x10000
+
+    mov         r6, #158
+    sub         sp, sp, r6
+
+    add         r0, r0, r1                  ; move to next input line
+
+    bne         first_pass_hloop_v6
+
+;second pass filter
+secondpass_filter
+    mov         r1, #18
+    sub         sp, sp, r1                  ; 18+4
+
+    ldr         r3, [sp, #-4]               ; load back yoffset
+    ldr         r0, [sp, #216]              ; load dst address from stack 180+36
+    ldr         r1, [sp, #220]              ; load dst stride from stack 180+40
+
+    cmp         r3, #0
+    beq         skip_secondpass_filter
+
+    ldr         r12, _filter8_coeff_
+    add         lr, r12, r3, lsl #4         ;calculate filter location
+
+    mov         r2, #0x00080000
+
+    ldr         r3, [lr]                    ; load up packed filter coefficients
+    ldr         r4, [lr, #4]
+    ldr         r5, [lr, #8]
+
+    pkhbt       r12, r4, r3                 ; pack the filter differently
+    pkhbt       r11, r5, r4
+
+second_pass_hloop_v6
+    ldr         r6, [sp]                    ; load the data
+    ldr         r7, [sp, #4]
+
+    orr         r2, r2, #2                  ; loop counter
+
+second_pass_wloop_v6
+    smuad       lr, r3, r6                  ; apply filter
+    smulbt      r10, r3, r6
+
+    ldr         r8, [sp, #8]
+
+    smlad       lr, r4, r7, lr
+    smladx      r10, r12, r7, r10
+
+    ldrh        r9, [sp, #12]
+
+    smlad       lr, r5, r8, lr
+    smladx      r10, r11, r8, r10
+
+    add         sp, sp, #4
+    smlatb      r10, r5, r9, r10
+
+    sub         r2, r2, #1
+
+    add         lr, lr, #0x40               ; round_shift_and_clamp
+    tst         r2, #0xff
+    usat        lr, #8, lr, asr #7
+    add         r10, r10, #0x40
+    strb        lr, [r0], r1                ; the result is transposed back and stored
+    usat        r10, #8, r10, asr #7
+
+    strb        r10, [r0], r1
+
+    movne       r6, r7
+    movne       r7, r8
+
+    bne         second_pass_wloop_v6
+
+    subs        r2, r2, #0x10000
+    add         sp, sp, #12                 ; update src for next loop (20-8)
+    sub         r0, r0, r1, lsl #2
+    add         r0, r0, #1
+
+    bne         second_pass_hloop_v6
+
+    add         sp, sp, #20
+    ldmia       sp!, {r4 - r11, pc}
+
+;--------------------
+skip_firstpass_filter
+    sub         r0, r0, r1, lsl #1
+ sub r1, r1, #8 + mov r2, #9 + mov r3, #20 + +skip_firstpass_hloop + ldrb r4, [r0], #1 ; load data + subs r2, r2, #1 + ldrb r5, [r0], #1 + strh r4, [sp], r3 ; store it to immediate buffer + ldrb r6, [r0], #1 ; load data + strh r5, [sp], r3 + ldrb r7, [r0], #1 + strh r6, [sp], r3 + ldrb r8, [r0], #1 + strh r7, [sp], r3 + ldrb r9, [r0], #1 + strh r8, [sp], r3 + ldrb r10, [r0], #1 + strh r9, [sp], r3 + ldrb r11, [r0], #1 + strh r10, [sp], r3 + add r0, r0, r1 ; move to next input line + strh r11, [sp], r3 + + mov r4, #158 + sub sp, sp, r4 ; move over to next column + bne skip_firstpass_hloop + + b secondpass_filter + +;-------------------- +skip_secondpass_filter + mov r2, #8 + add sp, sp, #4 ;start from src[0] instead of src[-2] + +skip_secondpass_hloop + ldr r6, [sp], #4 + subs r2, r2, #1 + ldr r8, [sp], #4 + + mov r7, r6, lsr #16 ; unpack + strb r6, [r0], r1 + mov r9, r8, lsr #16 + strb r7, [r0], r1 + add sp, sp, #12 ; 20-8 + strb r8, [r0], r1 + strb r9, [r0], r1 + + sub r0, r0, r1, lsl #2 + add r0, r0, #1 + + bne skip_secondpass_hloop + + add sp, sp, #16 ; 180 - (160 +4) + + ldmia sp!, {r4 - r11, pc} + + ENDP + +;----------------- + AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_filter8_coeff_ + DCD filter8_coeff +filter8_coeff + DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000 + DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000 + DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000 + DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000 + DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000 + DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000 + DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000 + DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000 + + ;DCD 0, 0, 128, 0, 0, 0 + ;DCD 0, -6, 123, 12, -1, 0 + ;DCD 2, -11, 108, 36, -8, 1 + ;DCD 0, -9, 93, 50, -6, 0 + ;DCD 3, -16, 77, 77, -16, 3 + ;DCD 0, -6, 50, 93, -9, 0 + ;DCD 1, -8, 36, 108, -11, 2 + ;DCD 0, -1, 12, 123, -6, 0 + + END diff --git a/vp8/common/arm/bilinearfilter_arm.c b/vp8/common/arm/bilinearfilter_arm.c new file mode 100644 index 000000000..bf972a3bc --- /dev/null +++ b/vp8/common/arm/bilinearfilter_arm.c @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include <math.h> +#include "subpixel.h" + +#define BLOCK_HEIGHT_WIDTH 4 +#define VP8_FILTER_WEIGHT 128 +#define VP8_FILTER_SHIFT 7 + +static const short bilinear_filters[8][2] = +{ + { 128, 0 }, + { 112, 16 }, + { 96, 32 }, + { 80, 48 }, + { 64, 64 }, + { 48, 80 }, + { 32, 96 }, + { 16, 112 } +}; + + +extern void vp8_filter_block2d_bil_first_pass_armv6 +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); + +extern void vp8_filter_block2d_bil_second_pass_armv6 +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int output_pitch, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); + +/* +void vp8_filter_block2d_bil_first_pass_6 +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +) +{ + unsigned int i, j; + + for ( i=0; i<output_height; i++ ) + { + for ( j=0; j<output_width; j++ ) + { + // Apply bilinear filter + output_ptr[j] = ( ( (int)src_ptr[0] * vp8_filter[0]) + + ((int)src_ptr[1] * vp8_filter[1]) + + (VP8_FILTER_WEIGHT/2) ) >> VP8_FILTER_SHIFT; + src_ptr++; + } + + // Next row... + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +void vp8_filter_block2d_bil_second_pass_6 +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int output_pitch, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +) +{ + unsigned int i,j; + int Temp; + + for ( i=0; i<output_height; i++ ) + { + for ( j=0; j<output_width; j++ ) + { + // Apply filter + Temp = ((int)src_ptr[0] * vp8_filter[0]) + + ((int)src_ptr[output_width] * vp8_filter[1]) + + (VP8_FILTER_WEIGHT/2); + output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT); + src_ptr++; + } + + // Next row... + //src_ptr += src_pixels_per_line - output_width; + output_ptr += output_pitch; + } +} +*/ + +void vp8_filter_block2d_bil_armv6 +( + unsigned char *src_ptr, + unsigned char *output_ptr, + unsigned int src_pixels_per_line, + unsigned int dst_pitch, + const short *HFilter, + const short *VFilter, + int Width, + int Height +) +{ + + unsigned short FData[36*16]; // Temp data bufffer used in filtering + + // First filter 1-D horizontally... + // pixel_step = 1; + vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pixels_per_line, Height + 1, Width, HFilter); + + // then 1-D vertically... 
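+    /* Note: the first pass above produced Height + 1 intermediate rows
+       because this vertical 2-tap pass reads two adjacent rows (rows i and
+       i + 1) to produce output row i. */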
+ vp8_filter_block2d_bil_second_pass_armv6(FData, output_ptr, dst_pitch, Height, Width, VFilter); +} + + +void vp8_bilinear_predict4x4_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = bilinear_filters[xoffset]; + VFilter = bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4); +} + +void vp8_bilinear_predict8x8_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = bilinear_filters[xoffset]; + VFilter = bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8); +} + +void vp8_bilinear_predict8x4_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = bilinear_filters[xoffset]; + VFilter = bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4); +} + +void vp8_bilinear_predict16x16_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = bilinear_filters[xoffset]; + VFilter = bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); +} diff --git a/vp8/common/arm/filter_arm.c b/vp8/common/arm/filter_arm.c new file mode 100644 index 000000000..2a4640cae --- /dev/null +++ b/vp8/common/arm/filter_arm.c @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */
+
+
+#include "vpx_ports/config.h"
+#include <math.h>
+#include "subpixel.h"
+#include "vpx_ports/mem.h"
+
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP8_FILTER_WEIGHT 128
+#define VP8_FILTER_SHIFT  7
+
+DECLARE_ALIGNED(16, static const short, sub_pel_filters[8][6]) =
+{
+    { 0,   0, 128,   0,   0, 0 },  // note that 1/8 pel positions are just as per alpha -0.5 bicubic
+    { 0,  -6, 123,  12,  -1, 0 },
+    { 2, -11, 108,  36,  -8, 1 },  // New 1/4 pel 6 tap filter
+    { 0,  -9,  93,  50,  -6, 0 },
+    { 3, -16,  77,  77, -16, 3 },  // New 1/2 pel 6 tap filter
+    { 0,  -6,  50,  93,  -9, 0 },
+    { 1,  -8,  36, 108, -11, 2 },  // New 1/4 pel 6 tap filter
+    { 0,  -1,  12, 123,  -6, 0 },
+};
+
+
+extern void vp8_filter_block2d_first_pass_armv6
+(
+    unsigned char *src_ptr,
+    short         *output_ptr,
+    unsigned int   src_pixels_per_line,
+    unsigned int   output_width,
+    unsigned int   output_height,
+    const short   *vp8_filter
+);
+
+extern void vp8_filter_block2d_second_pass_armv6
+(
+    short         *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int   output_pitch,
+    unsigned int   cnt,
+    const short   *vp8_filter
+);
+
+extern void vp8_filter_block2d_first_pass_only_armv6
+(
+    unsigned char *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int   src_pixels_per_line,
+    unsigned int   cnt,
+    unsigned int   output_pitch,
+    const short   *vp8_filter
+);
+
+
+extern void vp8_filter_block2d_second_pass_only_armv6
+(
+    unsigned char *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int   src_pixels_per_line,
+    unsigned int   cnt,
+    unsigned int   output_pitch,
+    const short   *vp8_filter
+);
+
+#if HAVE_ARMV6
+void vp8_sixtap_predict_armv6
+(
+    unsigned char *src_ptr,
+    int            src_pixels_per_line,
+    int            xoffset,
+    int            yoffset,
+    unsigned char *dst_ptr,
+    int            dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); // Temporary data buffer used in filtering
+
+
+    HFilter = sub_pel_filters[xoffset];  // 6 tap
+    VFilter = sub_pel_filters[yoffset];  // 6 tap
+
+    // Vfilter is null. First pass only
+    if (xoffset && !yoffset)
+    {
+        //vp8_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
+        //vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );
+
+        vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
+    }
+    // Hfilter is null. Second pass only
+    else if (!xoffset && yoffset)
+    {
+        vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
+    }
+    else
+    {
+        // Vfilter is a 4 tap filter
+        if (yoffset & 0x1)
+            vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
+        // Vfilter is 6 tap filter
+        else
+            vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
+
+        vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+    }
+}
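+
+/* The first-pass heights used above and below follow from the support of the
+ * vertical filter: odd yoffsets select kernels whose outer two taps are zero
+ * (see sub_pel_filters[]), so a 4x4 block needs only 4 + 3 = 7 intermediate
+ * rows starting one line above the block, while even, non-zero yoffsets use
+ * all six taps and need 4 + 5 = 9 rows starting two lines above. The same
+ * reasoning gives 11/13 rows for 8x8 and 19/21 for 16x16.
+ */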
+
+/*
+void vp8_sixtap_predict8x4_armv6
+(
+    unsigned char *src_ptr,
+    int            src_pixels_per_line,
+    int            xoffset,
+    int            yoffset,
+    unsigned char *dst_ptr,
+    int            dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); // Temporary data buffer used in filtering
+
+    HFilter = sub_pel_filters[xoffset];  // 6 tap
+    VFilter = sub_pel_filters[yoffset];  // 6 tap
+
+
+//  if (xoffset && !yoffset)
+//  {
+//      vp8_filter_block2d_first_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter );
+//  }
+    // Hfilter is null. Second pass only
+//  else if (!xoffset && yoffset)
+//  {
+//      vp8_filter_block2d_second_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter );
+//  }
+//  else
+//  {
+//      if (yoffset & 0x1)
+        //  vp8_filter_block2d_first_pass_armv6 ( src_ptr-src_pixels_per_line, FData+1, src_pixels_per_line, 8, 7, HFilter );
+        //  else
+
+        vp8_filter_block2d_first_pass_armv6 ( src_ptr-(2*src_pixels_per_line), FData, src_pixels_per_line, 8, 9, HFilter );
+
+        vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, 8, VFilter );
+//  }
+}
+*/
+
+void vp8_sixtap_predict8x8_armv6
+(
+    unsigned char *src_ptr,
+    int            src_pixels_per_line,
+    int            xoffset,
+    int            yoffset,
+    unsigned char *dst_ptr,
+    int            dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); // Temporary data buffer used in filtering
+
+    HFilter = sub_pel_filters[xoffset];  // 6 tap
+    VFilter = sub_pel_filters[yoffset];  // 6 tap
+
+    if (xoffset && !yoffset)
+    {
+        vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
+    }
+    // Hfilter is null. Second pass only
+    else if (!xoffset && yoffset)
+    {
+        vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
+    }
+    else
+    {
+        if (yoffset & 0x1)
+            vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
+        else
+            vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
+
+        vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+    }
+}
+
+
+void vp8_sixtap_predict16x16_armv6
+(
+    unsigned char *src_ptr,
+    int            src_pixels_per_line,
+    int            xoffset,
+    int            yoffset,
+    unsigned char *dst_ptr,
+    int            dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16); // Temporary data buffer used in filtering
+
+    HFilter = sub_pel_filters[xoffset];  // 6 tap
+    VFilter = sub_pel_filters[yoffset];  // 6 tap
+
+    if (xoffset && !yoffset)
+    {
+        vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
+    }
+    // Hfilter is null. Second pass only
+    else if (!xoffset && yoffset)
+    {
+        vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
+    }
+    else
+    {
+        if (yoffset & 0x1)
+            vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
+        else
+            vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
+
+        vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+    }
+
+}
+#endif
diff --git a/vp8/common/arm/idct_arm.h b/vp8/common/arm/idct_arm.h
new file mode 100644
index 000000000..f9ed21e0d
--- /dev/null
+++ b/vp8/common/arm/idct_arm.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */ + + +#ifndef IDCT_ARM_H +#define IDCT_ARM_H + +#if HAVE_ARMV6 +extern prototype_idct(vp8_short_idct4x4llm_1_v6); +extern prototype_idct(vp8_short_idct4x4llm_v6_dual); +extern prototype_idct_scalar(vp8_dc_only_idct_armv6); +extern prototype_second_order(vp8_short_inv_walsh4x4_1_armv6); +extern prototype_second_order(vp8_short_inv_walsh4x4_armv6); + +#undef vp8_idct_idct1 +#define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6 + +#undef vp8_idct_idct16 +#define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual + +#undef vp8_idct_idct1_scalar +#define vp8_idct_idct1_scalar vp8_dc_only_idct_armv6 + +#undef vp8_idct_iwalsh1 +#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_armv6 + +#undef vp8_idct_iwalsh16 +#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_armv6 +#endif + +#if HAVE_ARMV7 +extern prototype_idct(vp8_short_idct4x4llm_1_neon); +extern prototype_idct(vp8_short_idct4x4llm_neon); +extern prototype_idct_scalar(vp8_dc_only_idct_neon); +extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon); +extern prototype_second_order(vp8_short_inv_walsh4x4_neon); + +#undef vp8_idct_idct1 +#define vp8_idct_idct1 vp8_short_idct4x4llm_1_neon + +#undef vp8_idct_idct16 +#define vp8_idct_idct16 vp8_short_idct4x4llm_neon + +#undef vp8_idct_idct1_scalar +#define vp8_idct_idct1_scalar vp8_dc_only_idct_neon + +#undef vp8_idct_iwalsh1 +#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon + +#undef vp8_idct_iwalsh16 +#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon +#endif + +#endif diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c new file mode 100644 index 000000000..fa7c62617 --- /dev/null +++ b/vp8/common/arm/loopfilter_arm.c @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "vpx_ports/config.h" +#include <math.h> +#include "loopfilter.h" +#include "onyxc_int.h" + +typedef void loop_filter_uvfunction +( + unsigned char *u, // source pointer + int p, // pitch + const signed char *flimit, + const signed char *limit, + const signed char *thresh, + unsigned char *v +); + +extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6); +extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6); +extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6); +extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6); +extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6); +extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6); + +extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_y_neon); +extern prototype_loopfilter(vp8_loop_filter_vertical_edge_y_neon); +extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_y_neon); +extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_y_neon); +extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_neon); +extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_neon); + +extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_neon; +extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_neon; +extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_neon; +extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon; + + +#if HAVE_ARMV6 +//ARMV6 loopfilter functions +// Horizontal MB filtering +void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); +} + +void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); +} + +// Vertical MB Filtering +void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); +} + +void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); +} + +// Horizontal B Filtering +void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) 
simpler_lpf; + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); +} + +void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); +} + +// Vertical B Filtering +void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); +} + +void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); +} +#endif + +#if HAVE_ARMV7 +// NEON loopfilter functions +// Horizontal MB filtering +void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr); +} + +void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); +} + +// Vertical MB Filtering +void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) 
simpler_lpf; + vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr); +} + +void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); +} + +// Horizontal B Filtering +void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4 * uv_stride); +} + +void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); +} + +// Vertical B Filtering +void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4); +} + +void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) u_ptr; + (void) v_ptr; + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); +} +#endif diff --git a/vp8/common/arm/loopfilter_arm.h b/vp8/common/arm/loopfilter_arm.h new file mode 100644 index 000000000..4bb49456d --- /dev/null +++ b/vp8/common/arm/loopfilter_arm.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. 
All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef LOOPFILTER_ARM_H +#define LOOPFILTER_ARM_H + +#if HAVE_ARMV6 +extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6); +extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6); +extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6); +extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6); +extern prototype_loopfilter_block(vp8_loop_filter_mbvs_armv6); +extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6); +extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6); +extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6); + +#undef vp8_lf_normal_mb_v +#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_armv6 + +#undef vp8_lf_normal_b_v +#define vp8_lf_normal_b_v vp8_loop_filter_bv_armv6 + +#undef vp8_lf_normal_mb_h +#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_armv6 + +#undef vp8_lf_normal_b_h +#define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6 + +#undef vp8_lf_simple_mb_v +#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_armv6 + +#undef vp8_lf_simple_b_v +#define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6 + +#undef vp8_lf_simple_mb_h +#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_armv6 + +#undef vp8_lf_simple_b_h +#define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6 +#endif + +#if HAVE_ARMV7 +extern prototype_loopfilter_block(vp8_loop_filter_mbv_neon); +extern prototype_loopfilter_block(vp8_loop_filter_bv_neon); +extern prototype_loopfilter_block(vp8_loop_filter_mbh_neon); +extern prototype_loopfilter_block(vp8_loop_filter_bh_neon); +extern prototype_loopfilter_block(vp8_loop_filter_mbvs_neon); +extern prototype_loopfilter_block(vp8_loop_filter_bvs_neon); +extern prototype_loopfilter_block(vp8_loop_filter_mbhs_neon); +extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon); + +#undef vp8_lf_normal_mb_v +#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_neon + +#undef vp8_lf_normal_b_v +#define vp8_lf_normal_b_v vp8_loop_filter_bv_neon + +#undef vp8_lf_normal_mb_h +#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_neon + +#undef vp8_lf_normal_b_h +#define vp8_lf_normal_b_h vp8_loop_filter_bh_neon + +#undef vp8_lf_simple_mb_v +#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_neon + +#undef vp8_lf_simple_b_v +#define vp8_lf_simple_b_v vp8_loop_filter_bvs_neon + +#undef vp8_lf_simple_mb_h +#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_neon + +#undef vp8_lf_simple_b_h +#define vp8_lf_simple_b_h vp8_loop_filter_bhs_neon +#endif + +#endif diff --git a/vp8/common/arm/neon/bilinearpredict16x16_neon.asm b/vp8/common/arm/neon/bilinearpredict16x16_neon.asm new file mode 100644 index 000000000..a2fea2bd6 --- /dev/null +++ b/vp8/common/arm/neon/bilinearpredict16x16_neon.asm @@ -0,0 +1,361 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_bilinear_predict16x16_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(r5) int dst_pitch + +|vp8_bilinear_predict16x16_neon| PROC + push {r4-r5, lr} + + ldr r12, _bifilter16_coeff_ + ldr r4, [sp, #12] ;load parameters from stack + ldr r5, [sp, #16] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_bfilter16x16_only + + add r2, r12, r2, lsl #3 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {d31}, [r2] ;load first_pass filter + + beq firstpass_bfilter16x16_only + + sub sp, sp, #272 ;reserve space on stack for temporary storage + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + mov lr, sp + vld1.u8 {d5, d6, d7}, [r0], r1 + + mov r2, #3 ;loop counter + vld1.u8 {d8, d9, d10}, [r0], r1 + + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vld1.u8 {d11, d12, d13}, [r0], r1 + + vdup.8 d1, d31[4] + +;First Pass: output_height lines x output_width columns (17x16) +filt_blk2d_fp16x16_loop_neon + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q8, d3, d0 + vmull.u8 q9, d5, d0 + vmull.u8 q10, d6, d0 + vmull.u8 q11, d8, d0 + vmull.u8 q12, d9, d0 + vmull.u8 q13, d11, d0 + vmull.u8 q14, d12, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + vext.8 d11, d11, d12, #1 + + vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q9, d5, d1 + vmlal.u8 q11, d8, d1 + vmlal.u8 q13, d11, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + vext.8 d12, d12, d13, #1 + + vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q10, d6, d1 + vmlal.u8 q12, d9, d1 + vmlal.u8 q14, d12, d1 + + subs r2, r2, #1 + + vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d15, q8, #7 + vqrshrn.u16 d16, q9, #7 + vqrshrn.u16 d17, q10, #7 + vqrshrn.u16 d18, q11, #7 + vqrshrn.u16 d19, q12, #7 + vqrshrn.u16 d20, q13, #7 + + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + vqrshrn.u16 d21, q14, #7 + vld1.u8 {d5, d6, d7}, [r0], r1 + + vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result + vld1.u8 {d8, d9, d10}, [r0], r1 + vst1.u8 {d18, d19, d20, d21}, [lr]! 
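+;(Scalar sketch of the pass above, for reference: each first-pass output is
+;roughly temp[i] = (src[i]*vp8_filter[0] + src[i+1]*vp8_filter[1] + 64) >> 7,
+;with vqrshrn performing the rounding shift and the saturation to u8.)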
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ bne filt_blk2d_fp16x16_loop_neon
+
+;First-pass filtering for the remaining 5 lines
+ vld1.u8 {d14, d15, d16}, [r0], r1
+
+ vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q10, d3, d0
+ vmull.u8 q11, d5, d0
+ vmull.u8 q12, d6, d0
+ vmull.u8 q13, d8, d0
+ vmull.u8 q14, d9, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+
+ vmlal.u8 q9, d2, d1 ;(src_ptr[0] * vp8_filter[1])
+ vmlal.u8 q11, d5, d1
+ vmlal.u8 q13, d8, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+
+ vmlal.u8 q10, d3, d1 ;(src_ptr[0] * vp8_filter[1])
+ vmlal.u8 q12, d6, d1
+ vmlal.u8 q14, d9, d1
+
+ vmull.u8 q1, d11, d0
+ vmull.u8 q2, d12, d0
+ vmull.u8 q3, d14, d0
+ vmull.u8 q4, d15, d0
+
+ vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
+ vext.8 d14, d14, d15, #1
+
+ vmlal.u8 q1, d11, d1 ;(src_ptr[0] * vp8_filter[1])
+ vmlal.u8 q3, d14, d1
+
+ vext.8 d12, d12, d13, #1
+ vext.8 d15, d15, d16, #1
+
+ vmlal.u8 q2, d12, d1 ;(src_ptr[0] * vp8_filter[1])
+ vmlal.u8 q4, d15, d1
+
+ vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d11, q10, #7
+ vqrshrn.u16 d12, q11, #7
+ vqrshrn.u16 d13, q12, #7
+ vqrshrn.u16 d14, q13, #7
+ vqrshrn.u16 d15, q14, #7
+ vqrshrn.u16 d16, q1, #7
+ vqrshrn.u16 d17, q2, #7
+ vqrshrn.u16 d18, q3, #7
+ vqrshrn.u16 d19, q4, #7
+
+ vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
+ vst1.u8 {d14, d15, d16, d17}, [lr]!
+ vst1.u8 {d18, d19}, [lr]!
+
+;Second pass: 16x16
+;secondpass_filter
+ add r3, r12, r3, lsl #3
+ sub lr, lr, #272
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ vld1.u8 {d22, d23}, [lr]! ;load src data
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+ mov r12, #4 ;loop counter
+
+filt_blk2d_sp16x16_loop_neon
+ vld1.u8 {d24, d25}, [lr]!
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
+ vld1.u8 {d26, d27}, [lr]!
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [lr]!
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [lr]!
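+;(This second pass applies the same two-tap weighting vertically: each output
+;row is a weighted average of two consecutive rows of the 272-byte temporary
+;buffer, rather than of neighboring pixels within a row.)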
+ + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) + vmlal.u8 q2, d25, d1 + vmlal.u8 q3, d26, d1 + vmlal.u8 q4, d27, d1 + vmlal.u8 q5, d28, d1 + vmlal.u8 q6, d29, d1 + vmlal.u8 q7, d30, d1 + vmlal.u8 q8, d31, d1 + + subs r12, r12, #1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 d9, q8, #7 + + vst1.u8 {d2, d3}, [r4], r5 ;store result + vst1.u8 {d4, d5}, [r4], r5 + vst1.u8 {d6, d7}, [r4], r5 + vmov q11, q15 + vst1.u8 {d8, d9}, [r4], r5 + + bne filt_blk2d_sp16x16_loop_neon + + add sp, sp, #272 + + pop {r4-r5,pc} + +;-------------------- +firstpass_bfilter16x16_only + mov r2, #4 ;loop counter + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vdup.8 d1, d31[4] + +;First Pass: output_height lines x output_width columns (16x16) +filt_blk2d_fpo16x16_loop_neon + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + vld1.u8 {d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10}, [r0], r1 + vld1.u8 {d11, d12, d13}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q8, d3, d0 + vmull.u8 q9, d5, d0 + vmull.u8 q10, d6, d0 + vmull.u8 q11, d8, d0 + vmull.u8 q12, d9, d0 + vmull.u8 q13, d11, d0 + vmull.u8 q14, d12, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + vext.8 d11, d11, d12, #1 + + vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q9, d5, d1 + vmlal.u8 q11, d8, d1 + vmlal.u8 q13, d11, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + vext.8 d12, d12, d13, #1 + + vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1]) + vmlal.u8 q10, d6, d1 + vmlal.u8 q12, d9, d1 + vmlal.u8 q14, d12, d1 + + subs r2, r2, #1 + + vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d15, q8, #7 + vqrshrn.u16 d16, q9, #7 + vqrshrn.u16 d17, q10, #7 + vqrshrn.u16 d18, q11, #7 + vqrshrn.u16 d19, q12, #7 + vqrshrn.u16 d20, q13, #7 + vst1.u8 {d14, d15}, [r4], r5 ;store result + vqrshrn.u16 d21, q14, #7 + + vst1.u8 {d16, d17}, [r4], r5 + vst1.u8 {d18, d19}, [r4], r5 + vst1.u8 {d20, d21}, [r4], r5 + + bne filt_blk2d_fpo16x16_loop_neon + pop {r4-r5,pc} + +;--------------------- +secondpass_bfilter16x16_only +;Second pass: 16x16 +;secondpass_filter + add r3, r12, r3, lsl #3 + mov r12, #4 ;loop counter + vld1.u32 {d31}, [r3] ;load second_pass filter + vld1.u8 {d22, d23}, [r0], r1 ;load src data + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + +filt_blk2d_spo16x16_loop_neon + vld1.u8 {d24, d25}, [r0], r1 + vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) + vld1.u8 {d26, d27}, [r0], r1 + vmull.u8 q2, d23, d0 + vld1.u8 {d28, d29}, [r0], r1 + vmull.u8 q3, d24, d0 + vld1.u8 {d30, d31}, [r0], r1 + + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) + vmlal.u8 q2, d25, d1 + vmlal.u8 q3, d26, d1 + vmlal.u8 q4, d27, d1 + vmlal.u8 q5, d28, d1 + vmlal.u8 q6, d29, d1 + vmlal.u8 q7, d30, d1 + vmlal.u8 q8, d31, d1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 
d9, q8, #7 + + vst1.u8 {d2, d3}, [r4], r5 ;store result + subs r12, r12, #1 + vst1.u8 {d4, d5}, [r4], r5 + vmov q11, q15 + vst1.u8 {d6, d7}, [r4], r5 + vst1.u8 {d8, d9}, [r4], r5 + + bne filt_blk2d_spo16x16_loop_neon + pop {r4-r5,pc} + + ENDP + +;----------------- + AREA bifilters16_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_bifilter16_coeff_ + DCD bifilter16_coeff +bifilter16_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp8/common/arm/neon/bilinearpredict4x4_neon.asm b/vp8/common/arm/neon/bilinearpredict4x4_neon.asm new file mode 100644 index 000000000..74d2db5dc --- /dev/null +++ b/vp8/common/arm/neon/bilinearpredict4x4_neon.asm @@ -0,0 +1,134 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_bilinear_predict4x4_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(lr) int dst_pitch + +|vp8_bilinear_predict4x4_neon| PROC + push {r4, lr} + + ldr r12, _bifilter4_coeff_ + ldr r4, [sp, #8] ;load parameters from stack + ldr lr, [sp, #12] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq skip_firstpass_filter + +;First pass: output_height lines x output_width columns (5x4) + vld1.u8 {d2}, [r0], r1 ;load src data + add r2, r12, r2, lsl #3 ;calculate Hfilter location (2coeffsx4bytes=8bytes) + + vld1.u8 {d3}, [r0], r1 + vld1.u32 {d31}, [r2] ;first_pass filter + + vld1.u8 {d4}, [r0], r1 + vdup.8 d0, d31[0] ;first_pass filter (d0-d1) + vld1.u8 {d5}, [r0], r1 + vdup.8 d1, d31[4] + vld1.u8 {d6}, [r0], r1 + + vshr.u64 q4, q1, #8 ;construct src_ptr[1] + vshr.u64 q5, q2, #8 + vshr.u64 d12, d6, #8 + + vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0]) + vzip.32 d4, d5 + vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1]) + vzip.32 d10, d11 + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q8, d4, d0 + vmull.u8 q9, d6, d0 + + vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp8_filter[1]) + vmlal.u8 q8, d10, d1 + vmlal.u8 q9, d12, d1 + + vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d29, q8, #7 + vqrshrn.u16 d30, q9, #7 + +;Second pass: 4x4 +secondpass_filter + cmp r3, #0 ;skip second_pass filter if yoffset=0 + beq skip_secondpass_filter + + add r3, r12, r3, lsl #3 ;calculate Vfilter location + vld1.u32 {d31}, [r3] ;load second_pass filter + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d31[4] + + vmull.u8 q1, d28, d0 + vmull.u8 q2, d29, d0 + + vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step] + vext.8 d27, d29, d30, #4 + + vmlal.u8 q1, d26, d1 + vmlal.u8 q2, d27, d1 + + add r0, r4, lr + add r1, r0, lr + add r2, r1, lr + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + + vst1.32 {d2[0]}, [r4] ;store result + vst1.32 {d2[1]}, [r0] + vst1.32 {d3[0]}, [r1] + vst1.32 {d3[1]}, [r2] + + pop {r4, pc} + +;-------------------- 
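+;(When xoffset=0 the coefficient pair is (128, 0), i.e. an identity filter,
+;so this path replaces the first pass with plain loads of the source rows.)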
+skip_firstpass_filter + + vld1.32 {d28[0]}, [r0], r1 ;load src data + vld1.32 {d28[1]}, [r0], r1 + vld1.32 {d29[0]}, [r0], r1 + vld1.32 {d29[1]}, [r0], r1 + vld1.32 {d30[0]}, [r0], r1 + + b secondpass_filter + +;--------------------- +skip_secondpass_filter + vst1.32 {d28[0]}, [r4], lr ;store result + vst1.32 {d28[1]}, [r4], lr + vst1.32 {d29[0]}, [r4], lr + vst1.32 {d29[1]}, [r4], lr + + pop {r4, pc} + + ENDP + +;----------------- + AREA bilinearfilters4_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_bifilter4_coeff_ + DCD bifilter4_coeff +bifilter4_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp8/common/arm/neon/bilinearpredict8x4_neon.asm b/vp8/common/arm/neon/bilinearpredict8x4_neon.asm new file mode 100644 index 000000000..46ebb0e0b --- /dev/null +++ b/vp8/common/arm/neon/bilinearpredict8x4_neon.asm @@ -0,0 +1,139 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_bilinear_predict8x4_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(lr) int dst_pitch + +|vp8_bilinear_predict8x4_neon| PROC + push {r4, lr} + + ldr r12, _bifilter8x4_coeff_ + ldr r4, [sp, #8] ;load parameters from stack + ldr lr, [sp, #12] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq skip_firstpass_filter + +;First pass: output_height lines x output_width columns (5x8) + add r2, r12, r2, lsl #3 ;calculate filter location + + vld1.u8 {q1}, [r0], r1 ;load src data + vld1.u32 {d31}, [r2] ;load first_pass filter + vld1.u8 {q2}, [r0], r1 + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vld1.u8 {q3}, [r0], r1 + vdup.8 d1, d31[4] + vld1.u8 {q4}, [r0], r1 + + vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0]) + vld1.u8 {q5}, [r0], r1 + vmull.u8 q7, d4, d0 + vmull.u8 q8, d6, d0 + vmull.u8 q9, d8, d0 + vmull.u8 q10, d10, d0 + + vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vext.8 d9, d8, d9, #1 + vext.8 d11, d10, d11, #1 + + vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1]) + vmlal.u8 q7, d5, d1 + vmlal.u8 q8, d7, d1 + vmlal.u8 q9, d9, d1 + vmlal.u8 q10, d11, d1 + + vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d23, q7, #7 + vqrshrn.u16 d24, q8, #7 + vqrshrn.u16 d25, q9, #7 + vqrshrn.u16 d26, q10, #7 + +;Second pass: 4x8 +secondpass_filter + cmp r3, #0 ;skip second_pass filter if yoffset=0 + beq skip_secondpass_filter + + add r3, r12, r3, lsl #3 + add r0, r4, lr + + vld1.u32 {d31}, [r3] ;load second_pass filter + add r1, r0, lr + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + + vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q2, d23, d0 + vmull.u8 q3, d24, d0 + vmull.u8 q4, d25, d0 + + vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) + vmlal.u8 q2, d24, d1 + vmlal.u8 q3, d25, d1 + vmlal.u8 q4, d26, d1 + + add r2, r1, lr + 
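+;(At this point r4, r0, r1 and r2 hold the four output rows, i.e.
+;dst_ptr plus 0, 1, 2 and 3 times dst_pitch respectively.)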
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + + vst1.u8 {d2}, [r4] ;store result + vst1.u8 {d3}, [r0] + vst1.u8 {d4}, [r1] + vst1.u8 {d5}, [r2] + + pop {r4, pc} + +;-------------------- +skip_firstpass_filter + vld1.u8 {d22}, [r0], r1 ;load src data + vld1.u8 {d23}, [r0], r1 + vld1.u8 {d24}, [r0], r1 + vld1.u8 {d25}, [r0], r1 + vld1.u8 {d26}, [r0], r1 + + b secondpass_filter + +;--------------------- +skip_secondpass_filter + vst1.u8 {d22}, [r4], lr ;store result + vst1.u8 {d23}, [r4], lr + vst1.u8 {d24}, [r4], lr + vst1.u8 {d25}, [r4], lr + + pop {r4, pc} + + ENDP + +;----------------- + AREA bifilters8x4_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_bifilter8x4_coeff_ + DCD bifilter8x4_coeff +bifilter8x4_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp8/common/arm/neon/bilinearpredict8x8_neon.asm b/vp8/common/arm/neon/bilinearpredict8x8_neon.asm new file mode 100644 index 000000000..80728d4f8 --- /dev/null +++ b/vp8/common/arm/neon/bilinearpredict8x8_neon.asm @@ -0,0 +1,187 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_bilinear_predict8x8_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(lr) int dst_pitch + +|vp8_bilinear_predict8x8_neon| PROC + push {r4, lr} + + ldr r12, _bifilter8_coeff_ + ldr r4, [sp, #8] ;load parameters from stack + ldr lr, [sp, #12] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq skip_firstpass_filter + +;First pass: output_height lines x output_width columns (9x8) + add r2, r12, r2, lsl #3 ;calculate filter location + + vld1.u8 {q1}, [r0], r1 ;load src data + vld1.u32 {d31}, [r2] ;load first_pass filter + vld1.u8 {q2}, [r0], r1 + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vld1.u8 {q3}, [r0], r1 + vdup.8 d1, d31[4] + vld1.u8 {q4}, [r0], r1 + + vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q7, d4, d0 + vmull.u8 q8, d6, d0 + vmull.u8 q9, d8, d0 + + vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vext.8 d9, d8, d9, #1 + + vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1]) + vmlal.u8 q7, d5, d1 + vmlal.u8 q8, d7, d1 + vmlal.u8 q9, d9, d1 + + vld1.u8 {q1}, [r0], r1 ;load src data + vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 + vld1.u8 {q2}, [r0], r1 + vqrshrn.u16 d23, q7, #7 + vld1.u8 {q3}, [r0], r1 + vqrshrn.u16 d24, q8, #7 + vld1.u8 {q4}, [r0], r1 + vqrshrn.u16 d25, q9, #7 + + ;first_pass filtering on the rest 5-line data + vld1.u8 {q5}, [r0], r1 + + vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q7, d4, d0 + vmull.u8 q8, d6, d0 + vmull.u8 q9, d8, d0 + vmull.u8 q10, d10, d0 + + vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vext.8 d9, d8, d9, #1 + vext.8 d11, d10, 
d11, #1 + + vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1]) + vmlal.u8 q7, d5, d1 + vmlal.u8 q8, d7, d1 + vmlal.u8 q9, d9, d1 + vmlal.u8 q10, d11, d1 + + vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d27, q7, #7 + vqrshrn.u16 d28, q8, #7 + vqrshrn.u16 d29, q9, #7 + vqrshrn.u16 d30, q10, #7 + +;Second pass: 8x8 +secondpass_filter + cmp r3, #0 ;skip second_pass filter if yoffset=0 + beq skip_secondpass_filter + + add r3, r12, r3, lsl #3 + add r0, r4, lr + + vld1.u32 {d31}, [r3] ;load second_pass filter + add r1, r0, lr + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + + vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) + vmull.u8 q2, d23, d0 + vmull.u8 q3, d24, d0 + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) + vmlal.u8 q2, d24, d1 + vmlal.u8 q3, d25, d1 + vmlal.u8 q4, d26, d1 + vmlal.u8 q5, d27, d1 + vmlal.u8 q6, d28, d1 + vmlal.u8 q7, d29, d1 + vmlal.u8 q8, d30, d1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 d9, q8, #7 + + vst1.u8 {d2}, [r4] ;store result + vst1.u8 {d3}, [r0] + vst1.u8 {d4}, [r1], lr + vst1.u8 {d5}, [r1], lr + vst1.u8 {d6}, [r1], lr + vst1.u8 {d7}, [r1], lr + vst1.u8 {d8}, [r1], lr + vst1.u8 {d9}, [r1], lr + + pop {r4, pc} + +;-------------------- +skip_firstpass_filter + vld1.u8 {d22}, [r0], r1 ;load src data + vld1.u8 {d23}, [r0], r1 + vld1.u8 {d24}, [r0], r1 + vld1.u8 {d25}, [r0], r1 + vld1.u8 {d26}, [r0], r1 + vld1.u8 {d27}, [r0], r1 + vld1.u8 {d28}, [r0], r1 + vld1.u8 {d29}, [r0], r1 + vld1.u8 {d30}, [r0], r1 + + b secondpass_filter + +;--------------------- +skip_secondpass_filter + vst1.u8 {d22}, [r4], lr ;store result + vst1.u8 {d23}, [r4], lr + vst1.u8 {d24}, [r4], lr + vst1.u8 {d25}, [r4], lr + vst1.u8 {d26}, [r4], lr + vst1.u8 {d27}, [r4], lr + vst1.u8 {d28}, [r4], lr + vst1.u8 {d29}, [r4], lr + + pop {r4, pc} + + ENDP + +;----------------- + AREA bifilters8_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_bifilter8_coeff_ + DCD bifilter8_coeff +bifilter8_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm new file mode 100644 index 000000000..f42ac63c9 --- /dev/null +++ b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm @@ -0,0 +1,583 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_build_intra_predictors_mby_neon_func| + EXPORT |vp8_build_intra_predictors_mby_s_neon_func| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *y_buffer +; r1 unsigned char *ypred_ptr +; r2 int y_stride +; r3 int mode +; stack int Up +; stack int Left + +|vp8_build_intra_predictors_mby_neon_func| PROC + push {r4-r8, lr} + + cmp r3, #0 + beq case_dc_pred + cmp r3, #1 + beq case_v_pred + cmp r3, #2 + beq case_h_pred + cmp r3, #3 + beq case_tm_pred + +case_dc_pred + ldr r4, [sp, #24] ; Up + ldr r5, [sp, #28] ; Left + + ; Default the DC average to 128 + mov r12, #128 + vdup.u8 q0, r12 + + ; Zero out running sum + mov r12, #0 + + ; compute shift and jump + adds r7, r4, r5 + beq skip_dc_pred_up_left + + ; Load above row, if it exists + cmp r4, #0 + beq skip_dc_pred_up + + sub r6, r0, r2 + vld1.8 {q1}, [r6] + vpaddl.u8 q2, q1 + vpaddl.u16 q3, q2 + vpaddl.u32 q4, q3 + + vmov.32 r4, d8[0] + vmov.32 r6, d9[0] + + add r12, r4, r6 + + ; Move back to interger registers + +skip_dc_pred_up + + cmp r5, #0 + beq skip_dc_pred_left + + sub r0, r0, #1 + + ; Load left row, if it exists + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0] + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + +skip_dc_pred_left + add r7, r7, #3 ; Shift + sub r4, r7, #1 + mov r5, #1 + add r12, r12, r5, lsl r4 + mov r5, r12, lsr r7 ; expected_dc + + vdup.u8 q0, r5 + +skip_dc_pred_up_left + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + + pop {r4-r8,pc} +case_v_pred + ; Copy down above row + sub r6, r0, r2 + vld1.8 {q0}, [r6] + + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + pop {r4-r8,pc} + +case_h_pred + ; Load 4x yleft_col + sub r0, r0, #1 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! + + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! 
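+ ;(H prediction: each left-column pixel is broadcast by vdup across a full
+ ;16-byte row; the unrolled blocks above and below emit four rows each.)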
+ + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! + + pop {r4-r8,pc} + +case_tm_pred + ; Load yabove_row + sub r3, r0, r2 + vld1.8 {q8}, [r3] + + ; Load ytop_left + sub r3, r3, #1 + ldrb r7, [r3] + + vdup.u16 q7, r7 + + ; Compute yabove_row - ytop_left + mov r3, #1 + vdup.u8 q0, r3 + + vmull.u8 q4, d16, d0 + vmull.u8 q5, d17, d0 + + vsub.s16 q4, q4, q7 + vsub.s16 q5, q5, q7 + + ; Load 4x yleft_col + sub r0, r0, #1 + mov r12, #4 + +case_tm_pred_loop + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u16 q0, r3 + vdup.u16 q1, r4 + vdup.u16 q2, r5 + vdup.u16 q3, r6 + + vqadd.s16 q8, q0, q4 + vqadd.s16 q9, q0, q5 + + vqadd.s16 q10, q1, q4 + vqadd.s16 q11, q1, q5 + + vqadd.s16 q12, q2, q4 + vqadd.s16 q13, q2, q5 + + vqadd.s16 q14, q3, q4 + vqadd.s16 q15, q3, q5 + + vqshrun.s16 d0, q8, #0 + vqshrun.s16 d1, q9, #0 + + vqshrun.s16 d2, q10, #0 + vqshrun.s16 d3, q11, #0 + + vqshrun.s16 d4, q12, #0 + vqshrun.s16 d5, q13, #0 + + vqshrun.s16 d6, q14, #0 + vqshrun.s16 d7, q15, #0 + + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! + + subs r12, r12, #1 + bne case_tm_pred_loop + + pop {r4-r8,pc} + + ENDP + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; r0 unsigned char *y_buffer +; r1 unsigned char *ypred_ptr +; r2 int y_stride +; r3 int mode +; stack int Up +; stack int Left + +|vp8_build_intra_predictors_mby_s_neon_func| PROC + push {r4-r8, lr} + + mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor; + + cmp r3, #0 + beq case_dc_pred_s + cmp r3, #1 + beq case_v_pred_s + cmp r3, #2 + beq case_h_pred_s + cmp r3, #3 + beq case_tm_pred_s + +case_dc_pred_s + ldr r4, [sp, #24] ; Up + ldr r5, [sp, #28] ; Left + + ; Default the DC average to 128 + mov r12, #128 + vdup.u8 q0, r12 + + ; Zero out running sum + mov r12, #0 + + ; compute shift and jump + adds r7, r4, r5 + beq skip_dc_pred_up_left_s + + ; Load above row, if it exists + cmp r4, #0 + beq skip_dc_pred_up_s + + sub r6, r0, r2 + vld1.8 {q1}, [r6] + vpaddl.u8 q2, q1 + vpaddl.u16 q3, q2 + vpaddl.u32 q4, q3 + + vmov.32 r4, d8[0] + vmov.32 r6, d9[0] + + add r12, r4, r6 + + ; Move back to interger registers + +skip_dc_pred_up_s + + cmp r5, #0 + beq skip_dc_pred_left_s + + sub r0, r0, #1 + + ; Load left row, if it exists + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0] + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + +skip_dc_pred_left_s + add r7, r7, #3 ; Shift + sub r4, r7, #1 + mov r5, #1 + add r12, r12, r5, lsl r4 + mov r5, r12, lsr r7 ; expected_dc + + vdup.u8 q0, r5 + +skip_dc_pred_up_left_s + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 
+ vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + + pop {r4-r8,pc} +case_v_pred_s + ; Copy down above row + sub r6, r0, r2 + vld1.8 {q0}, [r6] + + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + pop {r4-r8,pc} + +case_h_pred_s + ; Load 4x yleft_col + sub r0, r0, #1 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + pop {r4-r8,pc} + +case_tm_pred_s + ; Load yabove_row + sub r3, r0, r2 + vld1.8 {q8}, [r3] + + ; Load ytop_left + sub r3, r3, #1 + ldrb r7, [r3] + + vdup.u16 q7, r7 + + ; Compute yabove_row - ytop_left + mov r3, #1 + vdup.u8 q0, r3 + + vmull.u8 q4, d16, d0 + vmull.u8 q5, d17, d0 + + vsub.s16 q4, q4, q7 + vsub.s16 q5, q5, q7 + + ; Load 4x yleft_col + sub r0, r0, #1 + mov r12, #4 + +case_tm_pred_loop_s + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u16 q0, r3 + vdup.u16 q1, r4 + vdup.u16 q2, r5 + vdup.u16 q3, r6 + + vqadd.s16 q8, q0, q4 + vqadd.s16 q9, q0, q5 + + vqadd.s16 q10, q1, q4 + vqadd.s16 q11, q1, q5 + + vqadd.s16 q12, q2, q4 + vqadd.s16 q13, q2, q5 + + vqadd.s16 q14, q3, q4 + vqadd.s16 q15, q3, q5 + + vqshrun.s16 d0, q8, #0 + vqshrun.s16 d1, q9, #0 + + vqshrun.s16 d2, q10, #0 + vqshrun.s16 d3, q11, #0 + + vqshrun.s16 d4, q12, #0 + vqshrun.s16 d5, q13, #0 + + vqshrun.s16 d6, q14, #0 + vqshrun.s16 d7, q15, #0 + + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + subs r12, r12, #1 + bne case_tm_pred_loop_s + + pop {r4-r8,pc} + + ENDP + + + END diff --git a/vp8/common/arm/neon/copymem16x16_neon.asm b/vp8/common/arm/neon/copymem16x16_neon.asm new file mode 100644 index 000000000..89d5e1018 --- /dev/null +++ b/vp8/common/arm/neon/copymem16x16_neon.asm @@ -0,0 +1,58 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_copy_mem16x16_neon| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem16x16_neon| PROC + + vld1.u8 {q0}, [r0], r1 + vld1.u8 {q1}, [r0], r1 + vld1.u8 {q2}, [r0], r1 + vst1.u8 {q0}, [r2], r3 + vld1.u8 {q3}, [r0], r1 + vst1.u8 {q1}, [r2], r3 + vld1.u8 {q4}, [r0], r1 + vst1.u8 {q2}, [r2], r3 + vld1.u8 {q5}, [r0], r1 + vst1.u8 {q3}, [r2], r3 + vld1.u8 {q6}, [r0], r1 + vst1.u8 {q4}, [r2], r3 + vld1.u8 {q7}, [r0], r1 + vst1.u8 {q5}, [r2], r3 + vld1.u8 {q8}, [r0], r1 + vst1.u8 {q6}, [r2], r3 + vld1.u8 {q9}, [r0], r1 + vst1.u8 {q7}, [r2], r3 + vld1.u8 {q10}, [r0], r1 + vst1.u8 {q8}, [r2], r3 + vld1.u8 {q11}, [r0], r1 + vst1.u8 {q9}, [r2], r3 + vld1.u8 {q12}, [r0], r1 + vst1.u8 {q10}, [r2], r3 + vld1.u8 {q13}, [r0], r1 + vst1.u8 {q11}, [r2], r3 + vld1.u8 {q14}, [r0], r1 + vst1.u8 {q12}, [r2], r3 + vld1.u8 {q15}, [r0], r1 + vst1.u8 {q13}, [r2], r3 + vst1.u8 {q14}, [r2], r3 + vst1.u8 {q15}, [r2], r3 + + mov pc, lr + + ENDP ; |vp8_copy_mem16x16_neon| + + END diff --git a/vp8/common/arm/neon/copymem8x4_neon.asm b/vp8/common/arm/neon/copymem8x4_neon.asm new file mode 100644 index 000000000..302f734ff --- /dev/null +++ b/vp8/common/arm/neon/copymem8x4_neon.asm @@ -0,0 +1,33 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_copy_mem8x4_neon| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem8x4_neon| PROC + vld1.u8 {d0}, [r0], r1 + vld1.u8 {d1}, [r0], r1 + vst1.u8 {d0}, [r2], r3 + vld1.u8 {d2}, [r0], r1 + vst1.u8 {d1}, [r2], r3 + vld1.u8 {d3}, [r0], r1 + vst1.u8 {d2}, [r2], r3 + vst1.u8 {d3}, [r2], r3 + + mov pc, lr + + ENDP ; |vp8_copy_mem8x4_neon| + + END diff --git a/vp8/common/arm/neon/copymem8x8_neon.asm b/vp8/common/arm/neon/copymem8x8_neon.asm new file mode 100644 index 000000000..50d39ef66 --- /dev/null +++ b/vp8/common/arm/neon/copymem8x8_neon.asm @@ -0,0 +1,42 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_copy_mem8x8_neon| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem8x8_neon| PROC + + vld1.u8 {d0}, [r0], r1 + vld1.u8 {d1}, [r0], r1 + vst1.u8 {d0}, [r2], r3 + vld1.u8 {d2}, [r0], r1 + vst1.u8 {d1}, [r2], r3 + vld1.u8 {d3}, [r0], r1 + vst1.u8 {d2}, [r2], r3 + vld1.u8 {d4}, [r0], r1 + vst1.u8 {d3}, [r2], r3 + vld1.u8 {d5}, [r0], r1 + vst1.u8 {d4}, [r2], r3 + vld1.u8 {d6}, [r0], r1 + vst1.u8 {d5}, [r2], r3 + vld1.u8 {d7}, [r0], r1 + vst1.u8 {d6}, [r2], r3 + vst1.u8 {d7}, [r2], r3 + + mov pc, lr + + ENDP ; |vp8_copy_mem8x8_neon| + + END diff --git a/vp8/common/arm/neon/iwalsh_neon.asm b/vp8/common/arm/neon/iwalsh_neon.asm new file mode 100644 index 000000000..4fc744c96 --- /dev/null +++ b/vp8/common/arm/neon/iwalsh_neon.asm @@ -0,0 +1,95 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + EXPORT |vp8_short_inv_walsh4x4_neon| + EXPORT |vp8_short_inv_walsh4x4_1_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY ; name this block of code + +;short vp8_short_inv_walsh4x4_neon(short *input, short *output) +|vp8_short_inv_walsh4x4_neon| PROC + + ; read in all four lines of values: d0->d3 + vldm.64 r0, {q0, q1} + + ; first for loop + + vadd.s16 d4, d0, d3 ;a = [0] + [12] + vadd.s16 d5, d1, d2 ;b = [4] + [8] + vsub.s16 d6, d1, d2 ;c = [4] - [8] + vsub.s16 d7, d0, d3 ;d = [0] - [12] + + vadd.s16 d0, d4, d5 ;a + b + vadd.s16 d1, d6, d7 ;c + d + vsub.s16 d2, d4, d5 ;a - b + vsub.s16 d3, d7, d6 ;d - c + + vtrn.32 d0, d2 ;d0: 0 1 8 9 + ;d2: 2 3 10 11 + vtrn.32 d1, d3 ;d1: 4 5 12 13 + ;d3: 6 7 14 15 + + vtrn.16 d0, d1 ;d0: 0 4 8 12 + ;d1: 1 5 9 13 + vtrn.16 d2, d3 ;d2: 2 6 10 14 + ;d3: 3 7 11 15 + + ; second for loop + + vadd.s16 d4, d0, d3 ;a = [0] + [3] + vadd.s16 d5, d1, d2 ;b = [1] + [2] + vsub.s16 d6, d1, d2 ;c = [1] - [2] + vsub.s16 d7, d0, d3 ;d = [0] - [3] + + vadd.s16 d0, d4, d5 ;e = a + b + vadd.s16 d1, d6, d7 ;f = c + d + vsub.s16 d2, d4, d5 ;g = a - b + vsub.s16 d3, d7, d6 ;h = d - c + + vmov.i16 q2, #3 + vadd.i16 q0, q0, q2 ;e/f += 3 + vadd.i16 q1, q1, q2 ;g/h += 3 + + vshr.s16 q0, q0, #3 ;e/f >> 3 + vshr.s16 q1, q1, #3 ;g/h >> 3 + + vtrn.32 d0, d2 + vtrn.32 d1, d3 + vtrn.16 d0, d1 + vtrn.16 d2, d3 + + vstmia.16 r1!, {q0} + vstmia.16 r1!, {q1} + + bx lr + ENDP ; |vp8_short_inv_walsh4x4_neon| + + +;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output) +|vp8_short_inv_walsh4x4_1_neon| PROC + ; load a full line into a neon register + vld1.16 {q0}, [r0] + ; extract first element and replicate + vdup.16 q1, d0[0] + ; add 3 to all values + vmov.i16 q2, #3 + vadd.i16 q3, q1, q2 + ; right shift + vshr.s16 q3, q3, #3 + ; write it back + vstmia.16 r1!, {q3} + vstmia.16 r1!, {q3} + + bx lr + ENDP ; |vp8_short_inv_walsh4x4_1_neon| + + END diff --git a/vp8/common/arm/neon/loopfilterhorizontaledge_uv_neon.asm b/vp8/common/arm/neon/loopfilterhorizontaledge_uv_neon.asm new file mode 100644 index 000000000..e3e8e8a72 --- /dev/null +++ b/vp8/common/arm/neon/loopfilterhorizontaledge_uv_neon.asm @@ -0,0 +1,205 @@ +; +; Copyright (c) 2010 The VP8 
project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_horizontal_edge_uv_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
+;are equal. So, in the code, only one load is needed
+;for flimit. Same way applies to limit and thresh.
+; r0 unsigned char *u,
+; r1 int p, //pitch
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; stack(r4) const signed char *thresh,
+; stack(r5) unsigned char *v
+
+|vp8_loop_filter_horizontal_edge_uv_neon| PROC
+ sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
+ vld1.s8 {d0[], d1[]}, [r2] ; flimit
+
+ ldr r2, [sp, #4] ; load v ptr
+ ldr r12, [sp, #0] ; load thresh pointer
+
+ sub r2, r2, r1, lsl #2 ; move v pointer down by 4 lines
+
+ vld1.u8 {d6}, [r0], r1 ; p3
+ vld1.u8 {d7}, [r2], r1 ; p3
+ vld1.u8 {d8}, [r0], r1 ; p2
+ vld1.u8 {d9}, [r2], r1 ; p2
+ vld1.u8 {d10}, [r0], r1 ; p1
+ vld1.u8 {d11}, [r2], r1 ; p1
+ vld1.u8 {d12}, [r0], r1 ; p0
+ vld1.u8 {d13}, [r2], r1 ; p0
+ vld1.u8 {d14}, [r0], r1 ; q0
+ vld1.u8 {d15}, [r2], r1 ; q0
+ vld1.u8 {d16}, [r0], r1 ; q1
+ vld1.u8 {d17}, [r2], r1 ; q1
+ vld1.u8 {d18}, [r0], r1 ; q2
+ vld1.u8 {d19}, [r2], r1 ; q2
+ vld1.u8 {d20}, [r0], r1 ; q3
+ vld1.u8 {d21}, [r2], r1 ; q3
+
+ vld1.s8 {d2[], d3[]}, [r3] ; limit
+ vld1.s8 {d4[], d5[]}, [r12] ; thresh
+
+ ldr r12, _lfhuv_coeff_
+ ;vp8_filter_mask() function
+ ;vp8_hevmask() function
+ vabd.u8 q11, q3, q4 ; abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; abs(q1 - q0)
+ vabd.u8 q3, q9, q8 ; abs(q2 - q1)
+ vabd.u8 q4, q10, q9 ; abs(q3 - q2)
+ vabd.u8 q9, q6, q7 ; abs(p0 - q0)
+
+ vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1
+ vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1
+ vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1
+ vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1
+
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
+
+ vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1
+ vcge.u8 q4, q1, q4 ; (abs(q3 - q2) > limit)*-1
+ vadd.u8 q0, q0, q0 ; flimit * 2
+ vadd.u8 q0, q0, q1 ; flimit * 2 + limit
+
+ vand q15, q15, q12
+ vand q10, q10, q11
+ vand q3, q3, q4
+
+ vabd.u8 q2, q5, q8 ; abs(p1 - q1)
+ vqadd.u8 q9, q9, q9 ; abs(p0 - q0) * 2
+ vshr.u8 q2, q2, #1 ; abs(p1 - q1) / 2
+ vqadd.u8 q9, q9, q2 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+ vcge.u8 q9, q0, q9 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
+
+ vld1.u8 {q0}, [r12]!
+
+ vand q15, q15, q10
+
+ ;vp8_filter() function
+ veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
+ veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
+ veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
+ veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
+;;;;;;;;;;;;;;
+ vld1.u8 {q10}, [r12]!
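+ ;(Scalar sketch of vp8_filter() as implemented here: a = clamp(ps1 - qs1),
+ ;kept only where hev; a = clamp(a + 3*(qs0 - ps0)) & mask; then
+ ;Filter1 = clamp(a+4) >> 3 is subtracted from qs0,
+ ;Filter2 = clamp(a+3) >> 3 is added to ps0, and (Filter1+1) >> 1
+ ;adjusts ps1/qs1 where hev is not set.)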
+ + ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q11, d15, d13 + + vand q3, q3, q9 + vmovl.u8 q4, d20 + + vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + vorr q14, q13, q14 ; q14: vp8_hevmask + + ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0) + vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) + vmul.i16 q11, q11, q4 + + vand q1, q1, q14 ; vp8_filter &= hev + vand q15, q15, q3 ; q15: vp8_filter_mask + ;; + ;vld1.u8 {q4}, [r12]! ;no need 7 any more + + ;vqadd.s8 q1, q1, q2 + vaddw.s8 q2, q2, d2 + vaddw.s8 q11, q11, d3 + + vld1.u8 {q9}, [r12]! + ; + vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d3, q11 + ;; + + vand q1, q1, q15 ; vp8_filter &= mask + ;; +;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7 +; vand q2, q1, q4 ; s = vp8_filter & 7 +; vqadd.s8 q1, q1, q9 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) + ;;;; +; vshr.s8 q1, q1, #3 ; vp8_filter >>= 3 +; vceq.i8 q2, q2, q9 ; s = (s==4)*-1 + ;; +; ;calculate output +; vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) +; vqadd.s8 q11, q2, q1 ; u = vp8_signed_char_clamp(s + vp8_filter) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; q10=3 + vqadd.s8 q2, q1, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q1, q1, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vshr.s8 q2, q2, #3 ; Filter2 >>= 3 + vshr.s8 q1, q1, #3 ; Filter1 >>= 3 + + ;calculate output + vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vrshr.s8 q1, q1, #1 ;round/shift: vp8_filter += 1; vp8_filter >>= 1 + + sub r0, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + ; + + vbic q1, q1, q14 ; vp8_filter &= ~hev + + sub r2, r2, r1, lsl #2 + sub r2, r2, r1, lsl #1 + ;; + + vqadd.s8 q13, q5, q1 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) + ;vqadd.s8 q11, q6, q11 ; u = vp8_signed_char_clamp(ps0 + u) + vqsub.s8 q12, q8, q1 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) + ; + + veor q5, q13, q0 ; *op1 = u^0x80 + veor q6, q11, q0 ; *op0 = u^0x80 + veor q7, q10, q0 ; *oq0 = u^0x80 + veor q8, q12, q0 ; *oq1 = u^0x80 + ; + + vst1.u8 {d10}, [r0], r1 ; store u op1 + vst1.u8 {d11}, [r2], r1 ; store v op1 + vst1.u8 {d12}, [r0], r1 ; store u op0 + vst1.u8 {d13}, [r2], r1 ; store v op0 + vst1.u8 {d14}, [r0], r1 ; store u oq0 + vst1.u8 {d15}, [r2], r1 ; store v oq0 + vst1.u8 {d16}, [r0], r1 ; store u oq1 + vst1.u8 {d17}, [r2], r1 ; store v oq1 + + bx lr + ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon| + +;----------------- + AREA hloopfilteruv_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 16 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_lfhuv_coeff_ + DCD lfhuv_coeff +lfhuv_coeff + DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 + DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 + DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 + DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101 + + END diff --git a/vp8/common/arm/neon/loopfilterhorizontaledge_y_neon.asm b/vp8/common/arm/neon/loopfilterhorizontaledge_y_neon.asm new file mode 100644 index 000000000..f11055d42 --- /dev/null +++ b/vp8/common/arm/neon/loopfilterhorizontaledge_y_neon.asm @@ -0,0 +1,188 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_horizontal_edge_y_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
+;are equal. So, in the code, only one load is needed
+;for flimit. Same way applies to limit and thresh.
+; r0 unsigned char *s,
+; r1 int p, //pitch
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; stack(r4) const signed char *thresh,
+; //stack(r5) int count --unused
+
+|vp8_loop_filter_horizontal_edge_y_neon| PROC
+ sub r0, r0, r1, lsl #2 ; move src pointer down by 4 lines
+ ldr r12, [sp, #0] ; load thresh pointer
+
+ vld1.u8 {q3}, [r0], r1 ; p3
+ vld1.s8 {d0[], d1[]}, [r2] ; flimit
+ vld1.u8 {q4}, [r0], r1 ; p2
+ vld1.s8 {d2[], d3[]}, [r3] ; limit
+ vld1.u8 {q5}, [r0], r1 ; p1
+ vld1.s8 {d4[], d5[]}, [r12] ; thresh
+ vld1.u8 {q6}, [r0], r1 ; p0
+ ldr r12, _lfhy_coeff_
+ vld1.u8 {q7}, [r0], r1 ; q0
+
+ ;vp8_filter_mask() function
+ ;vp8_hevmask() function
+ vabd.u8 q11, q3, q4 ; abs(p3 - p2)
+ vld1.u8 {q8}, [r0], r1 ; q1
+ vabd.u8 q12, q4, q5 ; abs(p2 - p1)
+ vld1.u8 {q9}, [r0], r1 ; q2
+ vabd.u8 q13, q5, q6 ; abs(p1 - p0)
+ vld1.u8 {q10}, [r0], r1 ; q3
+ vabd.u8 q14, q8, q7 ; abs(q1 - q0)
+ vabd.u8 q3, q9, q8 ; abs(q2 - q1)
+ vabd.u8 q4, q10, q9 ; abs(q3 - q2)
+ vabd.u8 q9, q6, q7 ; abs(p0 - q0)
+
+ vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1
+ vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1
+ vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1
+ vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1
+
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
+
+ vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1
+ vcge.u8 q4, q1, q4 ; (abs(q3 - q2) > limit)*-1
+ vadd.u8 q0, q0, q0 ; flimit * 2
+ vadd.u8 q0, q0, q1 ; flimit * 2 + limit
+
+ vand q15, q15, q12
+ vand q10, q10, q11
+ vand q3, q3, q4
+
+ vabd.u8 q2, q5, q8 ; abs(p1 - q1)
+ vqadd.u8 q9, q9, q9 ; abs(p0 - q0) * 2
+ vshr.u8 q2, q2, #1 ; abs(p1 - q1) / 2
+ vqadd.u8 q9, q9, q2 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+ vcge.u8 q9, q0, q9 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
+
+ vld1.u8 {q0}, [r12]!
+
+ vand q15, q15, q10
+
+ ;vp8_filter() function
+ veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
+ veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
+ veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
+ veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
+;;;;;;;;;;;;;;
+ vld1.u8 {q10}, [r12]!
+
+ ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
+ vsubl.s8 q11, d15, d13
+
+ vand q3, q3, q9
+ vmovl.u8 q4, d20
+
+ vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+ vorr q14, q13, q14 ; q14: vp8_hevmask
+
+ ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0)
+ vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
+ vmul.i16 q11, q11, q4
+
+ vand q1, q1, q14 ; vp8_filter &= hev
+ vand q15, q15, q3 ; q15: vp8_filter_mask
+ ;;
+ ;vld1.u8 {q4}, [r12]! ;no need 7 any more
+
+ ;vqadd.s8 q1, q1, q2
+ vaddw.s8 q2, q2, d2
+ vaddw.s8 q11, q11, d3
+
+ vld1.u8 {q9}, [r12]!
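+ ;(The mask built above is, in scalar terms, roughly: every one of
+ ;abs(p3-p2), abs(p2-p1), abs(p1-p0), abs(q1-q0), abs(q2-q1) and abs(q3-q2)
+ ;is <= limit, and abs(p0-q0)*2 + abs(p1-q1)/2 <= flimit*2 + limit;
+ ;hev is set where abs(p1-p0) or abs(q1-q0) exceeds thresh.)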
+ ;
+ vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d3, q11
+ ;;
+
+ vand q1, q1, q15 ; vp8_filter &= mask
+ ;;
+;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
+; vand q2, q1, q4 ; s = vp8_filter & 7
+; vqadd.s8 q1, q1, q9 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
+ ;;;;
+; vshr.s8 q1, q1, #3 ; vp8_filter >>= 3
+; vceq.i8 q2, q2, q9 ; s = (s==4)*-1
+ ;;
+; ;calculate output
+; vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
+; vqadd.s8 q11, q2, q1 ; u = vp8_signed_char_clamp(s + vp8_filter)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; q10=3
+ vqadd.s8 q2, q1, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+ vqadd.s8 q1, q1, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+ vshr.s8 q2, q2, #3 ; Filter2 >>= 3
+ vshr.s8 q1, q1, #3 ; Filter1 >>= 3
+
+ ;calculate output
+ vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
+ vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vrshr.s8 q1, q1, #1 ;round/shift: vp8_filter += 1; vp8_filter >>= 1
+
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+ ;
+
+ vbic q1, q1, q14 ; vp8_filter &= ~hev
+ ;
+ add r2, r1, r0
+
+ vqadd.s8 q13, q5, q1 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
+ ;vqadd.s8 q11, q6, q11 ; u = vp8_signed_char_clamp(ps0 + u)
+ vqsub.s8 q12, q8, q1 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
+
+ add r3, r2, r1
+
+ veor q5, q13, q0 ; *op1 = u^0x80
+ veor q6, q11, q0 ; *op0 = u^0x80
+ veor q7, q10, q0 ; *oq0 = u^0x80
+ veor q8, q12, q0 ; *oq1 = u^0x80
+
+ add r12, r3, r1
+
+ vst1.u8 {q5}, [r0] ; store op1
+ vst1.u8 {q6}, [r2] ; store op0
+ vst1.u8 {q7}, [r3] ; store oq0
+ vst1.u8 {q8}, [r12] ; store oq1
+
+ bx lr
+ ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
+
+;-----------------
+ AREA hloopfiltery_dat, DATA, READWRITE ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_lfhy_coeff_
+ DCD lfhy_coeff
+lfhy_coeff
+ DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
+ DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
+ DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
+ DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101
+
+ END
diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
new file mode 100644
index 000000000..6d74fab52
--- /dev/null
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -0,0 +1,117 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
+;are equal. So, in the code, only one load is needed
+;for flimit. Same way applies to limit and thresh.
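+;Unlike the normal filter, the simple filter below adjusts only p0 and q0,
+;and its mask reduces to abs(p0-q0)*2 + abs(p1-q1)/2 <= flimit*2 + limit.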
+; r0 unsigned char *s, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; //stack(r5) int count --unused + +|vp8_loop_filter_simple_horizontal_edge_neon| PROC + sub r0, r0, r1, lsl #1 ; move src pointer down by 2 lines + + ldr r12, _lfhy_coeff_ + vld1.u8 {q5}, [r0], r1 ; p1 + vld1.s8 {d2[], d3[]}, [r2] ; flimit + vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13 + vld1.u8 {q6}, [r0], r1 ; p0 + vld1.u8 {q0}, [r12]! ; 0x80 + vld1.u8 {q7}, [r0], r1 ; q0 + vld1.u8 {q10}, [r12]! ; 0x03 + vld1.u8 {q8}, [r0] ; q1 + + ;vp8_filter_mask() function + vabd.u8 q15, q6, q7 ; abs(p0 - q0) + vabd.u8 q14, q5, q8 ; abs(p1 - q1) + vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 + vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + + ;vp8_filter() function + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value + + vadd.u8 q1, q1, q1 ; flimit * 2 + vadd.u8 q1, q1, q13 ; flimit * 2 + limit + vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 + +;;;;;;;;;; + ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q3, d15, d13 + + vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0) + vadd.s16 q11, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q12, q3, q3 + + vld1.u8 {q9}, [r12]! ; 0x04 + + vadd.s16 q2, q2, q11 + vadd.s16 q3, q3, q12 + + vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q3, q3, d9 + + ;vqadd.s8 q4, q4, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d9, q3 +;;;;;;;;;;;;; + + vand q4, q4, q15 ; vp8_filter &= mask + + vqadd.s8 q2, q4, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q4, q4, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vshr.s8 q2, q2, #3 ; Filter2 >>= 3 + vshr.s8 q4, q4, #3 ; Filter1 >>= 3 + + sub r0, r0, r1, lsl #1 + + ;calculate output + vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1) + + add r3, r0, r1 + + veor q6, q11, q0 ; *op0 = u^0x80 + veor q7, q10, q0 ; *oq0 = u^0x80 + + vst1.u8 {q6}, [r0] ; store op0 + vst1.u8 {q7}, [r3] ; store oq0 + + bx lr + ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon| + +;----------------- + AREA hloopfiltery_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 16 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_lfhy_coeff_ + DCD lfhy_coeff +lfhy_coeff + DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 + DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 + DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 + + END diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm new file mode 100644 index 000000000..2bb6222b9 --- /dev/null +++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm @@ -0,0 +1,158 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_loop_filter_simple_vertical_edge_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit +;are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. +; r0 unsigned char *s, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; //stack(r5) int count --unused + +|vp8_loop_filter_simple_vertical_edge_neon| PROC + sub r0, r0, #2 ; move src pointer down by 2 columns + + vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r1 + vld1.s8 {d2[], d3[]}, [r2] ; flimit + vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13 + vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r0], r1 + ldr r12, _vlfy_coeff_ + vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r1 + vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r0], r1 + vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r1 + vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r0], r1 + vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r1 + vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r0], r1 + + vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 + vld1.u8 {q0}, [r12]! ; 0x80 + vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 + vld1.u8 {q11}, [r12]! ; 0x03 + vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 + vld1.u8 {q12}, [r12]! ; 0x04 + vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 + vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 + vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 + vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 + vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1 + + vswp d7, d10 + vswp d12, d9 + ;vswp q4, q5 ; p1:q3, p0:q5, q0:q4, q1:q6 + + ;vp8_filter_mask() function + ;vp8_hevmask() function + sub r0, r0, r1, lsl #4 + vabd.u8 q15, q5, q4 ; abs(p0 - q0) + vabd.u8 q14, q3, q6 ; abs(p1 - q1) + vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 + vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + + veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value + veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value + veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value + veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value + + vadd.u8 q1, q1, q1 ; flimit * 2 + vadd.u8 q1, q1, q13 ; flimit * 2 + limit + vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 + + ;vp8_filter() function +;;;;;;;;;; + ;vqsub.s8 q2, q5, q4 ; ( qs0 - ps0) + vsubl.s8 q2, d8, d10 ; ( qs0 - ps0) + vsubl.s8 q13, d9, d11 + + vqsub.s8 q1, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + ;vmul.i8 q2, q2, q11 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q14, q13, q13 + vadd.s16 q2, q2, q10 + vadd.s16 q13, q13, q14 + + ;vqadd.s8 q1, q1, q2 + vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d3 + + vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d3, q13 + + add r0, r0, #1 + add r2, r0, r1 +;;;;;;;;;;; + + vand q1, q1, q15 ; vp8_filter &= mask + + vqadd.s8 q2, q1, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q1, q1, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vshr.s8 
q2, q2, #3 ; Filter2 >>= 3 + vshr.s8 q1, q1, #3 ; Filter1 >>= 3 + + ;calculate output + vqsub.s8 q10, q4, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1) + vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + + veor q7, q10, q0 ; *oq0 = u^0x80 + veor q6, q11, q0 ; *op0 = u^0x80 + + add r3, r2, r1 + vswp d13, d14 + add r12, r3, r1 + + ;store op1, op0, oq0, oq1 + vst2.8 {d12[0], d13[0]}, [r0] + vst2.8 {d12[1], d13[1]}, [r2] + vst2.8 {d12[2], d13[2]}, [r3] + vst2.8 {d12[3], d13[3]}, [r12], r1 + add r0, r12, r1 + vst2.8 {d12[4], d13[4]}, [r12] + vst2.8 {d12[5], d13[5]}, [r0], r1 + add r2, r0, r1 + vst2.8 {d12[6], d13[6]}, [r0] + vst2.8 {d12[7], d13[7]}, [r2], r1 + add r3, r2, r1 + vst2.8 {d14[0], d15[0]}, [r2] + vst2.8 {d14[1], d15[1]}, [r3], r1 + add r12, r3, r1 + vst2.8 {d14[2], d15[2]}, [r3] + vst2.8 {d14[3], d15[3]}, [r12], r1 + add r0, r12, r1 + vst2.8 {d14[4], d15[4]}, [r12] + vst2.8 {d14[5], d15[5]}, [r0], r1 + add r2, r0, r1 + vst2.8 {d14[6], d15[6]}, [r0] + vst2.8 {d14[7], d15[7]}, [r2] + + bx lr + ENDP ; |vp8_loop_filter_simple_vertical_edge_neon| + +;----------------- + AREA vloopfiltery_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 16 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_vlfy_coeff_ + DCD vlfy_coeff +vlfy_coeff + DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 + DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 + DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 + + END diff --git a/vp8/common/arm/neon/loopfilterverticaledge_uv_neon.asm b/vp8/common/arm/neon/loopfilterverticaledge_uv_neon.asm new file mode 100644 index 000000000..d79cc68a3 --- /dev/null +++ b/vp8/common/arm/neon/loopfilterverticaledge_uv_neon.asm @@ -0,0 +1,231 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_loop_filter_vertical_edge_uv_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit +;are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. 
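+;Note: for a vertical edge the samples across the edge lie along each row, so
+;the code below loads 8-byte rows from both the u and v planes and transposes
+;them (vtrn) so that each of the p3..q3 columns ends up in its own q register;
+;the filter arithmetic is then identical to the horizontal-edge case.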
+; r0 unsigned char *u, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; stack(r5) unsigned char *v + +|vp8_loop_filter_vertical_edge_uv_neon| PROC + sub r0, r0, #4 ; move u pointer down by 4 columns + vld1.s8 {d0[], d1[]}, [r2] ; flimit + + ldr r2, [sp, #4] ; load v ptr + ldr r12, [sp, #0] ; load thresh pointer + + sub r2, r2, #4 ; move v pointer down by 4 columns + + vld1.u8 {d6}, [r0], r1 ;load u data + vld1.u8 {d7}, [r2], r1 ;load v data + vld1.u8 {d8}, [r0], r1 + vld1.u8 {d9}, [r2], r1 + vld1.u8 {d10}, [r0], r1 + vld1.u8 {d11}, [r2], r1 + vld1.u8 {d12}, [r0], r1 + vld1.u8 {d13}, [r2], r1 + vld1.u8 {d14}, [r0], r1 + vld1.u8 {d15}, [r2], r1 + vld1.u8 {d16}, [r0], r1 + vld1.u8 {d17}, [r2], r1 + vld1.u8 {d18}, [r0], r1 + vld1.u8 {d19}, [r2], r1 + vld1.u8 {d20}, [r0], r1 + vld1.u8 {d21}, [r2], r1 + + ;transpose to 8x16 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + vld1.s8 {d2[], d3[]}, [r3] ; limit + vld1.s8 {d4[], d5[]}, [r12] ; thresh + + ldr r12, _vlfuv_coeff_ + ;vp8_filter_mask() function + ;vp8_hevmask() function + vabd.u8 q11, q3, q4 ; abs(p3 - p2) + vabd.u8 q12, q4, q5 ; abs(p2 - p1) + vabd.u8 q13, q5, q6 ; abs(p1 - p0) + vabd.u8 q14, q8, q7 ; abs(q1 - q0) + vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q4, q10, q9 ; abs(q3 - q2) + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + + vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1 + vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1 + vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1 + vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1 + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + + vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1 + vcge.u8 q4, q1, q4 ; (abs(q3 - q2) > limit)*-1 + vadd.u8 q0, q0, q0 ; flimit * 2 + vadd.u8 q0, q0, q1 ; flimit * 2 + limit + + vand q15, q15, q12 + vand q10, q10, q11 + vand q3, q3, q4 + + vabd.u8 q2, q5, q8 ; abs(p1 - q1) + vqadd.u8 q9, q9, q9 ; abs(p0 - q0) * 2 + vshr.u8 q2, q2, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q9, q9, q2 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + vcge.u8 q9, q0, q9 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 + + vld1.u8 {q0}, [r12]! + + vand q15, q15, q10 + + + ;vp8_filter() function + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value +;;;;;;;;;;;;;; + vld1.u8 {q10}, [r12]! + + ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q11, d15, d13 + + vand q3, q3, q9 + vmovl.u8 q4, d20 + + vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + vorr q14, q13, q14 ; q14: vp8_hevmask + + ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0) + vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) + vmul.i16 q11, q11, q4 + + vand q1, q1, q14 ; vp8_filter &= hev + vand q15, q15, q3 ; q15: vp8_filter_mask + ;; + ;vld1.u8 {q4}, [r12]! ;no need 7 any more + + ;vqadd.s8 q1, q1, q2 + vaddw.s8 q2, q2, d2 + vaddw.s8 q11, q11, d3 + + vld1.u8 {q9}, [r12]! 
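+    ;Note: q14 ("hev", built from the two thresh compares above) marks
+    ;high edge-variance positions; roughly,
+    ;    hev = (abs(p1 - p0) > thresh) | (abs(q1 - q0) > thresh).
+    ;The later p1/q1 adjustment is suppressed where hev is set (vbic below).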
+ ; + vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d3, q11 + ;; + + vand q1, q1, q15 ; vp8_filter &= mask + ;; +;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7 +; vand q2, q1, q4 ; s = vp8_filter & 7 +; vqadd.s8 q1, q1, q9 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) + ;;;; +; vshr.s8 q1, q1, #3 ; vp8_filter >>= 3 +; vceq.i8 q2, q2, q9 ; s = (s==4)*-1 + ;; +; ;calculate output +; vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) +; vqadd.s8 q11, q2, q1 ; u = vp8_signed_char_clamp(s + vp8_filter) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; q10=3 + vqadd.s8 q2, q1, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q1, q1, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vshr.s8 q2, q2, #3 ; Filter2 >>= 3 + vshr.s8 q1, q1, #3 ; Filter1 >>= 3 + ;calculate output + vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vrshr.s8 q1, q1, #1 ;round/shift: vp8_filter += 1; vp8_filter >>= 1 + + sub r0, r0, r1, lsl #3 + add r0, r0, #2 + + vbic q1, q1, q14 ; vp8_filter &= ~hev + + sub r2, r2, r1, lsl #3 + add r2, r2, #2 + + vqadd.s8 q13, q5, q1 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) + ;vqadd.s8 q11, q6, q11 ; u = vp8_signed_char_clamp(ps0 + u) + vqsub.s8 q12, q8, q1 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) + + veor q7, q10, q0 ; *oq0 = u^0x80 + veor q5, q13, q0 ; *op1 = u^0x80 + veor q6, q11, q0 ; *op0 = u^0x80 + veor q8, q12, q0 ; *oq1 = u^0x80 + + vswp d12, d11 + vswp d16, d13 + vswp d14, d12 + vswp d16, d15 + + ;store op1, op0, oq0, oq1 + vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 + vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1 + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 + vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1 + vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 + vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1 + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 + vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1 + vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 + vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1 + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 + vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1 + vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 + vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1 + vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1 + vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2], r1 + + bx lr + ENDP ; |vp8_loop_filter_vertical_edge_uv_neon| + +;----------------- + AREA vloopfilteruv_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 16 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_vlfuv_coeff_ + DCD vlfuv_coeff +vlfuv_coeff + DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 + DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 + DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 + DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101 + + END diff --git a/vp8/common/arm/neon/loopfilterverticaledge_y_neon.asm b/vp8/common/arm/neon/loopfilterverticaledge_y_neon.asm new file mode 100644 index 000000000..3a230a953 --- /dev/null +++ b/vp8/common/arm/neon/loopfilterverticaledge_y_neon.asm @@ -0,0 +1,235 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_loop_filter_vertical_edge_y_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit +;are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. +; r0 unsigned char *s, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; //stack(r5) int count --unused + +|vp8_loop_filter_vertical_edge_y_neon| PROC + sub r0, r0, #4 ; move src pointer down by 4 columns + ldr r12, [sp, #0] ; load thresh pointer + + vld1.u8 {d6}, [r0], r1 ; load first 8-line src data + vld1.s8 {d0[], d1[]}, [r2] ; flimit + vld1.u8 {d8}, [r0], r1 + vld1.s8 {d2[], d3[]}, [r3] ; limit + vld1.u8 {d10}, [r0], r1 + vld1.s8 {d4[], d5[]}, [r12] ; thresh + vld1.u8 {d12}, [r0], r1 + ldr r12, _vlfy_coeff_ + vld1.u8 {d14}, [r0], r1 + vld1.u8 {d16}, [r0], r1 + vld1.u8 {d18}, [r0], r1 + vld1.u8 {d20}, [r0], r1 + + vld1.u8 {d7}, [r0], r1 ; load second 8-line src data + vld1.u8 {d9}, [r0], r1 + vld1.u8 {d11}, [r0], r1 + vld1.u8 {d13}, [r0], r1 + vld1.u8 {d15}, [r0], r1 + vld1.u8 {d17}, [r0], r1 + vld1.u8 {d19}, [r0], r1 + vld1.u8 {d21}, [r0], r1 + + ;transpose to 8x16 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + ;vp8_filter_mask() function + ;vp8_hevmask() function + vabd.u8 q11, q3, q4 ; abs(p3 - p2) + vabd.u8 q12, q4, q5 ; abs(p2 - p1) + vabd.u8 q13, q5, q6 ; abs(p1 - p0) + vabd.u8 q14, q8, q7 ; abs(q1 - q0) + vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q4, q10, q9 ; abs(q3 - q2) + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + + vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1 + vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1 + vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1 + vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1 + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + + vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1 + vcge.u8 q4, q1, q4 ; (abs(q3 - q2) > limit)*-1 + vadd.u8 q0, q0, q0 ; flimit * 2 + vadd.u8 q0, q0, q1 ; flimit * 2 + limit + + vand q15, q15, q12 + vand q10, q10, q11 + vand q3, q3, q4 + + vabd.u8 q2, q5, q8 ; abs(p1 - q1) + vqadd.u8 q9, q9, q9 ; abs(p0 - q0) * 2 + vshr.u8 q2, q2, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q9, q9, q2 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + vcge.u8 q9, q0, q9 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 + + vld1.u8 {q0}, [r12]! + + vand q15, q15, q10 + + + ;vp8_filter() function + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value +;;;;;;;;;;;;;; + vld1.u8 {q10}, [r12]! 
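+    ;Note: the veor with 0x80 (q0, loaded above) converts the unsigned pixels
+    ;to signed values, i.e. s8 = (u8 ^ 0x80) = u8 - 128, so the filter can use
+    ;signed saturating arithmetic; the same veor converts the results back
+    ;before they are stored.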
+ + ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q11, d15, d13 + + vand q3, q3, q9 + vmovl.u8 q4, d20 + + vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + vorr q14, q13, q14 ; q14: vp8_hevmask + + ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0) + vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) + vmul.i16 q11, q11, q4 + + vand q1, q1, q14 ; vp8_filter &= hev + vand q15, q15, q3 ; q15: vp8_filter_mask + ;; + ;vld1.u8 {q4}, [r12]! ;no need 7 any more + + ;vqadd.s8 q1, q1, q2 + vaddw.s8 q2, q2, d2 + vaddw.s8 q11, q11, d3 + + vld1.u8 {q9}, [r12]! + ; + vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d3, q11 + ;; + + vand q1, q1, q15 ; vp8_filter &= mask + ;; +;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7 +; vand q2, q1, q4 ; s = vp8_filter & 7 +; vqadd.s8 q1, q1, q9 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) + ;;;; +; vshr.s8 q1, q1, #3 ; vp8_filter >>= 3 +; vceq.i8 q2, q2, q9 ; s = (s==4)*-1 + ;; +; ;calculate output +; vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) +; vqadd.s8 q11, q2, q1 ; u = vp8_signed_char_clamp(s + vp8_filter) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; q10=3 + vqadd.s8 q2, q1, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q1, q1, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vshr.s8 q2, q2, #3 ; Filter2 >>= 3 + vshr.s8 q1, q1, #3 ; Filter1 >>= 3 + ;calculate output + vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vrshr.s8 q1, q1, #1 ;round/shift: vp8_filter += 1; vp8_filter >>= 1 + + sub r0, r0, r1, lsl #4 + add r0, r0, #2 + ; + + vbic q1, q1, q14 ; vp8_filter &= ~hev + add r2, r0, r1 + ; + + vqadd.s8 q13, q5, q1 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) + ;vqadd.s8 q11, q6, q11 ; u = vp8_signed_char_clamp(ps0 + u) + vqsub.s8 q12, q8, q1 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) + + veor q7, q10, q0 ; *oq0 = u^0x80 + veor q5, q13, q0 ; *op1 = u^0x80 + veor q6, q11, q0 ; *op0 = u^0x80 + veor q8, q12, q0 ; *oq1 = u^0x80 + add r3, r2, r1 + ; + vswp d12, d11 + vswp d16, d13 + add r12, r3, r1 + vswp d14, d12 + vswp d16, d15 + + ;store op1, op0, oq0, oq1 + vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0] + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r2] + vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r3] + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1 + add r0, r12, r1 + vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r12] + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 + add r2, r0, r1 + vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0] + vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r2], r1 + add r3, r2, r1 + vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2] + vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r3], r1 + add r12, r3, r1 + vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r3] + vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1 + add r0, r12, r1 + vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r12] + vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r0], r1 + add r2, r0, r1 + vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0] + vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2] + + bx lr + ENDP ; |vp8_loop_filter_vertical_edge_y_neon| + +;----------------- + AREA vloopfiltery_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 16 data. +;One word each is reserved. Label filter_coeff can be used to access the data. 
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_vlfy_coeff_ + DCD vlfy_coeff +vlfy_coeff + DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 + DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 + DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 + DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101 + + END diff --git a/vp8/common/arm/neon/mbloopfilterhorizontaledge_uv_neon.asm b/vp8/common/arm/neon/mbloopfilterhorizontaledge_uv_neon.asm new file mode 100644 index 000000000..86eddaa2e --- /dev/null +++ b/vp8/common/arm/neon/mbloopfilterhorizontaledge_uv_neon.asm @@ -0,0 +1,257 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit +;are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. +; r0 unsigned char *u, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; stack(r5) unsigned char *v +|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC + sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines + vld1.s8 {d2[], d3[]}, [r3] ; limit + ldr r3, [sp, #4] ; load v ptr + ldr r12, [sp, #0] ; load thresh pointer + sub r3, r3, r1, lsl #2 ; move v pointer down by 4 lines + + vld1.u8 {d6}, [r0], r1 ; p3 + vld1.u8 {d7}, [r3], r1 ; p3 + vld1.u8 {d8}, [r0], r1 ; p2 + vld1.u8 {d9}, [r3], r1 ; p2 + vld1.u8 {d10}, [r0], r1 ; p1 + vld1.u8 {d11}, [r3], r1 ; p1 + vld1.u8 {d12}, [r0], r1 ; p0 + vld1.u8 {d13}, [r3], r1 ; p0 + vld1.u8 {d14}, [r0], r1 ; q0 + vld1.u8 {d15}, [r3], r1 ; q0 + vld1.u8 {d16}, [r0], r1 ; q1 + vld1.u8 {d17}, [r3], r1 ; q1 + vld1.u8 {d18}, [r0], r1 ; q2 + vld1.u8 {d19}, [r3], r1 ; q2 + vld1.u8 {d20}, [r0], r1 ; q3 + vld1.u8 {d21}, [r3], r1 ; q3 + + vld1.s8 {d4[], d5[]}, [r12] ; thresh + + ldr r12, _mbhlfuv_coeff_ + + ;vp8_filter_mask() function + ;vp8_hevmask() function + vabd.u8 q11, q3, q4 ; abs(p3 - p2) + vabd.u8 q12, q4, q5 ; abs(p2 - p1) + vabd.u8 q13, q5, q6 ; abs(p1 - p0) + vabd.u8 q14, q8, q7 ; abs(q1 - q0) + vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q0, q10, q9 ; abs(q3 - q2) + + vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1 + vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1 + vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1 + vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1 + vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1 + vcge.u8 q0, q1, q0 ; (abs(q3 - q2) > limit)*-1 + + vand q15, q15, q12 + + vabd.u8 q12, q6, q7 ; abs(p0 - q0) + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + + vld1.s8 {d4[], d5[]}, [r2] ; flimit + + vand q10, q10, q11 + vand q3, q3, q0 + + vld1.u8 {q0}, [r12]! 
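+    ;Note: the next few instructions build the scalar edge test; in rough
+    ;C-like pseudocode (names illustrative):
+    ;    mask &= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit);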
+ + vadd.u8 q2, q2, q2 ; flimit * 2 + vadd.u8 q2, q2, q1 ; flimit * 2 + limit + + vabd.u8 q1, q5, q8 ; abs(p1 - q1) + vqadd.u8 q12, q12, q12 ; abs(p0 - q0) * 2 + vshr.u8 q1, q1, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q12, q12, q1 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + vcge.u8 q12, q2, q12 ; (abs(p0 - q0)*2 + abs(p1 - q1)/2 > flimit*2 + limit)*-1 + + vand q15, q15, q10 + + ;vp8_filter() function + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value + veor q4, q4, q0 ; ps2: p2 offset to convert to a signed value + veor q9, q9, q0 ; qs2: q2 offset to convert to a signed value +;;;;;;;;;;;;; + vorr q14, q13, q14 ; q14: vp8_hevmask + + ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q13, d15, d13 + + vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + ;vadd.s8 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q11, q13, q13 + + vand q3, q3, q12 + + ;vadd.s8 q2, q2, q10 + vadd.s16 q2, q2, q10 + vadd.s16 q13, q13, q11 + + vld1.u8 {q12}, [r12]! ;#3 + + ;vqadd.s8 q1, q1, q2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d3 + + vand q15, q15, q3 ; q15: vp8_filter_mask + vld1.u8 {q11}, [r12]! ;#4 + + vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d3, q13 + +;;;;;;;;;;;;;; + vand q1, q1, q15 ; vp8_filter &= mask + + vld1.u8 {q15}, [r12]! ;#63 + ; + vand q13, q1, q14 ; Filter2: q13; Filter2 &= hev + + vld1.u8 {d7}, [r12]! ;#9 + ; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7 +; vand q2, q13, q12 ; s = Filter2 & 7 + +; vqadd.s8 q13, q13, q11 ; Filter2 = vp8_signed_char_clamp(Filter2+4) +; vld1.u8 {d6}, [r12]! ;#18 + +; sub r0, r0, r1, lsl #3 +; sub r3, r3, r1, lsl #3 + +; vshr.s8 q13, q13, #3 ; Filter2 >>= 3 +; vceq.i8 q2, q2, q11 ; s = (s==4)*-1 + +; add r0, r0, r1 +; add r3, r3, r1 + +; vqsub.s8 q7, q7, q13 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) +; vqadd.s8 q11, q2, q13 ; u = vp8_signed_char_clamp(s + Filter2) + +; vld1.u8 {d5}, [r12]! ;#27 +; vmov q10, q15 +; vmov q12, q15 + +; vqadd.s8 q6, q6, q11 ; ps0 = vp8_signed_char_clamp(ps0 + u) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vqadd.s8 q2, q13, q11 ; Filter1 = vp8_signed_char_clamp(Filter2+4) + vqadd.s8 q13, q13, q12 ; Filter2 = vp8_signed_char_clamp(Filter2+3) + + vld1.u8 {d6}, [r12]! ;#18 + + sub r0, r0, r1, lsl #3 + sub r3, r3, r1, lsl #3 + + vshr.s8 q2, q2, #3 ; Filter1 >>= 3 + vshr.s8 q13, q13, #3 ; Filter2 >>= 3 + + vmov q10, q15 + vmov q12, q15 + + vqsub.s8 q7, q7, q2 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) + + vld1.u8 {d5}, [r12]! 
;#27
+
+    add         r0, r0, r1
+    add         r3, r3, r1
+
+    vqadd.s8    q6, q6, q13             ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    vbic        q1, q1, q14             ; Filter2: q1; vp8_filter &= ~hev; Filter2 = vp8_filter
+
+    ; roughly 1/7th difference across boundary
+    ; roughly 2/7th difference across boundary
+    ; roughly 3/7th difference across boundary
+    vmov        q11, q15
+    vmov        q13, q15
+    vmov        q14, q15
+
+    vmlal.s8    q10, d2, d7             ; Filter2 * 9
+    vmlal.s8    q11, d3, d7
+    vmlal.s8    q12, d2, d6             ; Filter2 * 18
+    vmlal.s8    q13, d3, d6
+    vmlal.s8    q14, d2, d5             ; Filter2 * 27
+    vmlal.s8    q15, d3, d5
+    vqshrn.s16  d20, q10, #7            ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+    vqshrn.s16  d21, q11, #7
+    vqshrn.s16  d24, q12, #7            ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+    vqshrn.s16  d25, q13, #7
+    vqshrn.s16  d28, q14, #7            ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+    vqshrn.s16  d29, q15, #7
+
+    vqsub.s8    q11, q9, q10            ; s = vp8_signed_char_clamp(qs2 - u)
+    vqadd.s8    q10, q4, q10            ; s = vp8_signed_char_clamp(ps2 + u)
+    vqsub.s8    q13, q8, q12            ; s = vp8_signed_char_clamp(qs1 - u)
+    vqadd.s8    q12, q5, q12            ; s = vp8_signed_char_clamp(ps1 + u)
+    vqsub.s8    q15, q7, q14            ; s = vp8_signed_char_clamp(qs0 - u)
+    vqadd.s8    q14, q6, q14            ; s = vp8_signed_char_clamp(ps0 + u)
+    veor        q9, q11, q0             ; *oq2 = s^0x80
+    veor        q4, q10, q0             ; *op2 = s^0x80
+    veor        q8, q13, q0             ; *oq1 = s^0x80
+    veor        q5, q12, q0             ; *op1 = s^0x80
+    veor        q7, q15, q0             ; *oq0 = s^0x80
+    veor        q6, q14, q0             ; *op0 = s^0x80
+
+    vst1.u8     {d8}, [r0], r1          ; store u op2
+    vst1.u8     {d9}, [r3], r1          ; store v op2
+    vst1.u8     {d10}, [r0], r1         ; store u op1
+    vst1.u8     {d11}, [r3], r1         ; store v op1
+    vst1.u8     {d12}, [r0], r1         ; store u op0
+    vst1.u8     {d13}, [r3], r1         ; store v op0
+    vst1.u8     {d14}, [r0], r1         ; store u oq0
+    vst1.u8     {d15}, [r3], r1         ; store v oq0
+    vst1.u8     {d16}, [r0], r1         ; store u oq1
+    vst1.u8     {d17}, [r3], r1         ; store v oq1
+    vst1.u8     {d18}, [r0], r1         ; store u oq2
+    vst1.u8     {d19}, [r3], r1         ; store v oq2
+
+    bx          lr
+    ENDP        ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
+
+;-----------------
+    AREA    mbhloopfilteruv_dat, DATA, READWRITE    ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_mbhlfuv_coeff_
+    DCD     mbhlfuv_coeff
+mbhlfuv_coeff
+    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080
+    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303
+    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404
+    DCD     0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
+    DCD     0x09090909, 0x09090909, 0x12121212, 0x12121212
+    DCD     0x1b1b1b1b, 0x1b1b1b1b
+
+    END
diff --git a/vp8/common/arm/neon/mbloopfilterhorizontaledge_y_neon.asm b/vp8/common/arm/neon/mbloopfilterhorizontaledge_y_neon.asm
new file mode 100644
index 000000000..2ab0fc240
--- /dev/null
+++ b/vp8/common/arm/neon/mbloopfilterhorizontaledge_y_neon.asm
@@ -0,0 +1,236 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_mbloop_filter_horizontal_edge_y_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
+;are equal.
So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. +; r0 unsigned char *s, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; //stack(r5) int count --unused +|vp8_mbloop_filter_horizontal_edge_y_neon| PROC + sub r0, r0, r1, lsl #2 ; move src pointer down by 4 lines + ldr r12, [sp, #0] ; load thresh pointer + + vld1.u8 {q3}, [r0], r1 ; p3 + vld1.s8 {d2[], d3[]}, [r3] ; limit + vld1.u8 {q4}, [r0], r1 ; p2 + vld1.s8 {d4[], d5[]}, [r12] ; thresh + vld1.u8 {q5}, [r0], r1 ; p1 + ldr r12, _mbhlfy_coeff_ + vld1.u8 {q6}, [r0], r1 ; p0 + + ;vp8_filter_mask() function + ;vp8_hevmask() function + vabd.u8 q11, q3, q4 ; abs(p3 - p2) + vld1.u8 {q7}, [r0], r1 ; q0 + vabd.u8 q12, q4, q5 ; abs(p2 - p1) + vld1.u8 {q8}, [r0], r1 ; q1 + vabd.u8 q13, q5, q6 ; abs(p1 - p0) + vld1.u8 {q9}, [r0], r1 ; q2 + vabd.u8 q14, q8, q7 ; abs(q1 - q0) + vld1.u8 {q10}, [r0], r1 ; q3 + vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q0, q10, q9 ; abs(q3 - q2) + + vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1 + vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1 + vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1 + vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1 + vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1 + vcge.u8 q0, q1, q0 ; (abs(q3 - q2) > limit)*-1 + + vand q15, q15, q12 + + vabd.u8 q12, q6, q7 ; abs(p0 - q0) + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + + vld1.s8 {d4[], d5[]}, [r2] ; flimit + + vand q10, q10, q11 + vand q3, q3, q0 + + vld1.u8 {q0}, [r12]! + + vadd.u8 q2, q2, q2 ; flimit * 2 + vadd.u8 q2, q2, q1 ; flimit * 2 + limit + + vabd.u8 q1, q5, q8 ; abs(p1 - q1) + vqadd.u8 q12, q12, q12 ; abs(p0 - q0) * 2 + vshr.u8 q1, q1, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q12, q12, q1 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + vcge.u8 q12, q2, q12 ; (abs(p0 - q0)*2 + abs(p1 - q1)/2 > flimit*2 + limit)*-1 + + vand q15, q15, q10 + + ;vp8_filter() function + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value + veor q4, q4, q0 ; ps2: p2 offset to convert to a signed value + veor q9, q9, q0 ; qs2: q2 offset to convert to a signed value +;;;;;;;;;;;;; + vorr q14, q13, q14 ; q14: vp8_hevmask + + ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q13, d15, d13 + + vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + ;vadd.s8 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q11, q13, q13 + + vand q3, q3, q12 + + ;vadd.s8 q2, q2, q10 + vadd.s16 q2, q2, q10 + vadd.s16 q13, q13, q11 + + vld1.u8 {q12}, [r12]! ;#3 + + ;vqadd.s8 q1, q1, q2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d3 + + vand q15, q15, q3 ; q15: vp8_filter_mask + vld1.u8 {q11}, [r12]! ;#4 + + vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d3, q13 + +;;;;;;;;;;;;;; + vand q1, q1, q15 ; vp8_filter &= mask + + vld1.u8 {q15}, [r12]! ;#63 + ; + vand q13, q1, q14 ; Filter2: q13; Filter2 &= hev + + vld1.u8 {d7}, [r12]! 
;#9
+    sub         r0, r0, r1, lsl #3
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
+;   vand        q2, q13, q12            ; s = Filter2 & 7
+
+;   vqadd.s8    q13, q13, q11           ; Filter2 = vp8_signed_char_clamp(Filter2+4)
+;   vld1.u8     {d6}, [r12]!            ;#18
+
+;   add         r0, r0, r1
+;   add         r2, r0, r1
+
+;   vshr.s8     q13, q13, #3            ; Filter2 >>= 3
+;   vceq.i8     q2, q2, q11             ; s = (s==4)*-1
+
+;   add         r3, r2, r1
+
+;   vqsub.s8    q7, q7, q13             ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
+;   vqadd.s8    q11, q2, q13            ; u = vp8_signed_char_clamp(s + Filter2)
+
+;   vld1.u8     {d5}, [r12]!            ;#27
+;   vmov        q10, q15
+;   vmov        q12, q15
+
+;   vqadd.s8    q6, q6, q11             ; ps0 = vp8_signed_char_clamp(ps0 + u)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    vqadd.s8    q2, q13, q11            ; Filter1 = vp8_signed_char_clamp(Filter2+4)
+    vqadd.s8    q13, q13, q12           ; Filter2 = vp8_signed_char_clamp(Filter2+3)
+
+    vld1.u8     {d6}, [r12]!            ;#18
+    add         r0, r0, r1
+    add         r2, r0, r1
+
+    vshr.s8     q2, q2, #3              ; Filter1 >>= 3
+    vshr.s8     q13, q13, #3            ; Filter2 >>= 3
+
+    vmov        q10, q15
+    vmov        q12, q15
+
+    vqsub.s8    q7, q7, q2              ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
+
+    vld1.u8     {d5}, [r12]!            ;#27
+    add         r3, r2, r1
+
+    vqadd.s8    q6, q6, q13             ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    vbic        q1, q1, q14             ; Filter2: q1; vp8_filter &= ~hev; Filter2 = vp8_filter
+
+    ; roughly 1/7th difference across boundary
+    ; roughly 2/7th difference across boundary
+    ; roughly 3/7th difference across boundary
+    vmov        q11, q15
+    vmov        q13, q15
+    vmov        q14, q15
+
+    vmlal.s8    q10, d2, d7             ; Filter2 * 9
+    vmlal.s8    q11, d3, d7
+    vmlal.s8    q12, d2, d6             ; Filter2 * 18
+    vmlal.s8    q13, d3, d6
+    vmlal.s8    q14, d2, d5             ; Filter2 * 27
+    vmlal.s8    q15, d3, d5
+    vqshrn.s16  d20, q10, #7            ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+    vqshrn.s16  d21, q11, #7
+    vqshrn.s16  d24, q12, #7            ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+    vqshrn.s16  d25, q13, #7
+    vqshrn.s16  d28, q14, #7            ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+    vqshrn.s16  d29, q15, #7
+
+    vqsub.s8    q11, q9, q10            ; s = vp8_signed_char_clamp(qs2 - u)
+    vqadd.s8    q10, q4, q10            ; s = vp8_signed_char_clamp(ps2 + u)
+    vqsub.s8    q13, q8, q12            ; s = vp8_signed_char_clamp(qs1 - u)
+    vqadd.s8    q12, q5, q12            ; s = vp8_signed_char_clamp(ps1 + u)
+    vqsub.s8    q15, q7, q14            ; s = vp8_signed_char_clamp(qs0 - u)
+    vqadd.s8    q14, q6, q14            ; s = vp8_signed_char_clamp(ps0 + u)
+    veor        q9, q11, q0             ; *oq2 = s^0x80
+    veor        q4, q10, q0             ; *op2 = s^0x80
+    veor        q5, q12, q0             ; *op1 = s^0x80
+    veor        q6, q14, q0             ; *op0 = s^0x80
+    veor        q8, q13, q0             ; *oq1 = s^0x80
+    veor        q7, q15, q0             ; *oq0 = s^0x80
+
+    vst1.u8     {q4}, [r0]              ; store op2
+    vst1.u8     {q5}, [r2]              ; store op1
+    vst1.u8     {q6}, [r3], r1          ; store op0
+    add         r12, r3, r1
+    vst1.u8     {q7}, [r3]              ; store oq0
+    vst1.u8     {q8}, [r12], r1         ; store oq1
+    vst1.u8     {q9}, [r12]             ; store oq2
+
+    bx          lr
+    ENDP        ; |vp8_mbloop_filter_horizontal_edge_y_neon|
+
+;-----------------
+    AREA    mbhloopfiltery_dat, DATA, READWRITE     ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_mbhlfy_coeff_ + DCD mbhlfy_coeff +mbhlfy_coeff + DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 + DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 + DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 + DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f + DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212 + DCD 0x1b1b1b1b, 0x1b1b1b1b + + END diff --git a/vp8/common/arm/neon/mbloopfilterverticaledge_uv_neon.asm b/vp8/common/arm/neon/mbloopfilterverticaledge_uv_neon.asm new file mode 100644 index 000000000..ad5afba34 --- /dev/null +++ b/vp8/common/arm/neon/mbloopfilterverticaledge_uv_neon.asm @@ -0,0 +1,296 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit +;are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. +; r0 unsigned char *u, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; stack(r5) unsigned char *v +|vp8_mbloop_filter_vertical_edge_uv_neon| PROC + sub r0, r0, #4 ; move src pointer down by 4 columns + vld1.s8 {d2[], d3[]}, [r3] ; limit + ldr r3, [sp, #4] ; load v ptr + ldr r12, [sp, #0] ; load thresh pointer + + sub r3, r3, #4 ; move v pointer down by 4 columns + + vld1.u8 {d6}, [r0], r1 ;load u data + vld1.u8 {d7}, [r3], r1 ;load v data + vld1.u8 {d8}, [r0], r1 + vld1.u8 {d9}, [r3], r1 + vld1.u8 {d10}, [r0], r1 + vld1.u8 {d11}, [r3], r1 + vld1.u8 {d12}, [r0], r1 + vld1.u8 {d13}, [r3], r1 + vld1.u8 {d14}, [r0], r1 + vld1.u8 {d15}, [r3], r1 + vld1.u8 {d16}, [r0], r1 + vld1.u8 {d17}, [r3], r1 + vld1.u8 {d18}, [r0], r1 + vld1.u8 {d19}, [r3], r1 + vld1.u8 {d20}, [r0], r1 + vld1.u8 {d21}, [r3], r1 + + ;transpose to 8x16 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + sub sp, sp, #32 + vld1.s8 {d4[], d5[]}, [r12] ; thresh + vst1.u8 {q3}, [sp]! + ldr r12, _mbvlfuv_coeff_ + vst1.u8 {q10}, [sp]! + + ;vp8_filter_mask() function + ;vp8_hevmask() function + vabd.u8 q11, q3, q4 ; abs(p3 - p2) + vabd.u8 q12, q4, q5 ; abs(p2 - p1) + vabd.u8 q13, q5, q6 ; abs(p1 - p0) + vabd.u8 q14, q8, q7 ; abs(q1 - q0) + vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q0, q10, q9 ; abs(q3 - q2) + + vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1 + vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1 + vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1 + vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1 + vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1 + vcge.u8 q0, q1, q0 ; (abs(q3 - q2) > limit)*-1 + + vand q15, q15, q12 + + vabd.u8 q12, q6, q7 ; abs(p0 - q0) + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + + vld1.s8 {d4[], d5[]}, [r2] ; flimit + + vand q10, q10, q11 + vand q3, q3, q0 + + vld1.u8 {q0}, [r12]! 
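+    ;Note: q3 and q10 (the p3 and q3 columns) were saved to the stack above
+    ;because the mask computation reuses those registers; they are reloaded
+    ;after filtering so the full 16x8 block can be transposed back for the
+    ;stores.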
+ + vadd.u8 q2, q2, q2 ; flimit * 2 + vadd.u8 q2, q2, q1 ; flimit * 2 + limit + + vabd.u8 q1, q5, q8 ; abs(p1 - q1) + vqadd.u8 q12, q12, q12 ; abs(p0 - q0) * 2 + vshr.u8 q1, q1, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q12, q12, q1 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + vcge.u8 q12, q2, q12 ; (abs(p0 - q0)*2 + abs(p1 - q1)/2 > flimit*2 + limit)*-1 + + vand q15, q15, q10 + + ;vp8_filter() function + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value + veor q4, q4, q0 ; ps2: p2 offset to convert to a signed value + veor q9, q9, q0 ; qs2: q2 offset to convert to a signed value +;;;;;;;;;;;;; + vorr q14, q13, q14 ; q14: vp8_hevmask + + ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q13, d15, d13 + + vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + ;vadd.s8 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q11, q13, q13 + + vand q3, q3, q12 + + ;vadd.s8 q2, q2, q10 + vadd.s16 q2, q2, q10 + vadd.s16 q13, q13, q11 + + vld1.u8 {q12}, [r12]! ;#3 + + ;vqadd.s8 q1, q1, q2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d3 + + vand q15, q15, q3 ; q15: vp8_filter_mask + vld1.u8 {q11}, [r12]! ;#4 + + vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d3, q13 + +;;;;;;;;;;;;;; + vand q1, q1, q15 ; vp8_filter &= mask + + vld1.u8 {q15}, [r12]! ;#63 + ; + vand q13, q1, q14 ; Filter2: q13; Filter2 &= hev + + vld1.u8 {d7}, [r12]! ;#9 + ; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7 +; vand q2, q13, q12 ; s = Filter2 & 7 + +; vqadd.s8 q13, q13, q11 ; Filter2 = vp8_signed_char_clamp(Filter2+4) +; vld1.u8 {d6}, [r12]! ;#18 + +; sub r0, r0, r1, lsl #3 +; sub r3, r3, r1, lsl #3 +; sub sp, sp, #32 + +; vshr.s8 q13, q13, #3 ; Filter2 >>= 3 +; vceq.i8 q2, q2, q11 ; s = (s==4)*-1 + +; vqsub.s8 q7, q7, q13 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) +; vqadd.s8 q11, q2, q13 ; u = vp8_signed_char_clamp(s + Filter2) + +; vld1.u8 {d5}, [r12]! ;#27 +; vmov q10, q15 +; vmov q12, q15 + +; vqadd.s8 q6, q6, q11 ; ps0 = vp8_signed_char_clamp(ps0 + u) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vqadd.s8 q2, q13, q11 ; Filter1 = vp8_signed_char_clamp(Filter2+4) + vqadd.s8 q13, q13, q12 ; Filter2 = vp8_signed_char_clamp(Filter2+3) + + vld1.u8 {d6}, [r12]! ;#18 + + sub r0, r0, r1, lsl #3 + sub r3, r3, r1, lsl #3 + + vshr.s8 q2, q2, #3 ; Filter1 >>= 3 + vshr.s8 q13, q13, #3 ; Filter2 >>= 3 + + vmov q10, q15 + vmov q12, q15 + + vqsub.s8 q7, q7, q2 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) + + vld1.u8 {d5}, [r12]! 
;#27
+
+    sub         sp, sp, #32
+
+    vqadd.s8    q6, q6, q13             ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    vbic        q1, q1, q14             ; Filter2: q1; vp8_filter &= ~hev; Filter2 = vp8_filter
+
+    ; roughly 1/7th difference across boundary
+    ; roughly 2/7th difference across boundary
+    ; roughly 3/7th difference across boundary
+    vmov        q11, q15
+    vmov        q13, q15
+    vmov        q14, q15
+
+    vmlal.s8    q10, d2, d7             ; Filter2 * 9
+    vmlal.s8    q11, d3, d7
+    vmlal.s8    q12, d2, d6             ; Filter2 * 18
+    vmlal.s8    q13, d3, d6
+    vmlal.s8    q14, d2, d5             ; Filter2 * 27
+    vmlal.s8    q15, d3, d5
+    vqshrn.s16  d20, q10, #7            ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+    vqshrn.s16  d21, q11, #7
+    vqshrn.s16  d24, q12, #7            ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+    vqshrn.s16  d25, q13, #7
+    vqshrn.s16  d28, q14, #7            ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+    vqshrn.s16  d29, q15, #7
+
+    vqsub.s8    q11, q9, q10            ; s = vp8_signed_char_clamp(qs2 - u)
+    vqadd.s8    q10, q4, q10            ; s = vp8_signed_char_clamp(ps2 + u)
+    vqsub.s8    q13, q8, q12            ; s = vp8_signed_char_clamp(qs1 - u)
+    vqadd.s8    q12, q5, q12            ; s = vp8_signed_char_clamp(ps1 + u)
+    vqsub.s8    q15, q7, q14            ; s = vp8_signed_char_clamp(qs0 - u)
+    vqadd.s8    q14, q6, q14            ; s = vp8_signed_char_clamp(ps0 + u)
+    veor        q9, q11, q0             ; *oq2 = s^0x80
+    veor        q4, q10, q0             ; *op2 = s^0x80
+    veor        q8, q13, q0             ; *oq1 = s^0x80
+    veor        q5, q12, q0             ; *op1 = s^0x80
+    veor        q7, q15, q0             ; *oq0 = s^0x80
+    vld1.u8     {q3}, [sp]!
+    veor        q6, q14, q0             ; *op0 = s^0x80
+    vld1.u8     {q10}, [sp]!
+
+    ;transpose to 16x8 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    ;store op2, op1, op0, oq0, oq1, oq2
+    vst1.8      {d6}, [r0], r1
+    vst1.8      {d7}, [r3], r1
+    vst1.8      {d8}, [r0], r1
+    vst1.8      {d9}, [r3], r1
+    vst1.8      {d10}, [r0], r1
+    vst1.8      {d11}, [r3], r1
+    vst1.8      {d12}, [r0], r1
+    vst1.8      {d13}, [r3], r1
+    vst1.8      {d14}, [r0], r1
+    vst1.8      {d15}, [r3], r1
+    vst1.8      {d16}, [r0], r1
+    vst1.8      {d17}, [r3], r1
+    vst1.8      {d18}, [r0], r1
+    vst1.8      {d19}, [r3], r1
+    vst1.8      {d20}, [r0], r1
+    vst1.8      {d21}, [r3], r1
+
+    bx          lr
+    ENDP        ; |vp8_mbloop_filter_vertical_edge_uv_neon|
+
+;-----------------
+    AREA    mbvloopfilteruv_dat, DATA, READWRITE    ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_mbvlfuv_coeff_
+    DCD     mbvlfuv_coeff
+mbvlfuv_coeff
+    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080
+    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303
+    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404
+    DCD     0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
+    DCD     0x09090909, 0x09090909, 0x12121212, 0x12121212
+    DCD     0x1b1b1b1b, 0x1b1b1b1b
+
+    END
diff --git a/vp8/common/arm/neon/mbloopfilterverticaledge_y_neon.asm b/vp8/common/arm/neon/mbloopfilterverticaledge_y_neon.asm
new file mode 100644
index 000000000..60e517519
--- /dev/null
+++ b/vp8/common/arm/neon/mbloopfilterverticaledge_y_neon.asm
@@ -0,0 +1,303 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+; + + + EXPORT |vp8_mbloop_filter_vertical_edge_y_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit +;are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. +; r0 unsigned char *s, +; r1 int p, //pitch +; r2 const signed char *flimit, +; r3 const signed char *limit, +; stack(r4) const signed char *thresh, +; //stack(r5) int count --unused +|vp8_mbloop_filter_vertical_edge_y_neon| PROC + sub r0, r0, #4 ; move src pointer down by 4 columns + + vld1.u8 {d6}, [r0], r1 ; load first 8-line src data + ldr r12, [sp, #0] ; load thresh pointer + vld1.u8 {d8}, [r0], r1 + sub sp, sp, #32 + vld1.u8 {d10}, [r0], r1 + vld1.u8 {d12}, [r0], r1 + vld1.u8 {d14}, [r0], r1 + vld1.u8 {d16}, [r0], r1 + vld1.u8 {d18}, [r0], r1 + vld1.u8 {d20}, [r0], r1 + + vld1.u8 {d7}, [r0], r1 ; load second 8-line src data + vld1.u8 {d9}, [r0], r1 + vld1.u8 {d11}, [r0], r1 + vld1.u8 {d13}, [r0], r1 + vld1.u8 {d15}, [r0], r1 + vld1.u8 {d17}, [r0], r1 + vld1.u8 {d19}, [r0], r1 + vld1.u8 {d21}, [r0], r1 + + ;transpose to 8x16 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + vld1.s8 {d2[], d3[]}, [r3] ; limit + vst1.u8 {q3}, [sp]! + vld1.s8 {d4[], d5[]}, [r12] ; thresh + ldr r12, _mbvlfy_coeff_ + vst1.u8 {q10}, [sp]! + + ;vp8_filter_mask() function + ;vp8_hevmask() function + vabd.u8 q11, q3, q4 ; abs(p3 - p2) + vabd.u8 q12, q4, q5 ; abs(p2 - p1) + vabd.u8 q13, q5, q6 ; abs(p1 - p0) + vabd.u8 q14, q8, q7 ; abs(q1 - q0) + vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q0, q10, q9 ; abs(q3 - q2) + + vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1 + vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1 + vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1 + vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1 + vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1 + vcge.u8 q0, q1, q0 ; (abs(q3 - q2) > limit)*-1 + + vand q15, q15, q12 + + vabd.u8 q12, q6, q7 ; abs(p0 - q0) + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + + vld1.s8 {d4[], d5[]}, [r2] ; flimit + + vand q10, q10, q11 + vand q3, q3, q0 + + vld1.u8 {q0}, [r12]! 
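+    ;Note: this macroblock-edge filter adjusts three pixels on each side of
+    ;the edge (p2..q2), unlike the inner filter which touches only p1..q1;
+    ;the 9/18/27 weights loaded below are described in the code's comments as
+    ;roughly 1/7, 2/7 and 3/7 of the difference across the boundary.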
+ + vadd.u8 q2, q2, q2 ; flimit * 2 + vadd.u8 q2, q2, q1 ; flimit * 2 + limit + + vabd.u8 q1, q5, q8 ; abs(p1 - q1) + vqadd.u8 q12, q12, q12 ; abs(p0 - q0) * 2 + vshr.u8 q1, q1, #1 ; abs(p1 - q1) / 2 + vqadd.u8 q12, q12, q1 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + vcge.u8 q12, q2, q12 ; (abs(p0 - q0)*2 + abs(p1 - q1)/2 > flimit*2 + limit)*-1 + + vand q15, q15, q10 + + ;vp8_filter() function + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value + veor q4, q4, q0 ; ps2: p2 offset to convert to a signed value + veor q9, q9, q0 ; qs2: q2 offset to convert to a signed value +;;;;;;;;;;;;; + vorr q14, q13, q14 ; q14: vp8_hevmask + + ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q13, d15, d13 + + vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + ;vadd.s8 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0) + vadd.s16 q11, q13, q13 + + vand q3, q3, q12 + + ;vadd.s8 q2, q2, q10 + vadd.s16 q2, q2, q10 + vadd.s16 q13, q13, q11 + + vld1.u8 {q12}, [r12]! ;#3 + + ;vqadd.s8 q1, q1, q2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d3 + + vand q15, q15, q3 ; q15: vp8_filter_mask + vld1.u8 {q11}, [r12]! ;#4 + + vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d3, q13 + +;;;;;;;;;;;;;; + vand q1, q1, q15 ; vp8_filter &= mask + + vld1.u8 {q15}, [r12]! ;#63 + ; + vand q13, q1, q14 ; Filter2: q13; Filter2 &= hev + + vld1.u8 {d7}, [r12]! ;#9 + ; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7 +; vand q2, q13, q12 ; s = Filter2 & 7 + +; vqadd.s8 q13, q13, q11 ; Filter2 = vp8_signed_char_clamp(Filter2+4) +; vld1.u8 {d6}, [r12]! ;#18 + +; sub r0, r0, r1, lsl #4 +; sub sp, sp, #32 +; add r2, r0, r1 + +; vshr.s8 q13, q13, #3 ; Filter2 >>= 3 +; vceq.i8 q2, q2, q11 ; s = (s==4)*-1 + +; add r3, r2, r1 + +; vqsub.s8 q7, q7, q13 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) +; vqadd.s8 q11, q2, q13 ; u = vp8_signed_char_clamp(s + Filter2) + +; vld1.u8 {d5}, [r12]! ;#27 +; vmov q10, q15 +; vmov q12, q15 + +; vqadd.s8 q6, q6, q11 ; ps0 = vp8_signed_char_clamp(ps0 + u) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vqadd.s8 q2, q13, q11 ; Filter1 = vp8_signed_char_clamp(Filter2+4) + vqadd.s8 q13, q13, q12 ; Filter2 = vp8_signed_char_clamp(Filter2+3) + + vld1.u8 {d6}, [r12]! ;#18 + sub r0, r0, r1, lsl #4 + sub sp, sp, #32 + + add r2, r0, r1 + + vshr.s8 q2, q2, #3 ; Filter1 >>= 3 + vshr.s8 q13, q13, #3 ; Filter2 >>= 3 + + vmov q10, q15 + vmov q12, q15 + + vqsub.s8 q7, q7, q2 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) + + vld1.u8 {d5}, [r12]! 
;#27
+    add         r3, r2, r1
+
+    vqadd.s8    q6, q6, q13             ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    vbic        q1, q1, q14             ; Filter2: q1; vp8_filter &= ~hev; Filter2 = vp8_filter
+
+    ; roughly 1/7th difference across boundary
+    ; roughly 2/7th difference across boundary
+    ; roughly 3/7th difference across boundary
+    vmov        q11, q15
+    vmov        q13, q15
+    vmov        q14, q15
+
+    vmlal.s8    q10, d2, d7             ; Filter2 * 9
+    vmlal.s8    q11, d3, d7
+    vmlal.s8    q12, d2, d6             ; Filter2 * 18
+    vmlal.s8    q13, d3, d6
+    vmlal.s8    q14, d2, d5             ; Filter2 * 27
+    vmlal.s8    q15, d3, d5
+    vqshrn.s16  d20, q10, #7            ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+    vqshrn.s16  d21, q11, #7
+    vqshrn.s16  d24, q12, #7            ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+    vqshrn.s16  d25, q13, #7
+    vqshrn.s16  d28, q14, #7            ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+    vqshrn.s16  d29, q15, #7
+
+    vqsub.s8    q11, q9, q10            ; s = vp8_signed_char_clamp(qs2 - u)
+    vqadd.s8    q10, q4, q10            ; s = vp8_signed_char_clamp(ps2 + u)
+    vqsub.s8    q13, q8, q12            ; s = vp8_signed_char_clamp(qs1 - u)
+    vqadd.s8    q12, q5, q12            ; s = vp8_signed_char_clamp(ps1 + u)
+    vqsub.s8    q15, q7, q14            ; s = vp8_signed_char_clamp(qs0 - u)
+    vqadd.s8    q14, q6, q14            ; s = vp8_signed_char_clamp(ps0 + u)
+    veor        q9, q11, q0             ; *oq2 = s^0x80
+    veor        q4, q10, q0             ; *op2 = s^0x80
+    veor        q8, q13, q0             ; *oq1 = s^0x80
+    veor        q5, q12, q0             ; *op1 = s^0x80
+    veor        q7, q15, q0             ; *oq0 = s^0x80
+    vld1.u8     {q3}, [sp]!
+    veor        q6, q14, q0             ; *op0 = s^0x80
+    vld1.u8     {q10}, [sp]!
+
+    ;transpose to 16x8 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+    add         r12, r3, r1
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    ;store op2, op1, op0, oq0, oq1, oq2
+    vst1.8      {d6}, [r0]
+    vst1.8      {d8}, [r2]
+    vst1.8      {d10}, [r3]
+    vst1.8      {d12}, [r12], r1
+    add         r0, r12, r1
+    vst1.8      {d14}, [r12]
+    vst1.8      {d16}, [r0], r1
+    add         r2, r0, r1
+    vst1.8      {d18}, [r0]
+    vst1.8      {d20}, [r2], r1
+    add         r3, r2, r1
+    vst1.8      {d7}, [r2]
+    vst1.8      {d9}, [r3], r1
+    add         r12, r3, r1
+    vst1.8      {d11}, [r3]
+    vst1.8      {d13}, [r12], r1
+    add         r0, r12, r1
+    vst1.8      {d15}, [r12]
+    vst1.8      {d17}, [r0], r1
+    add         r2, r0, r1
+    vst1.8      {d19}, [r0]
+    vst1.8      {d21}, [r2]
+
+    bx          lr
+    ENDP        ; |vp8_mbloop_filter_vertical_edge_y_neon|
+
+;-----------------
+    AREA    mbvloopfiltery_dat, DATA, READWRITE     ;read/write by default
+;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+_mbvlfy_coeff_
+    DCD     mbvlfy_coeff
+mbvlfy_coeff
+    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080
+    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303
+    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404
+    DCD     0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
+    DCD     0x09090909, 0x09090909, 0x12121212, 0x12121212
+    DCD     0x1b1b1b1b, 0x1b1b1b1b
+
+    END
diff --git a/vp8/common/arm/neon/recon16x16mb_neon.asm b/vp8/common/arm/neon/recon16x16mb_neon.asm
new file mode 100644
index 000000000..b9ba1cbc3
--- /dev/null
+++ b/vp8/common/arm/neon/recon16x16mb_neon.asm
@@ -0,0 +1,130 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree.
All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_recon16x16mb_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *pred_ptr, +; r1 short *diff_ptr, +; r2 unsigned char *dst_ptr, +; r3 int ystride, +; stack unsigned char *udst_ptr, +; stack unsigned char *vdst_ptr + +|vp8_recon16x16mb_neon| PROC + mov r12, #4 ;loop counter for Y loop + +recon16x16mb_loop_y + vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr + vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr + vld1.u8 {q14, q15}, [r0]! + vld1.16 {q10, q11}, [r1]! + + vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits + vmovl.u8 q1, d25 + vmovl.u8 q2, d26 + vmovl.u8 q3, d27 + vmovl.u8 q4, d28 + vmovl.u8 q5, d29 + vmovl.u8 q6, d30 + vld1.16 {q12, q13}, [r1]! + vmovl.u8 q7, d31 + vld1.16 {q14, q15}, [r1]! + + pld [r0] + pld [r1] + pld [r1, #64] + + vadd.s16 q0, q0, q8 ;add Diff data and Pred data together + vadd.s16 q1, q1, q9 + vadd.s16 q2, q2, q10 + vadd.s16 q3, q3, q11 + vadd.s16 q4, q4, q12 + vadd.s16 q5, q5, q13 + vadd.s16 q6, q6, q14 + vadd.s16 q7, q7, q15 + + vqmovun.s16 d0, q0 ;CLAMP() saturation + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q2 + vqmovun.s16 d3, q3 + vqmovun.s16 d4, q4 + vqmovun.s16 d5, q5 + vst1.u8 {q0}, [r2], r3 ;store result + vqmovun.s16 d6, q6 + vst1.u8 {q1}, [r2], r3 + vqmovun.s16 d7, q7 + vst1.u8 {q2}, [r2], r3 + subs r12, r12, #1 + + moveq r12, #2 ;loop counter for UV loop + + vst1.u8 {q3}, [r2], r3 + bne recon16x16mb_loop_y + + mov r3, r3, lsr #1 ;uv_stride = ystride>>1 + ldr r2, [sp] ;load upred_ptr + +recon16x16mb_loop_uv + vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr + vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr + vld1.u8 {q14, q15}, [r0]! + vld1.16 {q10, q11}, [r1]! + + vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits + vmovl.u8 q1, d25 + vmovl.u8 q2, d26 + vmovl.u8 q3, d27 + vmovl.u8 q4, d28 + vmovl.u8 q5, d29 + vmovl.u8 q6, d30 + vld1.16 {q12, q13}, [r1]! + vmovl.u8 q7, d31 + vld1.16 {q14, q15}, [r1]! + + vadd.s16 q0, q0, q8 ;add Diff data and Pred data together + vadd.s16 q1, q1, q9 + vadd.s16 q2, q2, q10 + vadd.s16 q3, q3, q11 + vadd.s16 q4, q4, q12 + vadd.s16 q5, q5, q13 + vadd.s16 q6, q6, q14 + + vqmovun.s16 d0, q0 ;CLAMP() saturation + vadd.s16 q7, q7, q15 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q2 + vqmovun.s16 d3, q3 + vst1.u8 {d0}, [r2], r3 ;store result + vqmovun.s16 d4, q4 + vst1.u8 {d1}, [r2], r3 + vqmovun.s16 d5, q5 + vst1.u8 {d2}, [r2], r3 + vqmovun.s16 d6, q6 + vst1.u8 {d3}, [r2], r3 + vqmovun.s16 d7, q7 + vst1.u8 {d4}, [r2], r3 + subs r12, r12, #1 + + vst1.u8 {d5}, [r2], r3 + vst1.u8 {d6}, [r2], r3 + vst1.u8 {d7}, [r2], r3 + + ldrne r2, [sp, #4] ;load vpred_ptr + bne recon16x16mb_loop_uv + + bx lr + + ENDP + END diff --git a/vp8/common/arm/neon/recon2b_neon.asm b/vp8/common/arm/neon/recon2b_neon.asm new file mode 100644 index 000000000..25aaf8c8e --- /dev/null +++ b/vp8/common/arm/neon/recon2b_neon.asm @@ -0,0 +1,53 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
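+;
+; Like the other recon kernels, this adds the 16-bit residual to the 8-bit
+; prediction and saturates back to 8 bits. A rough C model of one pixel
+; (a sketch for orientation only, not the project's C reference):
+;
+;    int v = pred_ptr[i] + diff_ptr[i];              /* vmovl.u8 + vadd.s16 */
+;    dst_ptr[i] = v < 0 ? 0 : (v > 255 ? 255 : v);   /* vqmovun.s16         */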
+; + + + EXPORT |vp8_recon2b_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *pred_ptr, +; r1 short *diff_ptr, +; r2 unsigned char *dst_ptr, +; r3 int stride + +|vp8_recon2b_neon| PROC + vld1.u8 {q8, q9}, [r0] ;load data from pred_ptr + vld1.16 {q4, q5}, [r1]! ;load data from diff_ptr + + vmovl.u8 q0, d16 ;modify Pred data from 8 bits to 16 bits + vld1.16 {q6, q7}, [r1]! + vmovl.u8 q1, d17 + vmovl.u8 q2, d18 + vmovl.u8 q3, d19 + + vadd.s16 q0, q0, q4 ;add Diff data and Pred data together + vadd.s16 q1, q1, q5 + vadd.s16 q2, q2, q6 + vadd.s16 q3, q3, q7 + + vqmovun.s16 d0, q0 ;CLAMP() saturation + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q2 + vqmovun.s16 d3, q3 + add r0, r2, r3 + + vst1.u8 {d0}, [r2] ;store result + vst1.u8 {d1}, [r0], r3 + add r2, r0, r3 + vst1.u8 {d2}, [r0] + vst1.u8 {d3}, [r2], r3 + + bx lr + + ENDP + END diff --git a/vp8/common/arm/neon/recon4b_neon.asm b/vp8/common/arm/neon/recon4b_neon.asm new file mode 100644 index 000000000..a4f5b806b --- /dev/null +++ b/vp8/common/arm/neon/recon4b_neon.asm @@ -0,0 +1,68 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_recon4b_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *pred_ptr, +; r1 short *diff_ptr, +; r2 unsigned char *dst_ptr, +; r3 int stride + +|vp8_recon4b_neon| PROC + vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr + vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr + vld1.u8 {q14, q15}, [r0] + vld1.16 {q10, q11}, [r1]! + + vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits + vmovl.u8 q1, d25 + vmovl.u8 q2, d26 + vmovl.u8 q3, d27 + vmovl.u8 q4, d28 + vmovl.u8 q5, d29 + vmovl.u8 q6, d30 + vld1.16 {q12, q13}, [r1]! + vmovl.u8 q7, d31 + vld1.16 {q14, q15}, [r1] + + vadd.s16 q0, q0, q8 ;add Diff data and Pred data together + vadd.s16 q1, q1, q9 + vadd.s16 q2, q2, q10 + vadd.s16 q3, q3, q11 + vadd.s16 q4, q4, q12 + vadd.s16 q5, q5, q13 + vadd.s16 q6, q6, q14 + vadd.s16 q7, q7, q15 + + vqmovun.s16 d0, q0 ;CLAMP() saturation + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q2 + vqmovun.s16 d3, q3 + vqmovun.s16 d4, q4 + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + add r0, r2, r3 + + vst1.u8 {q0}, [r2] ;store result + vst1.u8 {q1}, [r0], r3 + add r2, r0, r3 + vst1.u8 {q2}, [r0] + vst1.u8 {q3}, [r2], r3 + + bx lr + + ENDP + END diff --git a/vp8/common/arm/neon/reconb_neon.asm b/vp8/common/arm/neon/reconb_neon.asm new file mode 100644 index 000000000..16d85a0d5 --- /dev/null +++ b/vp8/common/arm/neon/reconb_neon.asm @@ -0,0 +1,60 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_recon_b_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *pred_ptr, +; r1 short *diff_ptr, +; r2 unsigned char *dst_ptr, +; r3 int stride + +|vp8_recon_b_neon| PROC + mov r12, #16 + + vld1.u8 {d28}, [r0], r12 ;load 4 data/line from pred_ptr + vld1.16 {q10, q11}, [r1]! 
;load data from diff_ptr + vld1.u8 {d29}, [r0], r12 + vld1.16 {q11, q12}, [r1]! + vld1.u8 {d30}, [r0], r12 + vld1.16 {q12, q13}, [r1]! + vld1.u8 {d31}, [r0], r12 + vld1.16 {q13}, [r1] + + vmovl.u8 q0, d28 ;modify Pred data from 8 bits to 16 bits + vmovl.u8 q1, d29 ;Pred data in d0, d2, d4, d6 + vmovl.u8 q2, d30 + vmovl.u8 q3, d31 + + vadd.s16 d0, d0, d20 ;add Diff data and Pred data together + vadd.s16 d2, d2, d22 + vadd.s16 d4, d4, d24 + vadd.s16 d6, d6, d26 + + vqmovun.s16 d0, q0 ;CLAMP() saturation + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q2 + vqmovun.s16 d3, q3 + add r1, r2, r3 + + vst1.32 {d0[0]}, [r2] ;store result + vst1.32 {d1[0]}, [r1], r3 + add r2, r1, r3 + vst1.32 {d2[0]}, [r1] + vst1.32 {d3[0]}, [r2], r3 + + bx lr + + ENDP + END diff --git a/vp8/common/arm/neon/save_neon_reg.asm b/vp8/common/arm/neon/save_neon_reg.asm new file mode 100644 index 000000000..4873e447f --- /dev/null +++ b/vp8/common/arm/neon/save_neon_reg.asm @@ -0,0 +1,35 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_push_neon| + EXPORT |vp8_pop_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vp8_push_neon| PROC + vst1.i64 {d8, d9, d10, d11}, [r0]! + vst1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + +|vp8_pop_neon| PROC + vld1.i64 {d8, d9, d10, d11}, [r0]! + vld1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + + END + diff --git a/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm b/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm new file mode 100644 index 000000000..7d06ff908 --- /dev/null +++ b/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm @@ -0,0 +1,66 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
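+;
+; Both routines in this file broadcast a single rounded DC value across the
+; 4x4 block. A one-line C model of the rounding shift (a sketch only; dc_in
+; stands for input[0] in the first routine and input_dc in the second):
+;
+;    short dc = (short)((dc_in + 4) >> 3);    /* vrshr.s16 d0, d0, #3 */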
+;
+
+
+    EXPORT  |vp8_short_idct4x4llm_1_neon|
+    EXPORT  |vp8_dc_only_idct_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+; r0  short *input;
+; r1  short *output;
+; r2  int pitch;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp8_short_idct4x4llm_1_neon| PROC
+    vld1.16     {d0[]}, [r0]            ;load input[0]
+
+    add     r3, r1, r2
+    add     r12, r3, r2
+
+    vrshr.s16   d0, d0, #3
+
+    add     r0, r12, r2
+
+    vst1.16     {d0}, [r1]
+    vst1.16     {d0}, [r3]
+    vst1.16     {d0}, [r12]
+    vst1.16     {d0}, [r0]
+
+    bx      lr
+    ENDP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
+; r0  short input_dc;
+; r1  short *output;
+; r2  int pitch;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp8_dc_only_idct_neon| PROC
+    vdup.16     d0, r0
+
+    add     r3, r1, r2
+    add     r12, r3, r2
+
+    vrshr.s16   d0, d0, #3
+
+    add     r0, r12, r2
+
+    vst1.16     {d0}, [r1]
+    vst1.16     {d0}, [r3]
+    vst1.16     {d0}, [r12]
+    vst1.16     {d0}, [r0]
+
+    bx      lr
+
+    ENDP
+    END
diff --git a/vp8/common/arm/neon/shortidct4x4llm_neon.asm b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
new file mode 100644
index 000000000..ffecfbfbc
--- /dev/null
+++ b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
@@ -0,0 +1,126 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_short_idct4x4llm_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;*************************************************************
+;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
+;r0 short * input
+;r1 short * output
+;r2 int pitch
+;*************************************************************
+;static const int cospi8sqrt2minus1=20091;
+;static const int sinpi8sqrt2      =35468;
+;static const int rounding         =0;
+;Optimization note: the dequantized input is signed 13-bit data in the range
+;[-4096, 4095]. This makes the NEON "vqdmulh" instruction safe to use, since
+;the doubled product cannot overflow (13 + 16 + 1 = 30 bits < 32 bits). The
+;instruction returns the high half of the multiplication, which is exactly
+;the part the IDCT needs.
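+;
+;A C model of the multiply step used below (a sketch; note that 35468 does
+;not fit in int16_t, so the register constant is actually 35468 - 65536):
+;
+;    t = (short)((2 * x * (35468 - 65536)) >> 16);  /* vqdmulh.s16 */
+;    t = t >> 1;                                    /* vshr.s16 #1 */
+;    t = t + x;                                     /* vqadd.s16: t == (x * 35468) >> 16 */
+;
+;The 20091 path gets the same final "+ x" because the exact factor is
+;sqrt(2)*cos(pi/8) = 1 + 20091/65536.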
+
+|vp8_short_idct4x4llm_neon| PROC
+    ldr     r12, _idct_coeff_
+    vld1.16     {q1, q2}, [r0]
+    vld1.16     {d0}, [r12]
+
+    vswp    d3, d4                      ;q2(vp[4] vp[12])
+
+    vqdmulh.s16     q3, q2, d0[2]
+    vqdmulh.s16     q4, q2, d0[0]
+
+    vqadd.s16   d12, d2, d3             ;a1
+    vqsub.s16   d13, d2, d3             ;b1
+
+    vshr.s16    q3, q3, #1
+    vshr.s16    q4, q4, #1
+
+    vqadd.s16   q3, q3, q2              ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+    vqadd.s16   q4, q4, q2
+
+    ;d6 - c1:temp1
+    ;d7 - d1:temp2
+    ;d8 - d1:temp1
+    ;d9 - c1:temp2
+
+    vqsub.s16   d10, d6, d9             ;c1
+    vqadd.s16   d11, d7, d8             ;d1
+
+    vqadd.s16   d2, d12, d11
+    vqadd.s16   d3, d13, d10
+    vqsub.s16   d4, d13, d10
+    vqsub.s16   d5, d12, d11
+
+    vtrn.32     d2, d4
+    vtrn.32     d3, d5
+    vtrn.16     d2, d3
+    vtrn.16     d4, d5
+
+    vswp    d3, d4
+
+    vqdmulh.s16     q3, q2, d0[2]
+    vqdmulh.s16     q4, q2, d0[0]
+
+    vqadd.s16   d12, d2, d3             ;a1
+    vqsub.s16   d13, d2, d3             ;b1
+
+    vshr.s16    q3, q3, #1
+    vshr.s16    q4, q4, #1
+
+    vqadd.s16   q3, q3, q2              ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+    vqadd.s16   q4, q4, q2
+
+    vqsub.s16   d10, d6, d9             ;c1
+    vqadd.s16   d11, d7, d8             ;d1
+
+    vqadd.s16   d2, d12, d11
+    vqadd.s16   d3, d13, d10
+    vqsub.s16   d4, d13, d10
+    vqsub.s16   d5, d12, d11
+
+    vrshr.s16   d2, d2, #3
+    vrshr.s16   d3, d3, #3
+    vrshr.s16   d4, d4, #3
+    vrshr.s16   d5, d5, #3
+
+    add     r3, r1, r2
+    add     r12, r3, r2
+    add     r0, r12, r2
+
+    vtrn.32     d2, d4
+    vtrn.32     d3, d5
+    vtrn.16     d2, d3
+    vtrn.16     d4, d5
+
+    vst1.16     {d2}, [r1]
+    vst1.16     {d3}, [r3]
+    vst1.16     {d4}, [r12]
+    vst1.16     {d5}, [r0]
+
+    bx      lr
+
+    ENDP
+
+;-----------------
+    AREA    idct4x4_dat, DATA, READWRITE            ;read/write by default
+;Data section idct4x4_dat is declared above. Each DCD below reserves one
+;32-bit word per value. The label idct_coeff can be used to access the data,
+;at addresses idct_coeff and idct_coeff+4.
+_idct_coeff_
+    DCD     idct_coeff
+idct_coeff
+    DCD     0x4e7b4e7b, 0x8a8c8a8c
+
+;20091, 20091, 35468, 35468
+
+    END
diff --git a/vp8/common/arm/neon/sixtappredict16x16_neon.asm b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
new file mode 100644
index 000000000..9f5f0d2ce
--- /dev/null
+++ b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
@@ -0,0 +1,494 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sixtap_predict16x16_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int  dst_pitch
+
+;Note: to take advantage of the 8-bit multiply instructions in NEON, abs() is
+; first applied to the filter coefficients to make them u8, and vmlsl is used
+; for the negative coefficients, so the running sum is treated as s16. Since
+; an intermediate sum larger than 2^15-1 would be misread as a negative s16
+; value, the coefficients are applied in the order 0, 1, 4, 5, 2, which keeps
+; the running sum inside the s16 range; the product of the 3rd coefficient is
+; then added with a saturating add. The same applies to the other filter
+; functions.
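+;
+;A C model of the accumulation order described above (a sketch only; f[0..5]
+;holds abs() of the six taps, and sat16/clamp8 stand for the saturations
+;performed by vqadd.s16 and vqrshrun.s16):
+;
+;    int sum;
+;    sum  = src[-2] * f[0];              /* vmull.u8 */
+;    sum -= src[-1] * f[1];              /* vmlsl.u8 */
+;    sum -= src[ 2] * f[4];
+;    sum += src[ 3] * f[5];              /* vmlal.u8 */
+;    sum += src[ 0] * f[2];
+;    sum  = sat16(sum + src[1] * f[3]);  /* separate vmull.u8, then vqadd.s16 */
+;    out  = clamp8((sum + 64) >> 7);     /* vqrshrun.s16 ..., #7 */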
+ +|vp8_sixtap_predict16x16_neon| PROC + push {r4-r5, lr} + + ldr r12, _filter16_coeff_ + ldr r4, [sp, #12] ;load parameters from stack + ldr r5, [sp, #16] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_filter16x16_only + + add r2, r12, r2, lsl #5 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {q14, q15}, [r2] ;load first_pass filter + + beq firstpass_filter16x16_only + + sub sp, sp, #336 ;reserve space on stack for temporary storage + mov lr, sp + + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + mov r2, #7 ;loop counter + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + sub r0, r0, r1, lsl #1 + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + +;First Pass: output_height lines x output_width columns (21x16) +filt_blk2d_fp16x16_loop_neon + vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data + vld1.u8 {d9, d10, d11}, [r0], r1 + vld1.u8 {d12, d13, d14}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q9, d7, d0 + vmull.u8 q10, d9, d0 + vmull.u8 q11, d10, d0 + vmull.u8 q12, d12, d0 + vmull.u8 q13, d13, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d9, d10, #1 + vext.8 d30, d12, d13, #1 + + vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q10, d29, d1 + vmlsl.u8 q12, d30, d1 + + vext.8 d28, d7, d8, #1 + vext.8 d29, d10, d11, #1 + vext.8 d30, d13, d14, #1 + + vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q11, d29, d1 + vmlsl.u8 q13, d30, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d9, d10, #4 + vext.8 d30, d12, d13, #4 + + vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q10, d29, d4 + vmlsl.u8 q12, d30, d4 + + vext.8 d28, d7, d8, #4 + vext.8 d29, d10, d11, #4 + vext.8 d30, d13, d14, #4 + + vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q11, d29, d4 + vmlsl.u8 q13, d30, d4 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d9, d10, #5 + vext.8 d30, d12, d13, #5 + + vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q10, d29, d5 + vmlal.u8 q12, d30, d5 + + vext.8 d28, d7, d8, #5 + vext.8 d29, d10, d11, #5 + vext.8 d30, d13, d14, #5 + + vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q11, d29, d5 + vmlal.u8 q13, d30, d5 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d9, d10, #2 + vext.8 d30, d12, d13, #2 + + vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q10, d29, d2 + vmlal.u8 q12, d30, d2 + + vext.8 d28, d7, d8, #2 + vext.8 d29, d10, d11, #2 + vext.8 d30, d13, d14, #2 + + vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q11, d29, d2 + vmlal.u8 q13, d30, d2 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d9, d10, #3 + vext.8 d30, d12, d13, #3 + + vext.8 d15, d7, d8, #3 + vext.8 d31, d10, d11, #3 + vext.8 d6, d13, d14, #3 + + vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q5, d29, d3 + vmull.u8 q6, d30, d3 + + vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters) + vqadd.s16 q10, q5 + vqadd.s16 q12, q6 + + vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q7, d31, d3 + vmull.u8 q3, d6, d3 + + subs r2, r2, #1 + + vqadd.s16 q9, q6 + vqadd.s16 q11, q7 + vqadd.s16 q13, q3 + + vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q9, #7 + vqrshrun.s16 d8, q10, #7 + vqrshrun.s16 d9, q11, 
#7 + vqrshrun.s16 d10, q12, #7 + vqrshrun.s16 d11, q13, #7 + + vst1.u8 {d6, d7, d8}, [lr]! ;store result + vst1.u8 {d9, d10, d11}, [lr]! + + bne filt_blk2d_fp16x16_loop_neon + +;Second pass: 16x16 +;secondpass_filter - do first 8-columns and then second 8-columns + add r3, r12, r3, lsl #5 + sub lr, lr, #336 + + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + mov r3, #2 ;loop counter + + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + mov r2, #16 + + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + +filt_blk2d_sp16x16_outloop_neon + vld1.u8 {d18}, [lr], r2 ;load src data + vld1.u8 {d19}, [lr], r2 + vld1.u8 {d20}, [lr], r2 + vld1.u8 {d21}, [lr], r2 + mov r12, #4 ;loop counter + vld1.u8 {d22}, [lr], r2 + +secondpass_inner_loop_neon + vld1.u8 {d23}, [lr], r2 ;load src data + vld1.u8 {d24}, [lr], r2 + vld1.u8 {d25}, [lr], r2 + vld1.u8 {d26}, [lr], r2 + + vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d19, d0 + vmull.u8 q5, d20, d0 + vmull.u8 q6, d21, d0 + + vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d20, d1 + vmlsl.u8 q5, d21, d1 + vmlsl.u8 q6, d22, d1 + + vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d23, d4 + vmlsl.u8 q5, d24, d4 + vmlsl.u8 q6, d25, d4 + + vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d21, d2 + vmlal.u8 q5, d22, d2 + vmlal.u8 q6, d23, d2 + + vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q4, d24, d5 + vmlal.u8 q5, d25, d5 + vmlal.u8 q6, d26, d5 + + vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d22, d3 + vmull.u8 q9, d23, d3 + vmull.u8 q10, d24, d3 + + subs r12, r12, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vst1.u8 {d6}, [r4], r5 ;store result + vmov q9, q11 + vst1.u8 {d7}, [r4], r5 + vmov q10, q12 + vst1.u8 {d8}, [r4], r5 + vmov d22, d26 + vst1.u8 {d9}, [r4], r5 + + bne secondpass_inner_loop_neon + + subs r3, r3, #1 + sub lr, lr, #336 + add lr, lr, #8 + + sub r4, r4, r5, lsl #4 + add r4, r4, #8 + + bne filt_blk2d_sp16x16_outloop_neon + + add sp, sp, #336 + pop {r4-r5,pc} + +;-------------------- +firstpass_filter16x16_only + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + mov r2, #8 ;loop counter + sub r0, r0, #2 ;move srcptr back to (column-2) + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + +;First Pass: output_height lines x output_width columns (16x16) +filt_blk2d_fpo16x16_loop_neon + vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data + vld1.u8 {d9, d10, d11}, [r0], r1 + + pld [r0] + pld [r0, r1] + + vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q7, d7, d0 + vmull.u8 q8, d9, d0 + vmull.u8 q9, d10, d0 + + vext.8 d20, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d21, d9, d10, #1 + vext.8 d22, d7, d8, #1 + vext.8 d23, d10, d11, #1 + vext.8 d24, d6, d7, #4 ;construct src_ptr[2] + vext.8 d25, d9, d10, #4 + vext.8 d26, d7, d8, #4 + vext.8 d27, d10, d11, #4 + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d9, d10, #5 + + vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d21, d1 + vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q9, d23, d1 + vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp8_filter[4]) + 
vmlsl.u8 q8, d25, d4 + vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q9, d27, d4 + vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q8, d29, d5 + + vext.8 d20, d7, d8, #5 + vext.8 d21, d10, d11, #5 + vext.8 d22, d6, d7, #2 ;construct src_ptr[0] + vext.8 d23, d9, d10, #2 + vext.8 d24, d7, d8, #2 + vext.8 d25, d10, d11, #2 + + vext.8 d26, d6, d7, #3 ;construct src_ptr[1] + vext.8 d27, d9, d10, #3 + vext.8 d28, d7, d8, #3 + vext.8 d29, d10, d11, #3 + + vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q9, d21, d5 + vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d23, d2 + vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q9, d25, d2 + + vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q11, d27, d3 + vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q15, d29, d3 + + vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q11 + vqadd.s16 q7, q12 + vqadd.s16 q9, q15 + + subs r2, r2, #1 + + vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q7, #7 + vqrshrun.s16 d8, q8, #7 + vqrshrun.s16 d9, q9, #7 + + vst1.u8 {q3}, [r4], r5 ;store result + vst1.u8 {q4}, [r4], r5 + + bne filt_blk2d_fpo16x16_loop_neon + + pop {r4-r5,pc} + +;-------------------- +secondpass_filter16x16_only +;Second pass: 16x16 + add r3, r12, r3, lsl #5 + sub r0, r0, r1, lsl #1 + + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + mov r3, #2 ;loop counter + + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + +filt_blk2d_spo16x16_outloop_neon + vld1.u8 {d18}, [r0], r1 ;load src data + vld1.u8 {d19}, [r0], r1 + vld1.u8 {d20}, [r0], r1 + vld1.u8 {d21}, [r0], r1 + mov r12, #4 ;loop counter + vld1.u8 {d22}, [r0], r1 + +secondpass_only_inner_loop_neon + vld1.u8 {d23}, [r0], r1 ;load src data + vld1.u8 {d24}, [r0], r1 + vld1.u8 {d25}, [r0], r1 + vld1.u8 {d26}, [r0], r1 + + vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d19, d0 + vmull.u8 q5, d20, d0 + vmull.u8 q6, d21, d0 + + vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d20, d1 + vmlsl.u8 q5, d21, d1 + vmlsl.u8 q6, d22, d1 + + vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d23, d4 + vmlsl.u8 q5, d24, d4 + vmlsl.u8 q6, d25, d4 + + vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d21, d2 + vmlal.u8 q5, d22, d2 + vmlal.u8 q6, d23, d2 + + vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q4, d24, d5 + vmlal.u8 q5, d25, d5 + vmlal.u8 q6, d26, d5 + + vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d22, d3 + vmull.u8 q9, d23, d3 + vmull.u8 q10, d24, d3 + + subs r12, r12, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vst1.u8 {d6}, [r4], r5 ;store result + vmov q9, q11 + vst1.u8 {d7}, [r4], r5 + vmov q10, q12 + vst1.u8 {d8}, [r4], r5 + vmov d22, d26 + vst1.u8 {d9}, [r4], r5 + + bne secondpass_only_inner_loop_neon + + subs r3, r3, #1 + sub r0, r0, r1, lsl #4 + sub r0, r0, r1, lsl #2 + sub r0, r0, r1 + add r0, r0, #8 + + sub r4, r4, r5, lsl #4 + add r4, r4, #8 + + bne filt_blk2d_spo16x16_outloop_neon + + pop {r4-r5,pc} + + ENDP + +;----------------- + AREA subpelfilters16_dat, DATA, READWRITE 
;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_filter16_coeff_ + DCD filter16_coeff +filter16_coeff + DCD 0, 0, 128, 0, 0, 0, 0, 0 + DCD 0, -6, 123, 12, -1, 0, 0, 0 + DCD 2, -11, 108, 36, -8, 1, 0, 0 + DCD 0, -9, 93, 50, -6, 0, 0, 0 + DCD 3, -16, 77, 77, -16, 3, 0, 0 + DCD 0, -6, 50, 93, -9, 0, 0, 0 + DCD 1, -8, 36, 108, -11, 2, 0, 0 + DCD 0, -1, 12, 123, -6, 0, 0, 0 + + END diff --git a/vp8/common/arm/neon/sixtappredict4x4_neon.asm b/vp8/common/arm/neon/sixtappredict4x4_neon.asm new file mode 100644 index 000000000..c23a9dbd1 --- /dev/null +++ b/vp8/common/arm/neon/sixtappredict4x4_neon.asm @@ -0,0 +1,425 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_sixtap_predict_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack(r4) unsigned char *dst_ptr, +; stack(lr) int dst_pitch + +|vp8_sixtap_predict_neon| PROC + push {r4, lr} + + ldr r12, _filter4_coeff_ + ldr r4, [sp, #8] ;load parameters from stack + ldr lr, [sp, #12] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_filter4x4_only + + add r2, r12, r2, lsl #5 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + vld1.s32 {q14, q15}, [r2] ;load first_pass filter + + beq firstpass_filter4x4_only + + vabs.s32 q12, q14 ;get abs(filer_parameters) + vabs.s32 q13, q15 + + sub r0, r0, #2 ;go back 2 columns of src data + sub r0, r0, r1, lsl #1 ;go back 2 lines of src data + +;First pass: output_height lines x output_width columns (9x4) + vld1.u8 {q3}, [r0], r1 ;load first 4-line src data + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vld1.u8 {q4}, [r0], r1 + vdup.8 d1, d24[4] + vld1.u8 {q5}, [r0], r1 + vdup.8 d2, d25[0] + vld1.u8 {q6}, [r0], r1 + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vext.8 d18, d6, d7, #5 ;construct src_ptr[3] + vext.8 d19, d8, d9, #5 + vext.8 d20, d10, d11, #5 + vext.8 d21, d12, d13, #5 + + vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done + vswp d11, d12 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) + vzip.32 d20, d21 + vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5]) + vmull.u8 q8, d20, d5 + + vmov q4, q3 ;keep original src data in q4 q6 + vmov q6, q5 + + vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together + vzip.32 d10, d11 + vshr.u64 q9, q4, #8 ;construct src_ptr[-1] + vshr.u64 q10, q6, #8 + vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0]) + vmlal.u8 q8, d10, d0 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #32 ;construct src_ptr[2] + vshr.u64 q5, q6, #32 + vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d20, d1 + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) + vzip.32 d10, d11 + vshr.u64 q9, q4, #16 ;construct src_ptr[0] + vshr.u64 q10, q6, #16 + vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * 
vp8_filter[4]) + vmlsl.u8 q8, d10, d4 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #24 ;construct src_ptr[1] + vshr.u64 q5, q6, #24 + vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d20, d2 + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) + vzip.32 d10, d11 + vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q10, d10, d3 + + vld1.u8 {q3}, [r0], r1 ;load rest 5-line src data + vld1.u8 {q4}, [r0], r1 + + vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q10 + + vld1.u8 {q5}, [r0], r1 + vld1.u8 {q6}, [r0], r1 + + vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d28, q8, #7 + + ;First Pass on rest 5-line data + vld1.u8 {q11}, [r0], r1 + + vext.8 d18, d6, d7, #5 ;construct src_ptr[3] + vext.8 d19, d8, d9, #5 + vext.8 d20, d10, d11, #5 + vext.8 d21, d12, d13, #5 + + vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done + vswp d11, d12 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) + vzip.32 d20, d21 + vext.8 d31, d22, d23, #5 ;construct src_ptr[3] + vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5]) + vmull.u8 q8, d20, d5 + vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp8_filter[5]) + + vmov q4, q3 ;keep original src data in q4 q6 + vmov q6, q5 + + vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together + vzip.32 d10, d11 + vshr.u64 q9, q4, #8 ;construct src_ptr[-1] + vshr.u64 q10, q6, #8 + + vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0]) + vmlal.u8 q8, d10, d0 + vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp8_filter[0]) + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #32 ;construct src_ptr[2] + vshr.u64 q5, q6, #32 + vext.8 d31, d22, d23, #1 ;construct src_ptr[-1] + + vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d20, d1 + vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp8_filter[1]) + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) + vzip.32 d10, d11 + vshr.u64 q9, q4, #16 ;construct src_ptr[0] + vshr.u64 q10, q6, #16 + vext.8 d31, d22, d23, #4 ;construct src_ptr[2] + + vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d10, d4 + vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp8_filter[4]) + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #24 ;construct src_ptr[1] + vshr.u64 q5, q6, #24 + vext.8 d31, d22, d23, #2 ;construct src_ptr[0] + + vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d20, d2 + vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp8_filter[2]) + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) + vzip.32 d10, d11 + vext.8 d31, d22, d23, #3 ;construct src_ptr[1] + vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q10, d10, d3 + vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp8_filter[3]) + + add r3, r12, r3, lsl #5 + + vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q10 + vqadd.s16 q12, q11 + + vext.8 d23, d27, d28, #4 + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + + vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d30, q8, #7 + vqrshrun.s16 d31, q12, #7 + +;Second pass: 4x4 + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vext.8 d24, d28, d29, #4 + vext.8 d25, d29, d30, #4 + vext.8 d26, d30, d31, #4 + + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + + vmull.u8 q3, d27, d0 ;(src_ptr[-2] * 
vp8_filter[0]) + vmull.u8 q4, d28, d0 + + vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5]) + vmull.u8 q6, d26, d5 + + vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d30, d4 + + vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q6, d24, d1 + + vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d29, d2 + + vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3]) + vmlal.u8 q6, d25, d3 + + add r0, r4, lr + add r1, r0, lr + add r2, r1, lr + + vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q6, q4 + + vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d4, q6, #7 + + vst1.32 {d3[0]}, [r4] ;store result + vst1.32 {d3[1]}, [r0] + vst1.32 {d4[0]}, [r1] + vst1.32 {d4[1]}, [r2] + + pop {r4, pc} + + +;--------------------- +firstpass_filter4x4_only + vabs.s32 q12, q14 ;get abs(filer_parameters) + vabs.s32 q13, q15 + + sub r0, r0, #2 ;go back 2 columns of src data + +;First pass: output_height lines x output_width columns (4x4) + vld1.u8 {q3}, [r0], r1 ;load first 4-line src data + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vld1.u8 {q4}, [r0], r1 + vdup.8 d1, d24[4] + vld1.u8 {q5}, [r0], r1 + vdup.8 d2, d25[0] + vld1.u8 {q6}, [r0], r1 + + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + + vext.8 d18, d6, d7, #5 ;construct src_ptr[3] + vext.8 d19, d8, d9, #5 + vext.8 d20, d10, d11, #5 + vext.8 d21, d12, d13, #5 + + vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done + vswp d11, d12 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) + vzip.32 d20, d21 + vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5]) + vmull.u8 q8, d20, d5 + + vmov q4, q3 ;keep original src data in q4 q6 + vmov q6, q5 + + vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together + vzip.32 d10, d11 + vshr.u64 q9, q4, #8 ;construct src_ptr[-1] + vshr.u64 q10, q6, #8 + vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0]) + vmlal.u8 q8, d10, d0 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #32 ;construct src_ptr[2] + vshr.u64 q5, q6, #32 + vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d20, d1 + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) + vzip.32 d10, d11 + vshr.u64 q9, q4, #16 ;construct src_ptr[0] + vshr.u64 q10, q6, #16 + vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d10, d4 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #24 ;construct src_ptr[1] + vshr.u64 q5, q6, #24 + vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d20, d2 + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) + vzip.32 d10, d11 + vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q10, d10, d3 + + add r0, r4, lr + add r1, r0, lr + add r2, r1, lr + + vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q10 + + vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d28, q8, #7 + + vst1.32 {d27[0]}, [r4] ;store result + vst1.32 {d27[1]}, [r0] + vst1.32 {d28[0]}, [r1] + vst1.32 {d28[1]}, [r2] + + pop {r4, pc} + + +;--------------------- +secondpass_filter4x4_only + sub r0, r0, r1, lsl #1 + add r3, r12, r3, lsl #5 + + vld1.32 {d27[0]}, [r0], r1 ;load src data + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + vld1.32 {d27[1]}, [r0], r1 + vabs.s32 q7, q5 + vld1.32 {d28[0]}, [r0], r1 + vabs.s32 q8, q6 + vld1.32 {d28[1]}, [r0], r1 + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vld1.32 
{d29[0]}, [r0], r1 + vdup.8 d1, d14[4] + vld1.32 {d29[1]}, [r0], r1 + vdup.8 d2, d15[0] + vld1.32 {d30[0]}, [r0], r1 + vdup.8 d3, d15[4] + vld1.32 {d30[1]}, [r0], r1 + vdup.8 d4, d16[0] + vld1.32 {d31[0]}, [r0], r1 + vdup.8 d5, d16[4] + + vext.8 d23, d27, d28, #4 + vext.8 d24, d28, d29, #4 + vext.8 d25, d29, d30, #4 + vext.8 d26, d30, d31, #4 + + vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d28, d0 + + vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5]) + vmull.u8 q6, d26, d5 + + vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d30, d4 + + vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q6, d24, d1 + + vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d29, d2 + + vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3]) + vmlal.u8 q6, d25, d3 + + add r0, r4, lr + add r1, r0, lr + add r2, r1, lr + + vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q6, q4 + + vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d4, q6, #7 + + vst1.32 {d3[0]}, [r4] ;store result + vst1.32 {d3[1]}, [r0] + vst1.32 {d4[0]}, [r1] + vst1.32 {d4[1]}, [r2] + + pop {r4, pc} + + ENDP + +;----------------- + AREA subpelfilters4_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_filter4_coeff_ + DCD filter4_coeff +filter4_coeff + DCD 0, 0, 128, 0, 0, 0, 0, 0 + DCD 0, -6, 123, 12, -1, 0, 0, 0 + DCD 2, -11, 108, 36, -8, 1, 0, 0 + DCD 0, -9, 93, 50, -6, 0, 0, 0 + DCD 3, -16, 77, 77, -16, 3, 0, 0 + DCD 0, -6, 50, 93, -9, 0, 0, 0 + DCD 1, -8, 36, 108, -11, 2, 0, 0 + DCD 0, -1, 12, 123, -6, 0, 0, 0 + + END diff --git a/vp8/common/arm/neon/sixtappredict8x4_neon.asm b/vp8/common/arm/neon/sixtappredict8x4_neon.asm new file mode 100644 index 000000000..18e19f958 --- /dev/null +++ b/vp8/common/arm/neon/sixtappredict8x4_neon.asm @@ -0,0 +1,476 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
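+;
+; Overall two-pass scheme in C form (a sketch for orientation only; filter6()
+; is a hypothetical helper that performs the 6-tap dot product followed by
+; the add-64, shift-by-7 and clamp of vqrshrun):
+;
+;    unsigned char tmp[9][8];               /* 4 output rows + 5 context rows */
+;    for (r = 0; r < 9; r++)                /* first pass: horizontal         */
+;        for (c = 0; c < 8; c++)
+;            tmp[r][c] = filter6(&src[(r - 2) * stride + (c - 2)], 1, xfilter);
+;    for (r = 0; r < 4; r++)                /* second pass: vertical          */
+;        for (c = 0; c < 8; c++)
+;            dst[r * pitch + c] = filter6(&tmp[r][c], 8, yfilter);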
+; + + + EXPORT |vp8_sixtap_predict8x4_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(r5) int dst_pitch + +|vp8_sixtap_predict8x4_neon| PROC + push {r4-r5, lr} + + ldr r12, _filter8_coeff_ + ldr r4, [sp, #12] ;load parameters from stack + ldr r5, [sp, #16] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_filter8x4_only + + add r2, r12, r2, lsl #5 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {q14, q15}, [r2] ;load first_pass filter + + beq firstpass_filter8x4_only + + sub sp, sp, #32 ;reserve space on stack for temporary storage + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + mov lr, sp + sub r0, r0, r1, lsl #1 + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + +;First pass: output_height lines x output_width columns (9x8) + vld1.u8 {q3}, [r0], r1 ;load src data + vdup.8 d3, d25[4] + vld1.u8 {q4}, [r0], r1 + vdup.8 d4, d26[0] + vld1.u8 {q5}, [r0], r1 + vdup.8 d5, d26[4] + vld1.u8 {q6}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q8, d8, d0 + vmull.u8 q9, d10, d0 + vmull.u8 q10, d12, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d8, d9, #1 + vext.8 d30, d10, d11, #1 + vext.8 d31, d12, d13, #1 + + vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d29, d1 + vmlsl.u8 q9, d30, d1 + vmlsl.u8 q10, d31, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d8, d9, #4 + vext.8 d30, d10, d11, #4 + vext.8 d31, d12, d13, #4 + + vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d29, d4 + vmlsl.u8 q9, d30, d4 + vmlsl.u8 q10, d31, d4 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d8, d9, #2 + vext.8 d30, d10, d11, #2 + vext.8 d31, d12, d13, #2 + + vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d29, d2 + vmlal.u8 q9, d30, d2 + vmlal.u8 q10, d31, d2 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d8, d9, #5 + vext.8 d30, d10, d11, #5 + vext.8 d31, d12, d13, #5 + + vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q8, d29, d5 + vmlal.u8 q9, d30, d5 + vmlal.u8 q10, d31, d5 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d8, d9, #3 + vext.8 d30, d10, d11, #3 + vext.8 d31, d12, d13, #3 + + vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q4, d29, d3 + vmull.u8 q5, d30, d3 + vmull.u8 q6, d31, d3 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vld1.u8 {q3}, [r0], r1 ;load src data + + vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d23, q8, #7 + vqrshrun.s16 d24, q9, #7 + vqrshrun.s16 d25, q10, #7 + + vld1.u8 {q4}, [r0], r1 + vst1.u8 {d22}, [lr]! ;store result + vld1.u8 {q5}, [r0], r1 + vst1.u8 {d23}, [lr]! + vld1.u8 {q6}, [r0], r1 + vst1.u8 {d24}, [lr]! + vld1.u8 {q7}, [r0], r1 + vst1.u8 {d25}, [lr]! 
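+
+    ;the four filtered rows above went to the 32-byte stack buffer; the five
+    ;context rows produced next stay in d26-d30, so the second pass only has
+    ;to reload q11/q12 from the stack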
+ + ;first_pass filtering on the rest 5-line data + vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q9, d8, d0 + vmull.u8 q10, d10, d0 + vmull.u8 q11, d12, d0 + vmull.u8 q12, d14, d0 + + vext.8 d27, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d28, d8, d9, #1 + vext.8 d29, d10, d11, #1 + vext.8 d30, d12, d13, #1 + vext.8 d31, d14, d15, #1 + + vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q9, d28, d1 + vmlsl.u8 q10, d29, d1 + vmlsl.u8 q11, d30, d1 + vmlsl.u8 q12, d31, d1 + + vext.8 d27, d6, d7, #4 ;construct src_ptr[2] + vext.8 d28, d8, d9, #4 + vext.8 d29, d10, d11, #4 + vext.8 d30, d12, d13, #4 + vext.8 d31, d14, d15, #4 + + vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q9, d28, d4 + vmlsl.u8 q10, d29, d4 + vmlsl.u8 q11, d30, d4 + vmlsl.u8 q12, d31, d4 + + vext.8 d27, d6, d7, #2 ;construct src_ptr[0] + vext.8 d28, d8, d9, #2 + vext.8 d29, d10, d11, #2 + vext.8 d30, d12, d13, #2 + vext.8 d31, d14, d15, #2 + + vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q9, d28, d2 + vmlal.u8 q10, d29, d2 + vmlal.u8 q11, d30, d2 + vmlal.u8 q12, d31, d2 + + vext.8 d27, d6, d7, #5 ;construct src_ptr[3] + vext.8 d28, d8, d9, #5 + vext.8 d29, d10, d11, #5 + vext.8 d30, d12, d13, #5 + vext.8 d31, d14, d15, #5 + + vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q9, d28, d5 + vmlal.u8 q10, d29, d5 + vmlal.u8 q11, d30, d5 + vmlal.u8 q12, d31, d5 + + vext.8 d27, d6, d7, #3 ;construct src_ptr[1] + vext.8 d28, d8, d9, #3 + vext.8 d29, d10, d11, #3 + vext.8 d30, d12, d13, #3 + vext.8 d31, d14, d15, #3 + + vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q4, d28, d3 + vmull.u8 q5, d29, d3 + vmull.u8 q6, d30, d3 + vmull.u8 q7, d31, d3 + + vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q9, q4 + vqadd.s16 q10, q5 + vqadd.s16 q11, q6 + vqadd.s16 q12, q7 + + vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d27, q9, #7 + vqrshrun.s16 d28, q10, #7 + vqrshrun.s16 d29, q11, #7 ;load intermediate data from stack + vqrshrun.s16 d30, q12, #7 + +;Second pass: 8x4 +;secondpass_filter + add r3, r12, r3, lsl #5 + sub lr, lr, #32 + + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + vld1.u8 {q11}, [lr]! + + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vld1.u8 {q12}, [lr]! 
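+
+    ;q11/q12 now hold intermediate rows 0-3 reloaded from the stack; rows
+    ;4-8 are still live in d26-d30 from the first pass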
+ + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + + vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d23, d0 + vmull.u8 q5, d24, d0 + vmull.u8 q6, d25, d0 + + vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d24, d1 + vmlsl.u8 q5, d25, d1 + vmlsl.u8 q6, d26, d1 + + vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d27, d4 + vmlsl.u8 q5, d28, d4 + vmlsl.u8 q6, d29, d4 + + vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d25, d2 + vmlal.u8 q5, d26, d2 + vmlal.u8 q6, d27, d2 + + vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q4, d28, d5 + vmlal.u8 q5, d29, d5 + vmlal.u8 q6, d30, d5 + + vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d26, d3 + vmull.u8 q9, d27, d3 + vmull.u8 q10, d28, d3 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vst1.u8 {d6}, [r4], r5 ;store result + vst1.u8 {d7}, [r4], r5 + vst1.u8 {d8}, [r4], r5 + vst1.u8 {d9}, [r4], r5 + + add sp, sp, #32 + pop {r4-r5,pc} + +;-------------------- +firstpass_filter8x4_only + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + vld1.u8 {q3}, [r0], r1 ;load src data + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vld1.u8 {q4}, [r0], r1 + vdup.8 d1, d24[4] + vld1.u8 {q5}, [r0], r1 + vdup.8 d2, d25[0] + vld1.u8 {q6}, [r0], r1 + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + +;First pass: output_height lines x output_width columns (4x8) + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q8, d8, d0 + vmull.u8 q9, d10, d0 + vmull.u8 q10, d12, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d8, d9, #1 + vext.8 d30, d10, d11, #1 + vext.8 d31, d12, d13, #1 + + vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d29, d1 + vmlsl.u8 q9, d30, d1 + vmlsl.u8 q10, d31, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d8, d9, #4 + vext.8 d30, d10, d11, #4 + vext.8 d31, d12, d13, #4 + + vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d29, d4 + vmlsl.u8 q9, d30, d4 + vmlsl.u8 q10, d31, d4 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d8, d9, #2 + vext.8 d30, d10, d11, #2 + vext.8 d31, d12, d13, #2 + + vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d29, d2 + vmlal.u8 q9, d30, d2 + vmlal.u8 q10, d31, d2 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d8, d9, #5 + vext.8 d30, d10, d11, #5 + vext.8 d31, d12, d13, #5 + + vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q8, d29, d5 + vmlal.u8 q9, d30, d5 + vmlal.u8 q10, d31, d5 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d8, d9, #3 + vext.8 d30, d10, d11, #3 + vext.8 d31, d12, d13, #3 + + vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q4, d29, d3 + vmull.u8 q5, d30, d3 + vmull.u8 q6, d31, d3 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d23, q8, #7 + vqrshrun.s16 d24, q9, #7 + vqrshrun.s16 d25, q10, #7 + + vst1.u8 {d22}, [r4], r5 ;store result + vst1.u8 
{d23}, [r4], r5 + vst1.u8 {d24}, [r4], r5 + vst1.u8 {d25}, [r4], r5 + + pop {r4-r5,pc} + +;--------------------- +secondpass_filter8x4_only +;Second pass: 8x4 + add r3, r12, r3, lsl #5 + sub r0, r0, r1, lsl #1 + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vld1.u8 {d22}, [r0], r1 + vld1.u8 {d23}, [r0], r1 + vld1.u8 {d24}, [r0], r1 + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vld1.u8 {d25}, [r0], r1 + vdup.8 d1, d14[4] + vld1.u8 {d26}, [r0], r1 + vdup.8 d2, d15[0] + vld1.u8 {d27}, [r0], r1 + vdup.8 d3, d15[4] + vld1.u8 {d28}, [r0], r1 + vdup.8 d4, d16[0] + vld1.u8 {d29}, [r0], r1 + vdup.8 d5, d16[4] + vld1.u8 {d30}, [r0], r1 + + vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d23, d0 + vmull.u8 q5, d24, d0 + vmull.u8 q6, d25, d0 + + vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d24, d1 + vmlsl.u8 q5, d25, d1 + vmlsl.u8 q6, d26, d1 + + vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d27, d4 + vmlsl.u8 q5, d28, d4 + vmlsl.u8 q6, d29, d4 + + vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d25, d2 + vmlal.u8 q5, d26, d2 + vmlal.u8 q6, d27, d2 + + vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q4, d28, d5 + vmlal.u8 q5, d29, d5 + vmlal.u8 q6, d30, d5 + + vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d26, d3 + vmull.u8 q9, d27, d3 + vmull.u8 q10, d28, d3 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vst1.u8 {d6}, [r4], r5 ;store result + vst1.u8 {d7}, [r4], r5 + vst1.u8 {d8}, [r4], r5 + vst1.u8 {d9}, [r4], r5 + + pop {r4-r5,pc} + + ENDP + +;----------------- + AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_filter8_coeff_ + DCD filter8_coeff +filter8_coeff + DCD 0, 0, 128, 0, 0, 0, 0, 0 + DCD 0, -6, 123, 12, -1, 0, 0, 0 + DCD 2, -11, 108, 36, -8, 1, 0, 0 + DCD 0, -9, 93, 50, -6, 0, 0, 0 + DCD 3, -16, 77, 77, -16, 3, 0, 0 + DCD 0, -6, 50, 93, -9, 0, 0, 0 + DCD 1, -8, 36, 108, -11, 2, 0, 0 + DCD 0, -1, 12, 123, -6, 0, 0, 0 + + END diff --git a/vp8/common/arm/neon/sixtappredict8x8_neon.asm b/vp8/common/arm/neon/sixtappredict8x8_neon.asm new file mode 100644 index 000000000..d27485e6c --- /dev/null +++ b/vp8/common/arm/neon/sixtappredict8x8_neon.asm @@ -0,0 +1,527 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
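+;
+; Layout note: the 8x8 case needs 13 intermediate rows (8 output + 5 context).
+; The first-pass loop below runs twice to produce eight rows into a 64-byte
+; stack buffer; the remaining five rows are filtered separately afterwards.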
+; + + + EXPORT |vp8_sixtap_predict8x8_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack(r4) unsigned char *dst_ptr, +; stack(r5) int dst_pitch + +|vp8_sixtap_predict8x8_neon| PROC + push {r4-r5, lr} + + ldr r12, _filter8_coeff_ + + ldr r4, [sp, #12] ;load parameters from stack + ldr r5, [sp, #16] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_filter8x8_only + + add r2, r12, r2, lsl #5 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {q14, q15}, [r2] ;load first_pass filter + + beq firstpass_filter8x8_only + + sub sp, sp, #64 ;reserve space on stack for temporary storage + mov lr, sp + + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + mov r2, #2 ;loop counter + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + sub r0, r0, r1, lsl #1 + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + +;First pass: output_height lines x output_width columns (13x8) + vld1.u8 {q3}, [r0], r1 ;load src data + vdup.8 d3, d25[4] + vld1.u8 {q4}, [r0], r1 + vdup.8 d4, d26[0] + vld1.u8 {q5}, [r0], r1 + vdup.8 d5, d26[4] + vld1.u8 {q6}, [r0], r1 + +filt_blk2d_fp8x8_loop_neon + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q8, d8, d0 + vmull.u8 q9, d10, d0 + vmull.u8 q10, d12, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d8, d9, #1 + vext.8 d30, d10, d11, #1 + vext.8 d31, d12, d13, #1 + + vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d29, d1 + vmlsl.u8 q9, d30, d1 + vmlsl.u8 q10, d31, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d8, d9, #4 + vext.8 d30, d10, d11, #4 + vext.8 d31, d12, d13, #4 + + vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d29, d4 + vmlsl.u8 q9, d30, d4 + vmlsl.u8 q10, d31, d4 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d8, d9, #2 + vext.8 d30, d10, d11, #2 + vext.8 d31, d12, d13, #2 + + vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d29, d2 + vmlal.u8 q9, d30, d2 + vmlal.u8 q10, d31, d2 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d8, d9, #5 + vext.8 d30, d10, d11, #5 + vext.8 d31, d12, d13, #5 + + vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q8, d29, d5 + vmlal.u8 q9, d30, d5 + vmlal.u8 q10, d31, d5 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d8, d9, #3 + vext.8 d30, d10, d11, #3 + vext.8 d31, d12, d13, #3 + + vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q4, d29, d3 + vmull.u8 q5, d30, d3 + vmull.u8 q6, d31, d3 + + subs r2, r2, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vld1.u8 {q3}, [r0], r1 ;load src data + + vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d23, q8, #7 + vqrshrun.s16 d24, q9, #7 + vqrshrun.s16 d25, q10, #7 + + vst1.u8 {d22}, [lr]! ;store result + vld1.u8 {q4}, [r0], r1 + vst1.u8 {d23}, [lr]! + vld1.u8 {q5}, [r0], r1 + vst1.u8 {d24}, [lr]! + vld1.u8 {q6}, [r0], r1 + vst1.u8 {d25}, [lr]! 
+ + bne filt_blk2d_fp8x8_loop_neon + + ;first_pass filtering on the rest 5-line data + ;vld1.u8 {q3}, [r0], r1 ;load src data + ;vld1.u8 {q4}, [r0], r1 + ;vld1.u8 {q5}, [r0], r1 + ;vld1.u8 {q6}, [r0], r1 + vld1.u8 {q7}, [r0], r1 + + vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q9, d8, d0 + vmull.u8 q10, d10, d0 + vmull.u8 q11, d12, d0 + vmull.u8 q12, d14, d0 + + vext.8 d27, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d28, d8, d9, #1 + vext.8 d29, d10, d11, #1 + vext.8 d30, d12, d13, #1 + vext.8 d31, d14, d15, #1 + + vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q9, d28, d1 + vmlsl.u8 q10, d29, d1 + vmlsl.u8 q11, d30, d1 + vmlsl.u8 q12, d31, d1 + + vext.8 d27, d6, d7, #4 ;construct src_ptr[2] + vext.8 d28, d8, d9, #4 + vext.8 d29, d10, d11, #4 + vext.8 d30, d12, d13, #4 + vext.8 d31, d14, d15, #4 + + vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q9, d28, d4 + vmlsl.u8 q10, d29, d4 + vmlsl.u8 q11, d30, d4 + vmlsl.u8 q12, d31, d4 + + vext.8 d27, d6, d7, #2 ;construct src_ptr[0] + vext.8 d28, d8, d9, #2 + vext.8 d29, d10, d11, #2 + vext.8 d30, d12, d13, #2 + vext.8 d31, d14, d15, #2 + + vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q9, d28, d2 + vmlal.u8 q10, d29, d2 + vmlal.u8 q11, d30, d2 + vmlal.u8 q12, d31, d2 + + vext.8 d27, d6, d7, #5 ;construct src_ptr[3] + vext.8 d28, d8, d9, #5 + vext.8 d29, d10, d11, #5 + vext.8 d30, d12, d13, #5 + vext.8 d31, d14, d15, #5 + + vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q9, d28, d5 + vmlal.u8 q10, d29, d5 + vmlal.u8 q11, d30, d5 + vmlal.u8 q12, d31, d5 + + vext.8 d27, d6, d7, #3 ;construct src_ptr[1] + vext.8 d28, d8, d9, #3 + vext.8 d29, d10, d11, #3 + vext.8 d30, d12, d13, #3 + vext.8 d31, d14, d15, #3 + + vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q4, d28, d3 + vmull.u8 q5, d29, d3 + vmull.u8 q6, d30, d3 + vmull.u8 q7, d31, d3 + + vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q9, q4 + vqadd.s16 q10, q5 + vqadd.s16 q11, q6 + vqadd.s16 q12, q7 + + add r3, r12, r3, lsl #5 + + vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8 + sub lr, lr, #64 + vqrshrun.s16 d27, q9, #7 + vld1.u8 {q9}, [lr]! ;load intermediate data from stack + vqrshrun.s16 d28, q10, #7 + vld1.u8 {q10}, [lr]! + + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + + vqrshrun.s16 d29, q11, #7 + vld1.u8 {q11}, [lr]! + + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vqrshrun.s16 d30, q12, #7 + vld1.u8 {q12}, [lr]! 
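+
+    ;q9-q12 hold the first eight intermediate rows reloaded from the 64-byte
+    ;stack buffer; d26-d30 carry the last five rows straight from the first pass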
+ +;Second pass: 8x8 + mov r3, #2 ;loop counter + + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + +filt_blk2d_sp8x8_loop_neon + vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d19, d0 + vmull.u8 q5, d20, d0 + vmull.u8 q6, d21, d0 + + vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d20, d1 + vmlsl.u8 q5, d21, d1 + vmlsl.u8 q6, d22, d1 + + vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d23, d4 + vmlsl.u8 q5, d24, d4 + vmlsl.u8 q6, d25, d4 + + vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d21, d2 + vmlal.u8 q5, d22, d2 + vmlal.u8 q6, d23, d2 + + vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q4, d24, d5 + vmlal.u8 q5, d25, d5 + vmlal.u8 q6, d26, d5 + + vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d22, d3 + vmull.u8 q9, d23, d3 + vmull.u8 q10, d24, d3 + + subs r3, r3, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vmov q9, q11 + vst1.u8 {d6}, [r4], r5 ;store result + vmov q10, q12 + vst1.u8 {d7}, [r4], r5 + vmov q11, q13 + vst1.u8 {d8}, [r4], r5 + vmov q12, q14 + vst1.u8 {d9}, [r4], r5 + vmov d26, d30 + + bne filt_blk2d_sp8x8_loop_neon + + add sp, sp, #64 + pop {r4-r5,pc} + +;--------------------- +firstpass_filter8x8_only + ;add r2, r12, r2, lsl #5 ;calculate filter location + ;vld1.s32 {q14, q15}, [r2] ;load first_pass filter + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + mov r2, #2 ;loop counter + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + +;First pass: output_height lines x output_width columns (8x8) +filt_blk2d_fpo8x8_loop_neon + vld1.u8 {q3}, [r0], r1 ;load src data + vld1.u8 {q4}, [r0], r1 + vld1.u8 {q5}, [r0], r1 + vld1.u8 {q6}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q8, d8, d0 + vmull.u8 q9, d10, d0 + vmull.u8 q10, d12, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d8, d9, #1 + vext.8 d30, d10, d11, #1 + vext.8 d31, d12, d13, #1 + + vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q8, d29, d1 + vmlsl.u8 q9, d30, d1 + vmlsl.u8 q10, d31, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d8, d9, #4 + vext.8 d30, d10, d11, #4 + vext.8 d31, d12, d13, #4 + + vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q8, d29, d4 + vmlsl.u8 q9, d30, d4 + vmlsl.u8 q10, d31, d4 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d8, d9, #2 + vext.8 d30, d10, d11, #2 + vext.8 d31, d12, d13, #2 + + vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q8, d29, d2 + vmlal.u8 q9, d30, d2 + vmlal.u8 q10, d31, d2 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d8, d9, #5 + vext.8 d30, d10, d11, #5 + vext.8 d31, d12, d13, #5 + + vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q8, d29, d5 + vmlal.u8 q9, d30, d5 + vmlal.u8 q10, d31, d5 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d8, d9, #3 + vext.8 d30, d10, d11, #3 + vext.8 d31, d12, d13, #3 + + vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) + 
vmull.u8 q4, d29, d3 + vmull.u8 q5, d30, d3 + vmull.u8 q6, d31, d3 + ; + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + subs r2, r2, #1 + + vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d23, q8, #7 + vqrshrun.s16 d24, q9, #7 + vqrshrun.s16 d25, q10, #7 + + vst1.u8 {d22}, [r4], r5 ;store result + vst1.u8 {d23}, [r4], r5 + vst1.u8 {d24}, [r4], r5 + vst1.u8 {d25}, [r4], r5 + + bne filt_blk2d_fpo8x8_loop_neon + + pop {r4-r5,pc} + +;--------------------- +secondpass_filter8x8_only + sub r0, r0, r1, lsl #1 + add r3, r12, r3, lsl #5 + + vld1.u8 {d18}, [r0], r1 ;load src data + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + vld1.u8 {d19}, [r0], r1 + vabs.s32 q7, q5 + vld1.u8 {d20}, [r0], r1 + vabs.s32 q8, q6 + vld1.u8 {d21}, [r0], r1 + mov r3, #2 ;loop counter + vld1.u8 {d22}, [r0], r1 + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vld1.u8 {d23}, [r0], r1 + vdup.8 d1, d14[4] + vld1.u8 {d24}, [r0], r1 + vdup.8 d2, d15[0] + vld1.u8 {d25}, [r0], r1 + vdup.8 d3, d15[4] + vld1.u8 {d26}, [r0], r1 + vdup.8 d4, d16[0] + vld1.u8 {d27}, [r0], r1 + vdup.8 d5, d16[4] + vld1.u8 {d28}, [r0], r1 + vld1.u8 {d29}, [r0], r1 + vld1.u8 {d30}, [r0], r1 + +;Second pass: 8x8 +filt_blk2d_spo8x8_loop_neon + vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) + vmull.u8 q4, d19, d0 + vmull.u8 q5, d20, d0 + vmull.u8 q6, d21, d0 + + vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) + vmlsl.u8 q4, d20, d1 + vmlsl.u8 q5, d21, d1 + vmlsl.u8 q6, d22, d1 + + vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) + vmlsl.u8 q4, d23, d4 + vmlsl.u8 q5, d24, d4 + vmlsl.u8 q6, d25, d4 + + vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) + vmlal.u8 q4, d21, d2 + vmlal.u8 q5, d22, d2 + vmlal.u8 q6, d23, d2 + + vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) + vmlal.u8 q4, d24, d5 + vmlal.u8 q5, d25, d5 + vmlal.u8 q6, d26, d5 + + vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) + vmull.u8 q8, d22, d3 + vmull.u8 q9, d23, d3 + vmull.u8 q10, d24, d3 + + subs r3, r3, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vmov q9, q11 + vst1.u8 {d6}, [r4], r5 ;store result + vmov q10, q12 + vst1.u8 {d7}, [r4], r5 + vmov q11, q13 + vst1.u8 {d8}, [r4], r5 + vmov q12, q14 + vst1.u8 {d9}, [r4], r5 + vmov d26, d30 + + bne filt_blk2d_spo8x8_loop_neon + + pop {r4-r5,pc} + + ENDP + +;----------------- + AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_filter8_coeff_ + DCD filter8_coeff +filter8_coeff + DCD 0, 0, 128, 0, 0, 0, 0, 0 + DCD 0, -6, 123, 12, -1, 0, 0, 0 + DCD 2, -11, 108, 36, -8, 1, 0, 0 + DCD 0, -9, 93, 50, -6, 0, 0, 0 + DCD 3, -16, 77, 77, -16, 3, 0, 0 + DCD 0, -6, 50, 93, -9, 0, 0, 0 + DCD 1, -8, 36, 108, -11, 2, 0, 0 + DCD 0, -1, 12, 123, -6, 0, 0, 0 + + END diff --git a/vp8/common/arm/recon_arm.c b/vp8/common/arm/recon_arm.c new file mode 100644 index 000000000..130059e64 --- /dev/null +++ b/vp8/common/arm/recon_arm.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#include "recon.h" +#include "blockd.h" + +extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr); + +/* +void vp8_recon16x16mby(MACROBLOCKD *x) +{ + int i; + for(i=0;i<16;i+=4) + { + //vp8_recon4b(&x->block[i]); + BLOCKD *b = &x->block[i]; + vp8_recon4b (b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + } +} +*/ +void vp8_recon16x16mby(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) +{ + BLOCKD *b = &x->block[0]; + RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + + //b = &x->block[4]; + b += 4; + RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + + //b = &x->block[8]; + b += 4; + RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + + //b = &x->block[12]; + b += 4; + RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); +} + +#if HAVE_ARMV7 +void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) +{ + unsigned char *pred_ptr = &x->predictor[0]; + short *diff_ptr = &x->diff[0]; + unsigned char *dst_ptr = x->dst.y_buffer; + unsigned char *udst_ptr = x->dst.u_buffer; + unsigned char *vdst_ptr = x->dst.v_buffer; + int ystride = x->dst.y_stride; + //int uv_stride = x->dst.uv_stride; + + vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride, udst_ptr, vdst_ptr); +} + +#else +/* +void vp8_recon16x16mb(MACROBLOCKD *x) +{ + int i; + + for(i=0;i<16;i+=4) + { +// vp8_recon4b(&x->block[i]); + BLOCKD *b = &x->block[i]; + vp8_recon4b (b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + + } + for(i=16;i<24;i+=2) + { +// vp8_recon2b(&x->block[i]); + BLOCKD *b = &x->block[i]; + vp8_recon2b (b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + } +} +*/ +void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) +{ + BLOCKD *b = &x->block[0]; + + RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 4; + RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 4; + RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 4; + RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 4; + + //b = &x->block[16]; + + RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b++; + b++; + RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b++; + b++; + RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b++; + b++; + RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); +} +#endif diff --git a/vp8/common/arm/recon_arm.h b/vp8/common/arm/recon_arm.h new file mode 100644 index 000000000..fd9f85eea --- /dev/null +++ b/vp8/common/arm/recon_arm.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef RECON_ARM_H +#define RECON_ARM_H + +#if HAVE_ARMV6 +extern prototype_recon_block(vp8_recon_b_armv6); +extern prototype_recon_block(vp8_recon2b_armv6); +extern prototype_recon_block(vp8_recon4b_armv6); + +extern prototype_copy_block(vp8_copy_mem8x8_v6); +extern prototype_copy_block(vp8_copy_mem8x4_v6); +extern prototype_copy_block(vp8_copy_mem16x16_v6); + +#undef vp8_recon_recon +#define vp8_recon_recon vp8_recon_b_armv6 + +#undef vp8_recon_recon2 +#define vp8_recon_recon2 vp8_recon2b_armv6 + +#undef vp8_recon_recon4 +#define vp8_recon_recon4 vp8_recon4b_armv6 + +#undef vp8_recon_copy8x8 +#define vp8_recon_copy8x8 vp8_copy_mem8x8_v6 + +#undef vp8_recon_copy8x4 +#define vp8_recon_copy8x4 vp8_copy_mem8x4_v6 + +#undef vp8_recon_copy16x16 +#define vp8_recon_copy16x16 vp8_copy_mem16x16_v6 +#endif + +#if HAVE_ARMV7 +extern prototype_recon_block(vp8_recon_b_neon); +extern prototype_recon_block(vp8_recon2b_neon); +extern prototype_recon_block(vp8_recon4b_neon); + +extern prototype_copy_block(vp8_copy_mem8x8_neon); +extern prototype_copy_block(vp8_copy_mem8x4_neon); +extern prototype_copy_block(vp8_copy_mem16x16_neon); + +#undef vp8_recon_recon +#define vp8_recon_recon vp8_recon_b_neon + +#undef vp8_recon_recon2 +#define vp8_recon_recon2 vp8_recon2b_neon + +#undef vp8_recon_recon4 +#define vp8_recon_recon4 vp8_recon4b_neon + +#undef vp8_recon_copy8x8 +#define vp8_recon_copy8x8 vp8_copy_mem8x8_neon + +#undef vp8_recon_copy8x4 +#define vp8_recon_copy8x4 vp8_copy_mem8x4_neon + +#undef vp8_recon_copy16x16 +#define vp8_recon_copy16x16 vp8_copy_mem16x16_neon +#endif + +#endif diff --git a/vp8/common/arm/reconintra4x4_arm.c b/vp8/common/arm/reconintra4x4_arm.c new file mode 100644 index 000000000..334d35236 --- /dev/null +++ b/vp8/common/arm/reconintra4x4_arm.c @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "vpx_ports/config.h" +#include "recon.h" +#include "vpx_mem/vpx_mem.h" +#include "reconintra.h" + +void vp8_predict_intra4x4(BLOCKD *x, + int b_mode, + unsigned char *predictor) +{ + int i, r, c; + + unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride; + unsigned char Left[4]; + unsigned char top_left = Above[-1]; + + Left[0] = (*(x->base_dst))[x->dst - 1]; + Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride]; + Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride]; + Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride]; + + switch (b_mode) + { + case B_DC_PRED: + { + int expected_dc = 0; + + for (i = 0; i < 4; i++) + { + expected_dc += Above[i]; + expected_dc += Left[i]; + } + + expected_dc = (expected_dc + 4) >> 3; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + predictor[c] = expected_dc; + } + + predictor += 16; + } + } + break; + case B_TM_PRED: + { + // prediction similar to true_motion prediction + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + int pred = Above[c] - top_left + Left[r]; + + if (pred < 0) + pred = 0; + + if (pred > 255) + pred = 255; + + predictor[c] = pred; + } + + predictor += 16; + } + } + break; + + case B_VE_PRED: + { + + unsigned int ap[4]; + ap[0] = (top_left + 2 * Above[0] + Above[1] + 2) >> 2; + ap[1] = (Above[0] + 2 * Above[1] + Above[2] + 2) >> 2; + ap[2] = (Above[1] + 2 * Above[2] + Above[3] + 2) >> 2; + ap[3] = (Above[2] + 2 * Above[3] + Above[4] + 2) >> 2; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + + predictor[c] = ap[c]; + } + + predictor += 16; + } + + } + break; + + + case B_HE_PRED: + { + + unsigned int lp[4]; + lp[0] = (top_left + 2 * Left[0] + Left[1] + 2) >> 2; + lp[1] = (Left[0] + 2 * Left[1] + Left[2] + 2) >> 2; + lp[2] = (Left[1] + 2 * Left[2] + Left[3] + 2) >> 2; + lp[3] = (Left[2] + 2 * Left[3] + Left[3] + 2) >> 2; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + predictor[c] = lp[r]; + } + + predictor += 16; + } + } + break; + case B_LD_PRED: + { + unsigned char *ptr = Above; + predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2; + predictor[0 * 16 + 1] = + predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2; + predictor[0 * 16 + 2] = + predictor[1 * 16 + 1] = + predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2; + predictor[0 * 16 + 3] = + predictor[1 * 16 + 2] = + predictor[2 * 16 + 1] = + predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2; + predictor[1 * 16 + 3] = + predictor[2 * 16 + 2] = + predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2; + predictor[2 * 16 + 3] = + predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2; + predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2; + + } + break; + case B_RD_PRED: + { + + unsigned char pp[9]; + + pp[0] = Left[3]; + pp[1] = Left[2]; + pp[2] = Left[1]; + pp[3] = Left[0]; + pp[4] = top_left; + pp[5] = Above[0]; + pp[6] = Above[1]; + pp[7] = Above[2]; + pp[8] = Above[3]; + + predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[3 * 16 + 1] = + predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[3 * 16 + 2] = + predictor[2 * 16 + 1] = + predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[3 * 16 + 3] = + predictor[2 * 16 + 2] = + predictor[1 * 16 + 1] = + predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * 16 + 3] = + predictor[1 * 16 + 2] = + predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[1 
* 16 + 3] = + predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; + + } + break; + case B_VR_PRED: + { + + unsigned char pp[9]; + + pp[0] = Left[3]; + pp[1] = Left[2]; + pp[2] = Left[1]; + pp[3] = Left[0]; + pp[4] = top_left; + pp[5] = Above[0]; + pp[6] = Above[1]; + pp[7] = Above[2]; + pp[8] = Above[3]; + + + predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[3 * 16 + 1] = + predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * 16 + 1] = + predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1; + predictor[3 * 16 + 2] = + predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[2 * 16 + 2] = + predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1; + predictor[3 * 16 + 3] = + predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + predictor[2 * 16 + 3] = + predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1; + predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; + predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1; + + } + break; + case B_VL_PRED: + { + + unsigned char *pp = Above; + + predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[2 * 16 + 0] = + predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1; + predictor[1 * 16 + 1] = + predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[2 * 16 + 1] = + predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1; + predictor[3 * 16 + 1] = + predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[0 * 16 + 3] = + predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1; + predictor[1 * 16 + 3] = + predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + } + break; + + case B_HD_PRED: + { + unsigned char pp[9]; + pp[0] = Left[3]; + pp[1] = Left[2]; + pp[2] = Left[1]; + pp[3] = Left[0]; + pp[4] = top_left; + pp[5] = Above[0]; + pp[6] = Above[1]; + pp[7] = Above[2]; + pp[8] = Above[3]; + + + predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[2 * 16 + 0] = + predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1; + predictor[2 * 16 + 1] = + predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[2 * 16 + 2] = + predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1; + predictor[2 * 16 + 3] = + predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[1 * 16 + 2] = + predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1; + predictor[1 * 16 + 3] = + predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + } + break; + + + case B_HU_PRED: + { + unsigned char *pp = Left; + predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[0 * 16 + 2] = + predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1; + predictor[0 * 16 + 3] = + predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[1 * 16 + 2] = + predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1; + predictor[1 * 16 + 3] = + predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2; + predictor[2 * 16 + 2] = + predictor[2 * 16 + 3] = + predictor[3 * 16 + 0] = + predictor[3 * 16 
+ 1] = + predictor[3 * 16 + 2] = + predictor[3 * 16 + 3] = pp[3]; + } + break; + + + } +} +// copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and +// to the right prediction have filled in pixels to use. +void vp8_intra_prediction_down_copy(MACROBLOCKD *x) +{ + unsigned char *above_right = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16; + + unsigned int *src_ptr = (unsigned int *)above_right; + unsigned int *dst_ptr0 = (unsigned int *)(above_right + 4 * x->block[0].dst_stride); + unsigned int *dst_ptr1 = (unsigned int *)(above_right + 8 * x->block[0].dst_stride); + unsigned int *dst_ptr2 = (unsigned int *)(above_right + 12 * x->block[0].dst_stride); + + *dst_ptr0 = *src_ptr; + *dst_ptr1 = *src_ptr; + *dst_ptr2 = *src_ptr; +} + + + +/* +void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) +{ + int i; + + vp8_intra_prediction_down_copy(x); + + for(i=0;i<16;i++) + { + BLOCKD *b = &x->block[i]; + + vp8_predict_intra4x4(b, x->block[i].bmi.mode,x->block[i].predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + } + + vp8_recon_intra_mbuv(x); + +} +*/ +void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) +{ + int i; + BLOCKD *b = &x->block[0]; + + vp8_intra_prediction_down_copy(x); + + { + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, 
b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + b += 1; + + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + } + + vp8_recon_intra_mbuv(rtcd, x); + +} diff --git a/vp8/common/arm/reconintra_arm.c b/vp8/common/arm/reconintra_arm.c new file mode 100644 index 000000000..d7ee1ddfa --- /dev/null +++ b/vp8/common/arm/reconintra_arm.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#include "blockd.h" +#include "reconintra.h" +#include "vpx_mem/vpx_mem.h" +#include "recon.h" + +#if HAVE_ARMV7 +extern void vp8_build_intra_predictors_mby_neon_func( + unsigned char *y_buffer, + unsigned char *ypred_ptr, + int y_stride, + int mode, + int Up, + int Left); + +void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x) +{ + unsigned char *y_buffer = x->dst.y_buffer; + unsigned char *ypred_ptr = x->predictor; + int y_stride = x->dst.y_stride; + int mode = x->mbmi.mode; + int Up = x->up_available; + int Left = x->left_available; + + vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left); +} +#endif + + +#if HAVE_ARMV7 +extern void vp8_build_intra_predictors_mby_s_neon_func( + unsigned char *y_buffer, + unsigned char *ypred_ptr, + int y_stride, + int mode, + int Up, + int Left); + +void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x) +{ + unsigned char *y_buffer = x->dst.y_buffer; + unsigned char *ypred_ptr = x->predictor; + int y_stride = x->dst.y_stride; + int mode = x->mbmi.mode; + int Up = x->up_available; + int Left = x->left_available; + + vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left); +} + +#endif diff --git a/vp8/common/arm/subpixel_arm.h b/vp8/common/arm/subpixel_arm.h new file mode 100644 index 000000000..56aec55b9 --- /dev/null +++ b/vp8/common/arm/subpixel_arm.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef SUBPIXEL_ARM_H +#define SUBPIXEL_ARM_H + +#if HAVE_ARMV6 +extern prototype_subpixel_predict(vp8_sixtap_predict16x16_armv6); +extern prototype_subpixel_predict(vp8_sixtap_predict8x8_armv6); +extern prototype_subpixel_predict(vp8_sixtap_predict8x4_armv6); +extern prototype_subpixel_predict(vp8_sixtap_predict_armv6); +extern prototype_subpixel_predict(vp8_bilinear_predict16x16_armv6); +extern prototype_subpixel_predict(vp8_bilinear_predict8x8_armv6); +extern prototype_subpixel_predict(vp8_bilinear_predict8x4_armv6); +extern prototype_subpixel_predict(vp8_bilinear_predict4x4_armv6); + +#undef vp8_subpix_sixtap16x16 +#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_armv6 + +#undef vp8_subpix_sixtap8x8 +#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_armv6 + +#undef vp8_subpix_sixtap8x4 +#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_armv6 + +#undef vp8_subpix_sixtap4x4 +#define vp8_subpix_sixtap4x4 vp8_sixtap_predict_armv6 + +#undef vp8_subpix_bilinear16x16 +#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_armv6 + +#undef vp8_subpix_bilinear8x8 +#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_armv6 + +#undef vp8_subpix_bilinear8x4 +#define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_armv6 + +#undef vp8_subpix_bilinear4x4 +#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_armv6 +#endif + +#if HAVE_ARMV7 +extern prototype_subpixel_predict(vp8_sixtap_predict16x16_neon); +extern prototype_subpixel_predict(vp8_sixtap_predict8x8_neon); +extern prototype_subpixel_predict(vp8_sixtap_predict8x4_neon); +extern prototype_subpixel_predict(vp8_sixtap_predict_neon); +extern prototype_subpixel_predict(vp8_bilinear_predict16x16_neon); +extern prototype_subpixel_predict(vp8_bilinear_predict8x8_neon); +extern prototype_subpixel_predict(vp8_bilinear_predict8x4_neon); +extern prototype_subpixel_predict(vp8_bilinear_predict4x4_neon); + +#undef vp8_subpix_sixtap16x16 +#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_neon + +#undef vp8_subpix_sixtap8x8 +#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_neon + +#undef vp8_subpix_sixtap8x4 +#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_neon + +#undef vp8_subpix_sixtap4x4 +#define vp8_subpix_sixtap4x4 vp8_sixtap_predict_neon + +#undef vp8_subpix_bilinear16x16 +#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_neon + +#undef vp8_subpix_bilinear8x8 +#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_neon + +#undef vp8_subpix_bilinear8x4 +#define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_neon + +#undef vp8_subpix_bilinear4x4 +#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_neon +#endif + +#endif diff --git a/vp8/common/arm/systemdependent.c b/vp8/common/arm/systemdependent.c new file mode 100644 index 000000000..ecc6929c0 --- /dev/null +++ b/vp8/common/arm/systemdependent.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "vpx_ports/config.h" +#include "g_common.h" +#include "pragmas.h" +#include "subpixel.h" +#include "loopfilter.h" +#include "recon.h" +#include "idct.h" +#include "onyxc_int.h" + +void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x); +extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x); +extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x); + +void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x); +extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x); +extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x); + +void vp8_machine_specific_config(VP8_COMMON *ctx) +{ +#if CONFIG_RUNTIME_CPU_DETECT + VP8_COMMON_RTCD *rtcd = &ctx->rtcd; + +#if HAVE_ARMV7 + rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_neon; + rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_neon; + rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_neon; + rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_neon; + rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_neon; + rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_neon; + rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_neon; + rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon; + + rtcd->idct.idct1 = vp8_short_idct4x4llm_1_neon; + rtcd->idct.idct16 = vp8_short_idct4x4llm_neon; + rtcd->idct.idct1_scalar = vp8_dc_only_idct_neon; + rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon; + rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon; + + rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon; + rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_neon; + rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_neon; + rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_neon; + rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_neon; + rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_neon; + rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_neon; + rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_neon; + + rtcd->recon.copy16x16 = vp8_copy_mem16x16_neon; + rtcd->recon.copy8x8 = vp8_copy_mem8x8_neon; + rtcd->recon.copy8x4 = vp8_copy_mem8x4_neon; + rtcd->recon.recon = vp8_recon_b_neon; + rtcd->recon.recon2 = vp8_recon2b_neon; + rtcd->recon.recon4 = vp8_recon4b_neon; +#elif HAVE_ARMV6 + + rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_armv6; + rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_armv6; + rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_armv6; + rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_armv6; + rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_armv6; + rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_armv6; + rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_armv6; + rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6; + + rtcd->idct.idct1 = vp8_short_idct4x4llm_1_v6; + rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual; + rtcd->idct.idct1_scalar = vp8_dc_only_idct_armv6; + rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_armv6; + rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_armv6; + + rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6; + rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_armv6; + rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6; + rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_armv6; + rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6; + rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_armv6; + rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6; + rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_armv6; + + rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6; + rtcd->recon.copy8x8 = vp8_copy_mem8x8_v6; + rtcd->recon.copy8x4 = vp8_copy_mem8x4_v6; + 
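+
+    /* recon/recon2/recon4 add the inverse-transform residual in b->diff
+     * back onto b->predictor and store the clamped result; with
+     * CONFIG_RUNTIME_CPU_DETECT off, recon_arm.h binds these same ARMv6
+     * routines statically. */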
rtcd->recon.recon = vp8_recon_b_armv6; + rtcd->recon.recon2 = vp8_recon2b_armv6; + rtcd->recon.recon4 = vp8_recon4b_armv6; +#else +//pure c + rtcd->idct.idct1 = vp8_short_idct4x4llm_1_c; + rtcd->idct.idct16 = vp8_short_idct4x4llm_c; + rtcd->idct.idct1_scalar = vp8_dc_only_idct_c; + rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_c; + rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_c; + + rtcd->recon.copy16x16 = vp8_copy_mem16x16_c; + rtcd->recon.copy8x8 = vp8_copy_mem8x8_c; + rtcd->recon.copy8x4 = vp8_copy_mem8x4_c; + rtcd->recon.recon = vp8_recon_b_c; + rtcd->recon.recon2 = vp8_recon2b_c; + rtcd->recon.recon4 = vp8_recon4b_c; + + rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c; + rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c; + rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_c; + rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_c; + rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_c; + rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_c; + rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_c; + rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_c; + + rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_c; + rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_c; + rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_c; + rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_c; + rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_c; + rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_c; + rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_c; + rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_c; +#endif + + rtcd->postproc.down = vp8_mbpost_proc_down_c; + rtcd->postproc.across = vp8_mbpost_proc_across_ip_c; + rtcd->postproc.downacross = vp8_post_proc_down_and_across_c; + rtcd->postproc.addnoise = vp8_plane_add_noise_c; +#endif + +#if HAVE_ARMV7 + vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby_neon; + vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s_neon; +#elif HAVE_ARMV6 + vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby; + vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s; +#else + vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby; + vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s; + +#endif + +} diff --git a/vp8/common/arm/vpx_asm_offsets.c b/vp8/common/arm/vpx_asm_offsets.c new file mode 100644 index 000000000..68634bf55 --- /dev/null +++ b/vp8/common/arm/vpx_asm_offsets.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "vpx_ports/config.h" +#include <stddef.h> + +#if CONFIG_VP8_ENCODER +#include "vpx_scale/yv12config.h" +#endif + +#if CONFIG_VP8_DECODER +#include "onyxd_int.h" +#endif + +#define DEFINE(sym, val) int sym = val; + +/* +#define BLANK() asm volatile("\n->" : : ) +*/ + +/* + * int main(void) + * { + */ + +#if CONFIG_VP8_DECODER || CONFIG_VP8_ENCODER +DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width)); +DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height)); +DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride)); +DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width)); +DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height)); +DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride)); +DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer)); +DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer)); +DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer)); +DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border)); +#endif + +#if CONFIG_VP8_DECODER +DEFINE(mb_diff, offsetof(MACROBLOCKD, diff)); +DEFINE(mb_predictor, offsetof(MACROBLOCKD, predictor)); +DEFINE(mb_dst_y_stride, offsetof(MACROBLOCKD, dst.y_stride)); +DEFINE(mb_dst_y_buffer, offsetof(MACROBLOCKD, dst.y_buffer)); +DEFINE(mb_dst_u_buffer, offsetof(MACROBLOCKD, dst.u_buffer)); +DEFINE(mb_dst_v_buffer, offsetof(MACROBLOCKD, dst.v_buffer)); +DEFINE(mb_mbmi_mode, offsetof(MACROBLOCKD, mbmi.mode)); +DEFINE(mb_up_available, offsetof(MACROBLOCKD, up_available)); +DEFINE(mb_left_available, offsetof(MACROBLOCKD, left_available)); + +DEFINE(detok_scan, offsetof(DETOK, scan)); +DEFINE(detok_ptr_onyxblock2context_leftabove, offsetof(DETOK, ptr_onyxblock2context_leftabove)); +DEFINE(detok_onyx_coef_tree_ptr, offsetof(DETOK, vp8_coef_tree_ptr)); +DEFINE(detok_teb_base_ptr, offsetof(DETOK, teb_base_ptr)); +DEFINE(detok_norm_ptr, offsetof(DETOK, norm_ptr)); +DEFINE(detok_ptr_onyx_coef_bands_x, offsetof(DETOK, ptr_onyx_coef_bands_x)); + +DEFINE(DETOK_A, offsetof(DETOK, A)); +DEFINE(DETOK_L, offsetof(DETOK, L)); + +DEFINE(detok_qcoeff_start_ptr, offsetof(DETOK, qcoeff_start_ptr)); +DEFINE(detok_current_bc, offsetof(DETOK, current_bc)); +DEFINE(detok_coef_probs, offsetof(DETOK, coef_probs)); +DEFINE(detok_eob, offsetof(DETOK, eob)); + +DEFINE(bool_decoder_lowvalue, offsetof(BOOL_DECODER, lowvalue)); +DEFINE(bool_decoder_range, offsetof(BOOL_DECODER, range)); +DEFINE(bool_decoder_value, offsetof(BOOL_DECODER, value)); +DEFINE(bool_decoder_count, offsetof(BOOL_DECODER, count)); +DEFINE(bool_decoder_user_buffer, offsetof(BOOL_DECODER, user_buffer)); +DEFINE(bool_decoder_user_buffer_sz, offsetof(BOOL_DECODER, user_buffer_sz)); +DEFINE(bool_decoder_decode_buffer, offsetof(BOOL_DECODER, decode_buffer)); +DEFINE(bool_decoder_read_ptr, offsetof(BOOL_DECODER, read_ptr)); +DEFINE(bool_decoder_write_ptr, offsetof(BOOL_DECODER, write_ptr)); + +DEFINE(tokenextrabits_min_val, offsetof(TOKENEXTRABITS, min_val)); +DEFINE(tokenextrabits_length, offsetof(TOKENEXTRABITS, Length)); +#endif + +//add asserts for any offset that is not supported by assembly code +//add asserts for any size that is not supported by assembly code +/* + * return 0; + * } + */ |
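A note on the TODO just above: the DEFINE()d offsets are consumed by hand-written assembly, so a mismatch only shows up at run time. A minimal compile-time guard can be built from the same offsetof() values. The sketch below is illustrative only: the CT_ASSERT macro and the 128-byte bound are assumptions, not part of this tree, and it presumes the same includes as vpx_asm_offsets.c above.

#include <stddef.h>
#include "onyxd_int.h"  /* BOOL_DECODER, as included by vpx_asm_offsets.c */

/* Hypothetical guard: a negative array size fails the build when the
 * condition is false, so a bad offset is caught at compile time. */
#define CT_ASSERT(name, cond) typedef char ct_assert_##name[(cond) ? 1 : -1]

/* Illustrative bound only: keep hot BOOL_DECODER fields within reach of
 * a small immediate offset so the asm needs no extra address arithmetic. */
CT_ASSERT(bool_decoder_range_reachable, offsetof(BOOL_DECODER, range) < 128);
CT_ASSERT(bool_decoder_count_reachable, offsetof(BOOL_DECODER, count) < 128);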