author | John Koleszar <jkoleszar@google.com> | 2010-05-18 11:58:33 -0400
committer | John Koleszar <jkoleszar@google.com> | 2010-05-18 11:58:33 -0400
commit | 0ea50ce9cb4b65eee6afa1d041fe8beb5abda667 (patch)
tree | 1f3b9019f28bc56fd3156f96e5a9653a983ee61b /vp8/common/arm/armv6
Initial WebM release
Diffstat (limited to 'vp8/common/arm/armv6')
-rw-r--r-- | vp8/common/arm/armv6/bilinearfilter_v6.asm | 237
-rw-r--r-- | vp8/common/arm/armv6/copymem16x16_v6.asm | 181
-rw-r--r-- | vp8/common/arm/armv6/copymem8x4_v6.asm | 127
-rw-r--r-- | vp8/common/arm/armv6/copymem8x8_v6.asm | 127
-rw-r--r-- | vp8/common/arm/armv6/filter_v6.asm | 383
-rw-r--r-- | vp8/common/arm/armv6/idct_v6.asm | 376
-rw-r--r-- | vp8/common/arm/armv6/iwalsh_v6.asm | 151
-rw-r--r-- | vp8/common/arm/armv6/loopfilter_v6.asm | 1263
-rw-r--r-- | vp8/common/arm/armv6/recon_v6.asm | 280
-rw-r--r-- | vp8/common/arm/armv6/simpleloopfilter_v6.asm | 321
-rw-r--r-- | vp8/common/arm/armv6/sixtappredict8x4_v6.asm | 277
11 files changed, 3723 insertions, 0 deletions
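The files below are hand-scheduled ARMv6 SIMD versions of common VP8 routines (block copies, sub-pel filters, inverse transforms, loop filters). As a reading aid for the listings that follow, here is a minimal C sketch of the behaviour the copymem16x16_v6.asm routine implements; the signature is the one quoted in that file's header comment, while the function name and body here are illustrative, not part of the commit:

```c
/* Illustrative C equivalent of vp8_copy_mem16x16_v6: copy a 16x16 byte
 * block between two strided buffers. The assembly below additionally
 * selects 1-, 4-, 8- or 16-byte-wide copy loops based on the source
 * pointer's alignment; this sketch models only the net effect. */
void copy_mem16x16(unsigned char *src, int src_stride,
                   unsigned char *dst, int dst_stride)
{
    int r, c;

    for (r = 0; r < 16; r++)
    {
        for (c = 0; c < 16; c++)
            dst[c] = src[c];

        src += src_stride;
        dst += dst_stride;
    }
}
```

The alignment dispatch in the assembly exists because ARMv6 ldm/stm and word loads want aligned addresses; the unaligned fallback degrades gracefully to byte copies.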
diff --git a/vp8/common/arm/armv6/bilinearfilter_v6.asm b/vp8/common/arm/armv6/bilinearfilter_v6.asm new file mode 100644 index 000000000..4428cf8ff --- /dev/null +++ b/vp8/common/arm/armv6/bilinearfilter_v6.asm @@ -0,0 +1,237 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_filter_block2d_bil_first_pass_armv6| + EXPORT |vp8_filter_block2d_bil_second_pass_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code + +;------------------------------------- +; r0 unsigned char *src_ptr, +; r1 unsigned short *output_ptr, +; r2 unsigned int src_pixels_per_line, +; r3 unsigned int output_height, +; stack unsigned int output_width, +; stack const short *vp8_filter +;------------------------------------- +; The output is transposed stroed in output array to make it easy for second pass filtering. +|vp8_filter_block2d_bil_first_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r4, [sp, #36] ; output width + + mov r12, r3 ; outer-loop counter + sub r2, r2, r4 ; src increment for height loop + + ;;IF ARCHITECTURE=6 + pld [r0] + ;;ENDIF + + ldr r5, [r11] ; load up filter coefficients + + mov r3, r3, lsl #1 ; output_height*2 + add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1) + + mov r11, r1 ; save output_ptr for each row + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_1st_filter + +|bil_height_loop_1st_v6| + ldrb r6, [r0] ; load source data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + mov lr, r4, lsr #2 ; 4-in-parellel loop counter + +|bil_width_loop_1st_v6| + ldrb r9, [r0, #3] + ldrb r10, [r0, #4] + + pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0] + pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1] + + smuad r6, r6, r5 ; apply the filter + pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2] + smuad r7, r7, r5 + pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3] + + smuad r8, r8, r5 + smuad r9, r9, r5 + + add r0, r0, #4 + subs lr, lr, #1 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #16, r6, asr #7 + usat r7, #16, r7, asr #7 + + strh r6, [r1], r3 ; result is transposed and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strh r7, [r1], r3 + add r9, r9, #0x40 + usat r8, #16, r8, asr #7 + usat r9, #16, r9, asr #7 + + strh r8, [r1], r3 ; result is transposed and stored + + ldrneb r6, [r0] ; load source data + strh r9, [r1], r3 + + ldrneb r7, [r0, #1] + ldrneb r8, [r0, #2] + + bne bil_width_loop_1st_v6 + + add r0, r0, r2 ; move to next input row + subs r12, r12, #1 + + ;;IF ARCHITECTURE=6 + pld [r0] + ;;ENDIF + + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_1st_v6 + + ldmia sp!, {r4 - r11, pc} + +|bil_null_1st_filter| +|bil_height_loop_null_1st| + mov lr, r4, lsr #2 ; loop counter + +|bil_width_loop_null_1st| + ldrb r6, [r0] ; load data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + ldrb r9, [r0, #3] + + strh r6, [r1], r3 ; store it to immediate buffer + add r0, r0, #4 + strh r7, [r1], r3 + subs lr, lr, #1 + strh r8, [r1], r3 + strh r9, [r1], r3 + + bne bil_width_loop_null_1st + + subs r12, r12, #1 + add r0, r0, r2 ; move to next input line + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_null_1st + + 
ldmia sp!, {r4 - r11, pc} + + ENDP ; |vp8_filter_block2d_bil_first_pass_armv6| + + +;--------------------------------- +; r0 unsigned short *src_ptr, +; r1 unsigned char *output_ptr, +; r2 int output_pitch, +; r3 unsigned int output_height, +; stack unsigned int output_width, +; stack const short *vp8_filter +;--------------------------------- +|vp8_filter_block2d_bil_second_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r4, [sp, #36] ; output width + + ldr r5, [r11] ; load up filter coefficients + mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix + mov r11, r1 + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_2nd_filter + +|bil_height_loop_2nd| + ldr r6, [r0] ; load the data + ldr r8, [r0, #4] + ldrh r10, [r0, #8] + mov lr, r3, lsr #2 ; loop counter + +|bil_width_loop_2nd| + pkhtb r7, r6, r8 ; src[1] | src[2] + pkhtb r9, r8, r10 ; src[3] | src[4] + + smuad r6, r6, r5 ; apply filter + smuad r8, r8, r5 ; apply filter + + subs lr, lr, #1 + + smuadx r7, r7, r5 ; apply filter + smuadx r9, r9, r5 ; apply filter + + add r0, r0, #8 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #8, r6, asr #7 + usat r7, #8, r7, asr #7 + strb r6, [r1], r2 ; the result is transposed back and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strb r7, [r1], r2 + add r9, r9, #0x40 + usat r8, #8, r8, asr #7 + usat r9, #8, r9, asr #7 + strb r8, [r1], r2 ; the result is transposed back and stored + + ldrne r6, [r0] ; load data + strb r9, [r1], r2 + ldrne r8, [r0, #4] + ldrneh r10, [r0, #8] + + bne bil_width_loop_2nd + + subs r12, r12, #1 + add r0, r0, #4 ; update src for next row + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_2nd + ldmia sp!, {r4 - r11, pc} + +|bil_null_2nd_filter| +|bil_height_loop_null_2nd| + mov lr, r3, lsr #2 + +|bil_width_loop_null_2nd| + ldr r6, [r0], #4 ; load data + subs lr, lr, #1 + ldr r8, [r0], #4 + + strb r6, [r1], r2 ; store data + mov r7, r6, lsr #16 + strb r7, [r1], r2 + mov r9, r8, lsr #16 + strb r8, [r1], r2 + strb r9, [r1], r2 + + bne bil_width_loop_null_2nd + + subs r12, r12, #1 + add r0, r0, #4 + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_null_2nd + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_filter_block2d_second_pass_armv6| + + END diff --git a/vp8/common/arm/armv6/copymem16x16_v6.asm b/vp8/common/arm/armv6/copymem16x16_v6.asm new file mode 100644 index 000000000..00e97397c --- /dev/null +++ b/vp8/common/arm/armv6/copymem16x16_v6.asm @@ -0,0 +1,181 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_copy_mem16x16_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem16x16_v6| PROC + stmdb sp!, {r4 - r7} + ;push {r4-r7} + + ;preload + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + ands r4, r0, #15 + beq copy_mem16x16_fast + + ands r4, r0, #7 + beq copy_mem16x16_8 + + ands r4, r0, #3 + beq copy_mem16x16_4 + + ;copy one byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + ldrb r6, [r0, #2] + ldrb r7, [r0, #3] + + mov r12, #16 + +copy_mem16x16_1_loop + strb r4, [r2] + strb r5, [r2, #1] + strb r6, [r2, #2] + strb r7, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + ldrb r6, [r0, #6] + ldrb r7, [r0, #7] + + subs r12, r12, #1 + + strb r4, [r2, #4] + strb r5, [r2, #5] + strb r6, [r2, #6] + strb r7, [r2, #7] + + ldrb r4, [r0, #8] + ldrb r5, [r0, #9] + ldrb r6, [r0, #10] + ldrb r7, [r0, #11] + + strb r4, [r2, #8] + strb r5, [r2, #9] + strb r6, [r2, #10] + strb r7, [r2, #11] + + ldrb r4, [r0, #12] + ldrb r5, [r0, #13] + ldrb r6, [r0, #14] + ldrb r7, [r0, #15] + + add r0, r0, r1 + + strb r4, [r2, #12] + strb r5, [r2, #13] + strb r6, [r2, #14] + strb r7, [r2, #15] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + ldrneb r6, [r0, #2] + ldrneb r7, [r0, #3] + + bne copy_mem16x16_1_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 4 bytes each time +copy_mem16x16_4 + ldr r4, [r0] + ldr r5, [r0, #4] + ldr r6, [r0, #8] + ldr r7, [r0, #12] + + mov r12, #16 + +copy_mem16x16_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + str r6, [r2, #8] + str r7, [r2, #12] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + ldrne r6, [r0, #8] + ldrne r7, [r0, #12] + + bne copy_mem16x16_4_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 8 bytes each time +copy_mem16x16_8 + sub r1, r1, #16 + sub r3, r3, #16 + + mov r12, #16 + +copy_mem16x16_8_loop + ldmia r0!, {r4-r5} + ;ldm r0, {r4-r5} + ldmia r0!, {r6-r7} + + add r0, r0, r1 + + stmia r2!, {r4-r5} + subs r12, r12, #1 + ;stm r2, {r4-r5} + stmia r2!, {r6-r7} + + add r2, r2, r3 + + bne copy_mem16x16_8_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 16 bytes each time +copy_mem16x16_fast + ;sub r1, r1, #16 + ;sub r3, r3, #16 + + mov r12, #16 + +copy_mem16x16_fast_loop + ldmia r0, {r4-r7} + ;ldm r0, {r4-r7} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r7} + ;stm r2, {r4-r7} + add r2, r2, r3 + + bne copy_mem16x16_fast_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + + ENDP ; |vp8_copy_mem16x16_v6| + + END diff --git a/vp8/common/arm/armv6/copymem8x4_v6.asm b/vp8/common/arm/armv6/copymem8x4_v6.asm new file mode 100644 index 000000000..94473ca65 --- /dev/null +++ b/vp8/common/arm/armv6/copymem8x4_v6.asm @@ -0,0 +1,127 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_copy_mem8x4_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void vp8_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem8x4_v6| PROC + ;push {r4-r5} + stmdb sp!, {r4-r5} + + ;preload + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + ands r4, r0, #7 + beq copy_mem8x4_fast + + ands r4, r0, #3 + beq copy_mem8x4_4 + + ;copy 1 byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + + mov r12, #4 + +copy_mem8x4_1_loop + strb r4, [r2] + strb r5, [r2, #1] + + ldrb r4, [r0, #2] + ldrb r5, [r0, #3] + + subs r12, r12, #1 + + strb r4, [r2, #2] + strb r5, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + + strb r4, [r2, #4] + strb r5, [r2, #5] + + ldrb r4, [r0, #6] + ldrb r5, [r0, #7] + + add r0, r0, r1 + + strb r4, [r2, #6] + strb r5, [r2, #7] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + + bne copy_mem8x4_1_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 4 bytes each time +copy_mem8x4_4 + ldr r4, [r0] + ldr r5, [r0, #4] + + mov r12, #4 + +copy_mem8x4_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + + bne copy_mem8x4_4_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + +;copy 8 bytes each time +copy_mem8x4_fast + ;sub r1, r1, #8 + ;sub r3, r3, #8 + + mov r12, #4 + +copy_mem8x4_fast_loop + ldmia r0, {r4-r5} + ;ldm r0, {r4-r5} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r5} + ;stm r2, {r4-r5} + add r2, r2, r3 + + bne copy_mem8x4_fast_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + + ENDP ; |vp8_copy_mem8x4_v6| + + END diff --git a/vp8/common/arm/armv6/copymem8x8_v6.asm b/vp8/common/arm/armv6/copymem8x8_v6.asm new file mode 100644 index 000000000..7cfa53389 --- /dev/null +++ b/vp8/common/arm/armv6/copymem8x8_v6.asm @@ -0,0 +1,127 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_copy_mem8x8_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem8x8_v6| PROC + ;push {r4-r5} + stmdb sp!, {r4-r5} + + ;preload + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + ands r4, r0, #7 + beq copy_mem8x8_fast + + ands r4, r0, #3 + beq copy_mem8x8_4 + + ;copy 1 byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + + mov r12, #8 + +copy_mem8x8_1_loop + strb r4, [r2] + strb r5, [r2, #1] + + ldrb r4, [r0, #2] + ldrb r5, [r0, #3] + + subs r12, r12, #1 + + strb r4, [r2, #2] + strb r5, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + + strb r4, [r2, #4] + strb r5, [r2, #5] + + ldrb r4, [r0, #6] + ldrb r5, [r0, #7] + + add r0, r0, r1 + + strb r4, [r2, #6] + strb r5, [r2, #7] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + + bne copy_mem8x8_1_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 4 bytes each time +copy_mem8x8_4 + ldr r4, [r0] + ldr r5, [r0, #4] + + mov r12, #8 + +copy_mem8x8_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + + bne copy_mem8x8_4_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 8 bytes each time +copy_mem8x8_fast + ;sub r1, r1, #8 + ;sub r3, r3, #8 + + mov r12, #8 + +copy_mem8x8_fast_loop + ldmia r0, {r4-r5} + ;ldm r0, {r4-r5} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r5} + ;stm r2, {r4-r5} + add r2, r2, r3 + + bne copy_mem8x8_fast_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + + ENDP ; |vp8_copy_mem8x8_v6| + + END diff --git a/vp8/common/arm/armv6/filter_v6.asm b/vp8/common/arm/armv6/filter_v6.asm new file mode 100644 index 000000000..a7863fc94 --- /dev/null +++ b/vp8/common/arm/armv6/filter_v6.asm @@ -0,0 +1,383 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_filter_block2d_first_pass_armv6| + EXPORT |vp8_filter_block2d_second_pass_armv6| + EXPORT |vp8_filter_block2d_first_pass_only_armv6| + EXPORT |vp8_filter_block2d_second_pass_only_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code +;------------------------------------- +; r0 unsigned char *src_ptr +; r1 short *output_ptr +; r2 unsigned int src_pixels_per_line +; r3 unsigned int output_width +; stack unsigned int output_height +; stack const short *vp8_filter +;------------------------------------- +; vp8_filter the input and put in the output array. Apply the 6 tap FIR filter with +; the output being a 2 byte value and the intput being a 1 byte value. 
+|vp8_filter_block2d_first_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r7, [sp, #36] ; output height + + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + mov r3, r3, lsl #1 ; multiply width by 2 because using shorts + add r12, r3, #16 ; square off the output + sub sp, sp, #4 + + ;;IF ARCHITECTURE=6 + ;pld [r0, #-2] + ;;pld [r0, #30] + ;;ENDIF + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + str r1, [sp] ; push destination to stack + mov r7, r7, lsl #16 ; height is top part of counter + +; six tap filter +|height_loop_1st_6| + ldrb r8, [r0, #-2] ; load source data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + orr r7, r7, r3, lsr #2 ; construct loop counter + +|width_loop_1st_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + + smuad lr, lr, r4 ; apply the filter + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + smuad r8, r8, r4 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + sub r7, r7, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r11, r10, r6, r8 + + ands r10, r7, #0xff ; test loop counter + + add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] ; load data for next loop + usat lr, #8, lr, asr #7 + add r11, r11, #0x40 + ldrneb r9, [r0, #-1] + usat r11, #8, r11, asr #7 + + strh lr, [r1], r12 ; result is transposed and stored, which + ; will make second pass filtering easier. + ldrneb r10, [r0], #2 + strh r11, [r1], r12 + + bne width_loop_1st_6 + + ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines + ;;IF ARCHITECTURE=6 + ;pld [r0, r2] + ;;pld [r0, r9] + ;;ENDIF + + ldr r1, [sp] ; load and update dst address + subs r7, r7, #0x10000 + add r0, r0, r2 ; move to next input line + add r1, r1, #2 ; move over to next column + str r1, [sp] + + bne height_loop_1st_6 + + add sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +;--------------------------------- +; r0 short *src_ptr, +; r1 unsigned char *output_ptr, +; r2 unsigned int output_pitch, +; r3 unsigned int cnt, +; stack const short *vp8_filter +;--------------------------------- +|vp8_filter_block2d_second_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #36] ; vp8_filter address + sub sp, sp, #4 + mov r7, r3, lsl #16 ; height is top part of counter + str r1, [sp] ; push destination to stack + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + pkhbt r12, r5, r4 ; pack the filter differently + pkhbt r11, r6, r5 + + sub r0, r0, #4 ; offset input buffer + +|height_loop_2nd| + ldr r8, [r0] ; load the data + ldr r9, [r0, #4] + orr r7, r7, r3, lsr #1 ; loop counter + +|width_loop_2nd| + smuad lr, r4, r8 ; apply filter + sub r7, r7, #1 + smulbt r8, r4, r8 + + ldr r10, [r0, #8] + + smlad lr, r5, r9, lr + smladx r8, r12, r9, r8 + + ldrh r9, [r0, #12] + + smlad lr, r6, r10, lr + smladx r8, r11, r10, r8 + + add r0, r0, #4 + smlatb r10, r6, r9, r8 + + add lr, lr, #0x40 ; round_shift_and_clamp + ands r8, r7, #0xff + usat lr, #8, lr, asr #7 + add r10, r10, #0x40 + strb lr, [r1], r2 ; the result is transposed back and stored + usat r10, #8, r10, asr #7 + + ldrne r8, [r0] ; load data for next loop + ldrne r9, [r0, #4] + strb r10, [r1], r2 + + bne width_loop_2nd + + ldr 
r1, [sp] ; update dst for next loop + subs r7, r7, #0x10000 + add r0, r0, #16 ; updata src for next loop + add r1, r1, #1 + str r1, [sp] + + bne height_loop_2nd + + add sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +;------------------------------------ +; r0 unsigned char *src_ptr +; r1 unsigned char *output_ptr, +; r2 unsigned int src_pixels_per_line +; r3 unsigned int cnt, +; stack unsigned int output_pitch, +; stack const short *vp8_filter +;------------------------------------ +|vp8_filter_block2d_first_pass_only_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r4, [sp, #36] ; output pitch + ldr r11, [sp, #40] ; HFilter address + sub sp, sp, #8 + + mov r7, r3 + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + sub r4, r4, r3 + str r4, [sp] ; save modified output pitch + str r2, [sp, #4] + + mov r2, #0x40 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + +; six tap filter +|height_loop_1st_only_6| + ldrb r8, [r0, #-2] ; load data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + + mov r12, r3, lsr #1 ; loop counter + +|width_loop_1st_only_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + +;; smuad lr, lr, r4 + smlad lr, lr, r4, r2 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 +;; smuad r8, r8, r4 + smlad r8, r8, r4, r2 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + subs r12, r12, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r10, r10, r6, r8 + +;; add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] ; load data for next loop + usat lr, #8, lr, asr #7 +;; add r10, r10, #0x40 + strb lr, [r1], #1 ; store the result + usat r10, #8, r10, asr #7 + + ldrneb r9, [r0, #-1] + strb r10, [r1], #1 + ldrneb r10, [r0], #2 + + bne width_loop_1st_only_6 + + ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines + ;;IF ARCHITECTURE=6 + ;pld [r0, r2] + ;;pld [r0, r9] + ;;ENDIF + + ldr lr, [sp] ; load back output pitch + ldr r12, [sp, #4] ; load back output pitch + subs r7, r7, #1 + add r0, r0, r12 ; updata src for next loop + add r1, r1, lr ; update dst for next loop + + bne height_loop_1st_only_6 + + add sp, sp, #8 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_filter_block2d_first_pass_only_armv6| + + +;------------------------------------ +; r0 unsigned char *src_ptr, +; r1 unsigned char *output_ptr, +; r2 unsigned int src_pixels_per_line +; r3 unsigned int cnt, +; stack unsigned int output_pitch, +; stack const short *vp8_filter +;------------------------------------ +|vp8_filter_block2d_second_pass_only_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; VFilter address + ldr r12, [sp, #36] ; output pitch + + mov r7, r3, lsl #16 ; height is top part of counter + sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after + + sub sp, sp, #8 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + str r0, [sp] ; save r0 to stack + str r1, [sp, #4] ; save dst to stack + +; six tap filter +|width_loop_2nd_only_6| + ldrb r8, [r0], r2 ; load data + orr r7, r7, r3 ; loop counter + ldrb r9, [r0], r2 + ldrb r10, [r0], r2 + +|height_loop_2nd_only_6| + ; filter first column in this inner loop, than, move to next colum. 
+ ldrb r11, [r0], r2 + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0], r2 + + smuad lr, lr, r4 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + smuad r8, r8, r4 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0], r2 + smlad r8, r11, r5, r8 + ldrb r11, [r0] + + sub r7, r7, #2 + sub r0, r0, r2, lsl #2 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r10, r10, r6, r8 + + ands r9, r7, #0xff + + add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0], r2 ; load data for next loop + usat lr, #8, lr, asr #7 + add r10, r10, #0x40 + strb lr, [r1], r12 ; store the result for the column + usat r10, #8, r10, asr #7 + + ldrneb r9, [r0], r2 + strb r10, [r1], r12 + ldrneb r10, [r0], r2 + + bne height_loop_2nd_only_6 + + ldr r0, [sp] + ldr r1, [sp, #4] + subs r7, r7, #0x10000 + add r0, r0, #1 ; move to filter next column + str r0, [sp] + add r1, r1, #1 + str r1, [sp, #4] + + bne width_loop_2nd_only_6 + + add sp, sp, #8 + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_filter_block2d_second_pass_only_armv6| + + END diff --git a/vp8/common/arm/armv6/idct_v6.asm b/vp8/common/arm/armv6/idct_v6.asm new file mode 100644 index 000000000..25c5165ec --- /dev/null +++ b/vp8/common/arm/armv6/idct_v6.asm @@ -0,0 +1,376 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14 + EXPORT |vp8_short_idct4x4llm_1_v6| + EXPORT |vp8_short_idct4x4llm_v6| + EXPORT |vp8_short_idct4x4llm_v6_scott| + EXPORT |vp8_short_idct4x4llm_v6_dual| + + EXPORT |vp8_dc_only_idct_armv6| + + AREA |.text|, CODE, READONLY + +;******************************************************************************** +;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch) +;* r0 INT16 * input +;* r1 INT16 * output +;* r2 INT32 pitch +;* bench: 3/5 +;******************************************************************************** + +|vp8_short_idct4x4llm_1_v6| PROC ; cycles in out pit + ; + ldrsh r0, [r0] ; load input[0] 1, r0 un 2 + add r0, r0, #4 ; 1 +4 + stmdb sp!, {r4, r5, lr} ; make room for wide writes 1 backup + mov r0, r0, asr #3 ; (input[0] + 4) >> 3 1, r0 req`d ^1 >> 3 + pkhbt r4, r0, r0, lsl #16 ; pack r0 into r4 1, r0 req`d ^1 pack + mov r5, r4 ; expand expand + + strd r4, [r1], r2 ; *output = r0, post inc 1 + strd r4, [r1], r2 ; 1 + strd r4, [r1], r2 ; 1 + strd r4, [r1] ; 1 + ; + ldmia sp!, {r4, r5, pc} ; replace vars, return restore + ENDP ; |vp8_short_idct4x4llm_1_v6| +;******************************************************************************** +;******************************************************************************** +;******************************************************************************** + +;******************************************************************************** +;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch) +;* r0 INT16 * input +;* r1 INT16 * output +;* r2 INT32 pitch +;* bench: +;******************************************************************************** + +|vp8_short_idct4x4llm_v6| PROC ; cycles in out pit + ; + stmdb sp!, {r4-r11, lr} ; backup registers 1 backup + ; + mov r4, #0x00004E00 ; 
1 cst + orr r4, r4, #0x0000007B ; cospi8sqrt2minus1 + mov r5, #0x00008A00 ; 1 cst + orr r5, r5, #0x0000008C ; sinpi8sqrt2 + ; + mov r6, #4 ; i=4 1 i +loop1 ; + ldrsh r12, [r0, #8] ; input[4] 1, r12 unavail 2 [4] + ldrsh r3, [r0, #24] ; input[12] 1, r3 unavail 2 [12] + ldrsh r8, [r0, #16] ; input[8] 1, r8 unavail 2 [8] + ldrsh r7, [r0], #0x2 ; input[0] 1, r7 unavail 2 ++ [0] + smulwb r10, r5, r12 ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1 t1 + smulwb r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16 1, r11 un 2, r3/r4 ^1 t2 + add r9, r7, r8 ; a1 = [0] + [8] 1 a1 + sub r7, r7, r8 ; b1 = [0] - [8] 1 b1 + add r11, r3, r11 ; temp2 1 + rsb r11, r11, r10 ; c1 = temp1 - temp2 1 c1 + smulwb r3, r5, r3 ; ([12] * sinpi8sqrt2) >> 16 1, r3 un 2, r3/r5 ^ 1 t2 + smulwb r10, r4, r12 ; ([4] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r12/r4 ^1 t1 + add r8, r7, r11 ; b1 + c1 1 b+c + strh r8, [r1, r2] ; out[pitch] = b1+c1 1 + sub r7, r7, r11 ; b1 - c1 1 b-c + add r10, r12, r10 ; temp1 1 + add r3, r10, r3 ; d1 = temp1 + temp2 1 d1 + add r10, r9, r3 ; a1 + d1 1 a+d + sub r3, r9, r3 ; a1 - d1 1 a-d + add r8, r2, r2 ; pitch * 2 1 p*2 + strh r7, [r1, r8] ; out[pitch*2] = b1-c1 1 + add r7, r2, r2, lsl #1 ; pitch * 3 1 p*3 + strh r3, [r1, r7] ; out[pitch*3] = a1-d1 1 + subs r6, r6, #1 ; i-- 1 -- + strh r10, [r1], #0x2 ; out[0] = a1+d1 1 ++ + bne loop1 ; if i>0, continue + ; + sub r1, r1, #8 ; set up out for next loop 1 -4 + ; for this iteration, input=prev output + mov r6, #4 ; i=4 1 i +; b returnfull +loop2 ; + ldrsh r11, [r1, #2] ; input[1] 1, r11 un 2 [1] + ldrsh r8, [r1, #6] ; input[3] 1, r8 un 2 [3] + ldrsh r3, [r1, #4] ; input[2] 1, r3 un 2 [2] + ldrsh r0, [r1] ; input[0] 1, r0 un 2 [0] + smulwb r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1 t1 + smulwb r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r8 ^1 t2 + add r7, r0, r3 ; a1 = [0] + [2] 1 a1 + sub r0, r0, r3 ; b1 = [0] - [2] 1 b1 + add r10, r8, r10 ; temp2 1 + rsb r9, r10, r9 ; c1 = temp1 - temp2 1 c1 + smulwb r8, r5, r8 ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1 t2 + smulwb r10, r4, r11 ; ([1] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r11 ^1 t1 + add r3, r0, r9 ; b1+c1 1 b+c + add r3, r3, #4 ; b1+c1+4 1 +4 + add r10, r11, r10 ; temp1 1 + mov r3, r3, asr #3 ; b1+c1+4 >> 3 1, r3 ^1 >>3 + strh r3, [r1, #2] ; out[1] = b1+c1 1 + add r10, r10, r8 ; d1 = temp1 + temp2 1 d1 + add r3, r7, r10 ; a1+d1 1 a+d + add r3, r3, #4 ; a1+d1+4 1 +4 + sub r7, r7, r10 ; a1-d1 1 a-d + add r7, r7, #4 ; a1-d1+4 1 +4 + mov r3, r3, asr #3 ; a1+d1+4 >> 3 1, r3 ^1 >>3 + mov r7, r7, asr #3 ; a1-d1+4 >> 3 1, r7 ^1 >>3 + strh r7, [r1, #6] ; out[3] = a1-d1 1 + sub r0, r0, r9 ; b1-c1 1 b-c + add r0, r0, #4 ; b1-c1+4 1 +4 + subs r6, r6, #1 ; i-- 1 -- + mov r0, r0, asr #3 ; b1-c1+4 >> 3 1, r0 ^1 >>3 + strh r0, [r1, #4] ; out[2] = b1-c1 1 + strh r3, [r1], r2 ; out[0] = a1+d1 1 +; add r1, r1, r2 ; out += pitch 1 ++ + bne loop2 ; if i>0, continue +returnfull ; + ldmia sp!, {r4 - r11, pc} ; replace vars, return restore + ENDP + +;******************************************************************************** +;******************************************************************************** +;******************************************************************************** + +;******************************************************************************** +;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch) +;* r0 INT16 * input +;* r1 INT16 * output +;* r2 INT32 pitch +;* bench: 
+;******************************************************************************** + +|vp8_short_idct4x4llm_v6_scott| PROC ; cycles in out pit +; mov r0, #0 ; +; ldr r0, [r0] ; + stmdb sp!, {r4 - r11, lr} ; backup registers 1 backup + ; + mov r3, #0x00004E00 ; cos + orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 + mov r4, #0x00008A00 ; sin + orr r4, r4, #0x0000008C ; sinpi8sqrt2 + ; + mov r5, #0x2 ; i i + ; +short_idct4x4llm_v6_scott_loop1 ; + ldr r10, [r0, #(4*2)] ; i5 | i4 5,4 + ldr r11, [r0, #(12*2)] ; i13 | i12 13,12 + ; + smulwb r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16) lt1 + smulwb r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16) lt2 + ; + smulwb r12, r3, r10 ; ((ip[4] * cospi8sqrt2misu1) >> 16) l2t2 + smulwb r14, r4, r11 ; ((ip[12] * sinpi8sqrt2) >> 16) l2t1 + ; + add r6, r6, r7 ; partial c1 lt1-lt2 + add r12, r12, r14 ; partial d1 l2t2+l2t1 + ; + smulwt r14, r4, r10 ; ((ip[5] * sinpi8sqrt2) >> 16) ht1 + smulwt r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16) ht2 + ; + smulwt r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16) h2t1 + smulwt r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16) h2t2 + ; + add r7, r14, r7 ; partial c1_2 ht1+ht2 + sub r8, r8, r9 ; partial d1_2 h2t1-h2t2 + ; + pkhbt r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1 pack + pkhbt r12, r12, r8, lsl #16 ; partial d1_2 | partial d1_1 pack + ; + usub16 r6, r6, r10 ; c1_2 | c1_1 c + uadd16 r12, r12, r11 ; d1_2 | d1_1 d + ; + ldr r10, [r0, #0] ; i1 | i0 1,0 + ldr r11, [r0, #(8*2)] ; i9 | i10 9,10 + ; +;;;;;; add r0, r0, #0x4 ; +4 +;;;;;; add r1, r1, #0x4 ; +4 + ; + uadd16 r8, r10, r11 ; i1 + i9 | i0 + i8 aka a1 a + usub16 r9, r10, r11 ; i1 - i9 | i0 - i8 aka b1 b + ; + uadd16 r7, r8, r12 ; a1 + d1 pair a+d + usub16 r14, r8, r12 ; a1 - d1 pair a-d + ; + str r7, [r1] ; op[0] = a1 + d1 + str r14, [r1, r2] ; op[pitch*3] = a1 - d1 + ; + add r0, r0, #0x4 ; op[pitch] = b1 + c1 ++ + add r1, r1, #0x4 ; op[pitch*2] = b1 - c1 ++ + ; + subs r5, r5, #0x1 ; -- + bne short_idct4x4llm_v6_scott_loop1 ; + ; + sub r1, r1, #16 ; reset output ptr + mov r5, #0x4 ; + mov r0, r1 ; input = output + ; +short_idct4x4llm_v6_scott_loop2 ; + ; + subs r5, r5, #0x1 ; + bne short_idct4x4llm_v6_scott_loop2 ; + ; + ldmia sp!, {r4 - r11, pc} ; + ENDP ; + ; +;******************************************************************************** +;******************************************************************************** +;******************************************************************************** + +;******************************************************************************** +;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch) +;* r0 INT16 * input +;* r1 INT16 * output +;* r2 INT32 pitch +;* bench: +;******************************************************************************** + +|vp8_short_idct4x4llm_v6_dual| PROC ; cycles in out pit + ; + stmdb sp!, {r4-r11, lr} ; backup registers 1 backup + mov r3, #0x00004E00 ; cos + orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 + mov r4, #0x00008A00 ; sin + orr r4, r4, #0x0000008C ; sinpi8sqrt2 + mov r5, #0x2 ; i=2 i +loop1_dual + ldr r6, [r0, #(4*2)] ; i5 | i4 5|4 + ldr r12, [r0, #(12*2)] ; i13 | i12 13|12 + ldr r14, [r0, #(8*2)] ; i9 | i8 9|8 + + smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c + smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c + smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s + smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s + pkhbt r7, r7, r9, lsl #16 ; 5c | 4c + smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c + pkhbt r8, r8, 
r10, lsl #16 ; 5s | 4s + uadd16 r6, r6, r7 ; 5c+5 | 4c+4 + smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s + smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c + smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s + subs r5, r5, #0x1 ; i-- -- + pkhbt r9, r9, r11, lsl #16 ; 13c | 12c + ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0 + pkhbt r10, r10, r7, lsl #16 ; 13s | 12s + uadd16 r7, r12, r9 ; 13c+13 | 12c+12 + usub16 r7, r8, r7 ; c c + uadd16 r6, r6, r10 ; d d + uadd16 r10, r11, r14 ; a a + usub16 r8, r11, r14 ; b b + uadd16 r9, r10, r6 ; a+d a+d + usub16 r10, r10, r6 ; a-d a-d + uadd16 r6, r8, r7 ; b+c b+c + usub16 r7, r8, r7 ; b-c b-c + str r6, [r1, r2] ; o5 | o4 + add r6, r2, r2 ; pitch * 2 p2 + str r7, [r1, r6] ; o9 | o8 + add r6, r6, r2 ; pitch * 3 p3 + str r10, [r1, r6] ; o13 | o12 + str r9, [r1], #0x4 ; o1 | o0 ++ + bne loop1_dual ; + mov r5, #0x2 ; i=2 i + sub r0, r1, #8 ; reset input/output i/o +loop2_dual + ldr r6, [r0, r2] ; i5 | i4 5|4 + ldr r1, [r0] ; i1 | i0 1|0 + ldr r12, [r0, #0x4] ; i3 | i2 3|2 + add r14, r2, #0x4 ; pitch + 2 p+2 + ldr r14, [r0, r14] ; i7 | i6 7|6 + smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c + smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c + smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s + smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s + pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4 + pkhbt r7, r9, r7, lsl #16 ; 1c | 5c + pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 © tc1 + pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5 + uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2 + pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6 + uadd16 r10, r11, r9 ; a a + usub16 r9, r11, r9 ; b b + pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7 + subs r5, r5, #0x1 ; i-- -- + smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c + smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s + smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c + smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s + + pkhbt r7, r12, r7, lsl #16 ; 3c | 7c + pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1 + uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2 + usub16 r12, r8, r6 ; c (o1 | o5) c + uadd16 r6, r11, r1 ; d (o3 | o7) d + uadd16 r7, r10, r6 ; a+d a+d + mov r8, #0x4 ; set up 4's 4 + orr r8, r8, #0x40000 ; 4|4 + usub16 r6, r10, r6 ; a-d a-d + uadd16 r6, r6, r8 ; a-d+4 3|7 + uadd16 r7, r7, r8 ; a+d+4 0|4 + uadd16 r10, r9, r12 ; b+c b+c + usub16 r1, r9, r12 ; b-c b-c + uadd16 r10, r10, r8 ; b+c+4 1|5 + uadd16 r1, r1, r8 ; b-c+4 2|6 + mov r8, r10, asr #19 ; o1 >> 3 + strh r8, [r0, #2] ; o1 + mov r8, r1, asr #19 ; o2 >> 3 + strh r8, [r0, #4] ; o2 + mov r8, r6, asr #19 ; o3 >> 3 + strh r8, [r0, #6] ; o3 + mov r8, r7, asr #19 ; o0 >> 3 + strh r8, [r0], r2 ; o0 +p + sxth r10, r10 ; + mov r8, r10, asr #3 ; o5 >> 3 + strh r8, [r0, #2] ; o5 + sxth r1, r1 ; + mov r8, r1, asr #3 ; o6 >> 3 + strh r8, [r0, #4] ; o6 + sxth r6, r6 ; + mov r8, r6, asr #3 ; o7 >> 3 + strh r8, [r0, #6] ; o7 + sxth r7, r7 ; + mov r8, r7, asr #3 ; o4 >> 3 + strh r8, [r0], r2 ; o4 +p +;;;;; subs r5, r5, #0x1 ; i-- -- + bne loop2_dual ; + ; + ldmia sp!, {r4 - r11, pc} ; replace vars, return restore + ENDP + + +; sjl added 10/17/08 +;void dc_only_idct_armv6(short input_dc, short *output, int pitch) +|vp8_dc_only_idct_armv6| PROC + stmdb sp!, {r4 - r6, lr} + + add r0, r0, #0x4 + add r4, r1, r2 ; output + shortpitch + mov r0, r0, ASR #0x3 ;aka a1 + add r5, r1, r2, LSL #1 ; output + shortpitch * 2 + pkhbt r0, r0, r0, lsl #16 ; a1 | a1 + add r6, r5, r2 ; output + shortpitch * 3 + + str r0, [r1, #0] + str r0, [r1, 
#4] + + str r0, [r4, #0] + str r0, [r4, #4] + + str r0, [r5, #0] + str r0, [r5, #4] + + str r0, [r6, #0] + str r0, [r6, #4] + + + ldmia sp!, {r4 - r6, pc} + + ENDP ; |vp8_dc_only_idct_armv6| + + END diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm new file mode 100644 index 000000000..87475681f --- /dev/null +++ b/vp8/common/arm/armv6/iwalsh_v6.asm @@ -0,0 +1,151 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + EXPORT |vp8_short_inv_walsh4x4_armv6| + EXPORT |vp8_short_inv_walsh4x4_1_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY ; name this block of code + +;short vp8_short_inv_walsh4x4_armv6(short *input, short *output) +|vp8_short_inv_walsh4x4_armv6| PROC + + stmdb sp!, {r4 - r11, lr} + + ldr r2, [r0], #4 ; [1 | 0] + ldr r3, [r0], #4 ; [3 | 2] + ldr r4, [r0], #4 ; [5 | 4] + ldr r5, [r0], #4 ; [7 | 6] + ldr r6, [r0], #4 ; [9 | 8] + ldr r7, [r0], #4 ; [11 | 10] + ldr r8, [r0], #4 ; [13 | 12] + ldr r9, [r0] ; [15 | 14] + + qadd16 r10, r2, r8 ; a1 [1+13 | 0+12] + qadd16 r11, r4, r6 ; b1 [5+9 | 4+8] + qsub16 r12, r4, r6 ; c1 [5-9 | 4-8] + qsub16 lr, r2, r8 ; d1 [1-13 | 0-12] + + qadd16 r2, r10, r11 ; a1 + b1 [1 | 0] + qadd16 r4, r12, lr ; c1 + d1 [5 | 4] + qsub16 r6, r10, r11 ; a1 - b1 [9 | 8] + qsub16 r8, lr, r12 ; d1 - c1 [13 | 12] + + qadd16 r10, r3, r9 ; a1 [3+15 | 2+14] + qadd16 r11, r5, r7 ; b1 [7+11 | 6+10] + qsub16 r12, r5, r7 ; c1 [7-11 | 6-10] + qsub16 lr, r3, r9 ; d1 [3-15 | 2-14] + + qadd16 r3, r10, r11 ; a1 + b1 [3 | 2] + qadd16 r5, r12, lr ; c1 + d1 [7 | 6] + qsub16 r7, r10, r11 ; a1 - b1 [11 | 10] + qsub16 r9, lr, r12 ; d1 - c1 [15 | 14] + + ; first transform complete + + qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3] + qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3] + qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7] + qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7] + + qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1] + ldr r10, c0x00030003 + qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1] + + qadd16 r2, r2, r10 ; [b2+3|c2+3] + qadd16 r3, r3, r10 ; [a2+3|d2+3] + qadd16 r4, r4, r10 ; [b2+3|c2+3] + qadd16 r5, r5, r10 ; [a2+3|d2+3] + + asr r12, r2, #3 ; [1 | x] + pkhtb r12, r12, r3, asr #19; [1 | 0] + lsl lr, r3, #16 ; [~3 | x] + lsl r2, r2, #16 ; [~2 | x] + asr lr, lr, #3 ; [3 | x] + pkhtb lr, lr, r2, asr #19 ; [3 | 2] + + asr r2, r4, #3 ; [5 | x] + pkhtb r2, r2, r5, asr #19 ; [5 | 4] + lsl r3, r5, #16 ; [~7 | x] + lsl r4, r4, #16 ; [~6 | x] + asr r3, r3, #3 ; [7 | x] + pkhtb r3, r3, r4, asr #19 ; [7 | 6] + + str r12, [r1], #4 + str lr, [r1], #4 + str r2, [r1], #4 + str r3, [r1], #4 + + qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11] + qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11] + qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15] + qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15] + + qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1] + qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1] + + qadd16 r6, r6, r10 ; [b2+3|c2+3] + qadd16 r7, r7, r10 ; [a2+3|d2+3] + qadd16 r8, r8, r10 ; [b2+3|c2+3] + qadd16 r9, r9, r10 ; [a2+3|d2+3] + + asr r2, r6, #3 ; [9 | x] + pkhtb r2, r2, r7, asr #19 ; [9 | 
8] + lsl r3, r7, #16 ; [~11| x] + lsl r4, r6, #16 ; [~10| x] + asr r3, r3, #3 ; [11 | x] + pkhtb r3, r3, r4, asr #19 ; [11 | 10] + + asr r4, r8, #3 ; [13 | x] + pkhtb r4, r4, r9, asr #19 ; [13 | 12] + lsl r5, r9, #16 ; [~15| x] + lsl r6, r8, #16 ; [~14| x] + asr r5, r5, #3 ; [15 | x] + pkhtb r5, r5, r6, asr #19 ; [15 | 14] + + str r2, [r1], #4 + str r3, [r1], #4 + str r4, [r1], #4 + str r5, [r1] + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_short_inv_walsh4x4_armv6| + + +;short vp8_short_inv_walsh4x4_1_armv6(short *input, short *output) +|vp8_short_inv_walsh4x4_1_armv6| PROC + + ldrsh r2, [r0] ; [0] + add r2, r2, #3 ; [0] + 3 + asr r2, r2, #3 ; a1 ([0]+3) >> 3 + lsl r2, r2, #16 ; [a1 | x] + orr r2, r2, r2, lsr #16 ; [a1 | a1] + + str r2, [r1], #4 + str r2, [r1], #4 + str r2, [r1], #4 + str r2, [r1], #4 + str r2, [r1], #4 + str r2, [r1], #4 + str r2, [r1], #4 + str r2, [r1] + + bx lr + ENDP ; |vp8_short_inv_walsh4x4_1_armv6| + +; Constant Pool +c0x00030003 DCD 0x00030003 + END diff --git a/vp8/common/arm/armv6/loopfilter_v6.asm b/vp8/common/arm/armv6/loopfilter_v6.asm new file mode 100644 index 000000000..c2b02dc0a --- /dev/null +++ b/vp8/common/arm/armv6/loopfilter_v6.asm @@ -0,0 +1,1263 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_loop_filter_horizontal_edge_armv6| + EXPORT |vp8_mbloop_filter_horizontal_edge_armv6| + EXPORT |vp8_loop_filter_vertical_edge_armv6| + EXPORT |vp8_mbloop_filter_vertical_edge_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code + + MACRO + TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 + ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 + ; a0: 03 02 01 00 + ; a1: 13 12 11 10 + ; a2: 23 22 21 20 + ; a3: 33 32 31 30 + ; b3 b2 b1 b0 + + uxtb16 $b1, $a1 ; xx 12 xx 10 + uxtb16 $b0, $a0 ; xx 02 xx 00 + uxtb16 $b3, $a3 ; xx 32 xx 30 + uxtb16 $b2, $a2 ; xx 22 xx 20 + orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 + orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 + + uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 + uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 + uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 + uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 + orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 + orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 + + pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 + pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 + + pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 + pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 + MEND + + +src RN r0 +pstep RN r1 +count RN r5 + +;r0 unsigned char *src_ptr, +;r1 int src_pixel_step, +;r2 const char *flimit, +;r3 const char *limit, +;stack const char *thresh, +;stack int count + +;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed +;for flimit. Same way applies to limit and thresh. 
+ +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_loop_filter_horizontal_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r6, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r9, [src], pstep ; p3 + ldr r4, [r2], #4 ; flimit + ldr r10, [src], pstep ; p2 + ldr r2, [r3], #4 ; limit + ldr r11, [src], pstep ; p1 + uadd8 r4, r4, r4 ; flimit * 2 + ldr r3, [r6], #4 ; thresh + mov count, count, lsl #1 ; 4-in-parallel + uadd8 r4, r4, r2 ; flimit * 2 + limit + +|Hnext8| + ; vp8_filter_mask() function + ; calculate breakout conditions + ldr r12, [src], pstep ; p0 + + uqsub8 r6, r9, r10 ; p3 - p2 + uqsub8 r7, r10, r9 ; p2 - p3 + uqsub8 r8, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + + orr r6, r6, r7 ; abs (p3-p2) + orr r8, r8, r10 ; abs (p2-p1) + uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask + uqsub8 r8, r8, r2 ; compare to limit + uqsub8 r6, r11, r12 ; p1 - p0 + orr lr, lr, r8 + uqsub8 r7, r12, r11 ; p0 - p1 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + orr r6, r6, r7 ; abs (p1-p0) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later + orr lr, lr, r7 + + uqsub8 r6, r11, r10 ; p1 - q1 + uqsub8 r7, r10, r11 ; q1 - p1 + uqsub8 r11, r12, r9 ; p0 - q0 + uqsub8 r12, r9, r12 ; q0 - p0 + orr r6, r6, r7 ; abs (p1-q1) + ldr r7, c0x7F7F7F7F + orr r12, r11, r12 ; abs (p0-q0) + ldr r11, [src], pstep ; q2 + uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 + and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r7, r9, r10 ; q0 - q1 + uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r6, r10, r9 ; q1 - q0 + uqsub8 r12, r12, r4 ; compare to flimit + uqsub8 r9, r11, r10 ; q2 - q1 + + orr lr, lr, r12 + + ldr r12, [src], pstep ; q3 + uqsub8 r10, r10, r11 ; q1 - q2 + orr r6, r7, r6 ; abs (q1-q0) + orr r10, r9, r10 ; abs (q2-q1) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r10, r10, r2 ; compare to limit + uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later + orr lr, lr, r7 + orr lr, lr, r10 + + uqsub8 r10, r12, r11 ; q3 - q2 + uqsub8 r9, r11, r12 ; q2 - q3 + + mvn r11, #0 ; r11 == -1 + + orr r10, r10, r9 ; abs (q3-q2) + uqsub8 r10, r10, r2 ; compare to limit + + mov r12, #0 + orr lr, lr, r10 + sub src, src, pstep, lsl #2 + + usub8 lr, r12, lr ; use usub8 instead of ssub8 + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq hskip_filter ; skip filtering + + sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines + + ;vp8_hevmask() function + ;calculate high edge variance + orr r10, r6, r8 ; calculate vp8_hevmask + + ldr r7, [src], pstep ; p1 + + usub8 r10, r12, r10 ; use usub8 instead of ssub8 + sel r6, r12, r11 ; obtain vp8_hevmask: r6 + + ;vp8_filter() function + ldr r8, [src], pstep ; p0 + ldr r12, c0x80808080 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + + eor r7, r7, r12 ; p1 offset to convert to a signed value + eor r8, r8, r12 ; p0 offset to convert to a signed value + eor r9, r9, r12 ; q0 offset to convert to a signed value + eor r10, r10, r12 ; q1 offset to convert to a signed value + + str r9, [sp] ; store qs0 temporarily + str r8, [sp, #4] ; store ps0 temporarily + str r10, [sp, #8] ; store qs1 temporarily + str r7, [sp, #12] ; store ps1 temporarily + + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + qsub8 r8, r9, r8 ; 
vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + + and r7, r7, r6 ; vp8_filter (r7) &= hev + + qadd8 r7, r7, r8 + ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 + + qadd8 r7, r7, r8 + ldr r10, c0x04040404 + + qadd8 r7, r7, r8 + and r7, r7, lr ; vp8_filter &= mask; + + ;modify code for vp8 -- Filter1 = vp8_filter (r7) + qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) + qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) + + mov r9, #0 + shadd8 r8 , r8 , r9 ; Filter2 >>= 3 + shadd8 r7 , r7 , r9 ; vp8_filter >>= 3 + shadd8 r8 , r8 , r9 + shadd8 r7 , r7 , r9 + shadd8 lr , r8 , r9 ; lr: Filter2 + shadd8 r7 , r7 , r9 ; r7: filter + + ;usub8 lr, r8, r10 ; s = (s==4)*-1 + ;sel lr, r11, r9 + ;usub8 r8, r10, r8 + ;sel r8, r11, r9 + ;and r8, r8, lr ; -1 for each element that equals 4 + + ;calculate output + ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter) + + ldr r8, [sp] ; load qs0 + ldr r9, [sp, #4] ; load ps0 + + ldr r10, c0x01010101 + + qsub8 r8 ,r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) + qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2) + + ;end of modification for vp8 + + mov lr, #0 + sadd8 r7, r7 , r10 ; vp8_filter += 1 + shadd8 r7, r7, lr ; vp8_filter >>= 1 + + ldr r11, [sp, #12] ; load ps1 + ldr r10, [sp, #8] ; load qs1 + + bic r7, r7, r6 ; vp8_filter &= ~hev + sub src, src, pstep, lsl #2 + + qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) + qsub8 r10, r10,r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) + + eor r11, r11, r12 ; *op1 = u^0x80 + str r11, [src], pstep ; store op1 + eor r9, r9, r12 ; *op0 = u^0x80 + str r9, [src], pstep ; store op0 result + eor r8, r8, r12 ; *oq0 = u^0x80 + str r8, [src], pstep ; store oq0 result + eor r10, r10, r12 ; *oq1 = u^0x80 + str r10, [src], pstep ; store oq1 + + sub src, src, pstep, lsl #1 + +|hskip_filter| + add src, src, #4 + sub src, src, pstep, lsl #2 + + subs count, count, #1 + + ;pld [src] + ;pld [src, pstep] + ;pld [src, pstep, lsl #1] + ;pld [src, pstep, lsl #2] + ;pld [src, pstep, lsl #3] + + ldrne r9, [src], pstep ; p3 + ldrne r10, [src], pstep ; p2 + ldrne r11, [src], pstep ; p1 + + bne Hnext8 + + add sp, sp, #16 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_loop_filter_horizontal_edge_armv6| + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_mbloop_filter_horizontal_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r6, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r9, [src], pstep ; p3 + ldr r4, [r2], #4 ; flimit + ldr r10, [src], pstep ; p2 + ldr r2, [r3], #4 ; limit + ldr r11, [src], pstep ; p1 + uadd8 r4, r4, r4 ; flimit * 2 + ldr r3, [r6], #4 ; thresh + mov count, count, lsl #1 ; 4-in-parallel + uadd8 r4, r4, r2 ; flimit * 2 + limit + +|MBHnext8| + + ; vp8_filter_mask() function + ; calculate breakout conditions + ldr r12, [src], pstep ; p0 + + uqsub8 r6, r9, r10 ; p3 - p2 + uqsub8 r7, r10, r9 ; p2 - p3 + uqsub8 r8, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + + orr r6, r6, r7 ; abs (p3-p2) + orr r8, r8, r10 ; abs (p2-p1) + uqsub8 lr, r6, r2 ; compare to limit. 
lr: vp8_filter_mask + uqsub8 r8, r8, r2 ; compare to limit + + uqsub8 r6, r11, r12 ; p1 - p0 + orr lr, lr, r8 + uqsub8 r7, r12, r11 ; p0 - p1 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + orr r6, r6, r7 ; abs (p1-p0) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later + orr lr, lr, r7 + + uqsub8 r6, r11, r10 ; p1 - q1 + uqsub8 r7, r10, r11 ; q1 - p1 + uqsub8 r11, r12, r9 ; p0 - q0 + uqsub8 r12, r9, r12 ; q0 - p0 + orr r6, r6, r7 ; abs (p1-q1) + ldr r7, c0x7F7F7F7F + orr r12, r11, r12 ; abs (p0-q0) + ldr r11, [src], pstep ; q2 + uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 + and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r7, r9, r10 ; q0 - q1 + uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r6, r10, r9 ; q1 - q0 + uqsub8 r12, r12, r4 ; compare to flimit + uqsub8 r9, r11, r10 ; q2 - q1 + + orr lr, lr, r12 + + ldr r12, [src], pstep ; q3 + + uqsub8 r10, r10, r11 ; q1 - q2 + orr r6, r7, r6 ; abs (q1-q0) + orr r10, r9, r10 ; abs (q2-q1) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r10, r10, r2 ; compare to limit + uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later + orr lr, lr, r7 + orr lr, lr, r10 + + uqsub8 r10, r12, r11 ; q3 - q2 + uqsub8 r9, r11, r12 ; q2 - q3 + + mvn r11, #0 ; r11 == -1 + + orr r10, r10, r9 ; abs (q3-q2) + uqsub8 r10, r10, r2 ; compare to limit + + mov r12, #0 + + orr lr, lr, r10 + + usub8 lr, r12, lr ; use usub8 instead of ssub8 + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq mbhskip_filter ; skip filtering + + ;vp8_hevmask() function + ;calculate high edge variance + sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines + sub src, src, pstep, lsl #1 + + orr r10, r6, r8 + ldr r7, [src], pstep ; p1 + + usub8 r10, r12, r10 + sel r6, r12, r11 ; hev mask: r6 + + ;vp8_mbfilter() function + ;p2, q2 are only needed at the end. Don't need to load them in now. 
+ ldr r8, [src], pstep ; p0 + ldr r12, c0x80808080 + ldr r9, [src], pstep ; q0 + ldr r10, [src] ; q1 + + eor r7, r7, r12 ; ps1 + eor r8, r8, r12 ; ps0 + eor r9, r9, r12 ; qs0 + eor r10, r10, r12 ; qs1 + + qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + str r7, [sp, #12] ; store ps1 temporarily + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + str r10, [sp, #8] ; store qs1 temporarily + qadd8 r7, r7, r12 + str r9, [sp] ; store qs0 temporarily + qadd8 r7, r7, r12 + str r8, [sp, #4] ; store ps0 temporarily + qadd8 r7, r7, r12 ; vp8_filter: r7 + + ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 + ldr r9, c0x04040404 + + and r7, r7, lr ; vp8_filter &= mask (lr is free) + + mov r12, r7 ; Filter2: r12 + and r12, r12, r6 ; Filter2 &= hev + + ;modify code for vp8 + ;save bottom 3 bits so that we round one side +4 and the other +3 + qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) + qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) + + mov r10, #0 + shadd8 r8 , r8 , r10 ; Filter1 >>= 3 + shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + shadd8 r8 , r8 , r10 + shadd8 r12 , r12 , r10 + shadd8 r8 , r8 , r10 ; r8: Filter1 + shadd8 r12 , r12 , r10 ; r12: Filter2 + + ldr r9, [sp] ; load qs0 + ldr r11, [sp, #4] ; load ps0 + + qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) + qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) + + ;save bottom 3 bits so that we round one side +4 and the other +3 + ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) + ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4) + ;mov r10, #0 + ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + ;usub8 lr, r8, r9 ; s = (s==4)*-1 + ;sel lr, r11, r10 + ;shadd8 r12 , r12 , r10 + ;usub8 r8, r9, r8 + ;sel r8, r11, r10 + ;ldr r9, [sp] ; load qs0 + ;ldr r11, [sp, #4] ; load ps0 + ;shadd8 r12 , r12 , r10 + ;and r8, r8, lr ; -1 for each element that equals 4 + ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) + ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) + ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) + + ;end of modification for vp8 + + bic r12, r7, r6 ; vp8_filter &= ~hev ( r6 is free) + ;mov r12, r7 + + ;roughly 3/7th difference across boundary + mov lr, #0x1b ; 27 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r7, r10, lr, r7 + smultb r10, r10, lr + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + add r10, r10, #63 + ssat r7, #8, r7, asr #7 + ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r7, r10, lsl #16 + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) + + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) + qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) + eor r8, r8, lr ; *oq0 = s^0x80 + str r8, [src] ; store *oq0 + sub src, src, pstep + eor r10, r10, lr ; *op0 = s^0x80 + str r10, [src] ; store *op0 + + ;roughly 2/7th difference across boundary + mov lr, #0x12 ; 18 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r9, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r9, #8, r9, asr #7 + ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r9, r10, lsl #16 + + ldr r9, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load 
ps1 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + + qadd8 r11, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) + eor r11, r11, lr ; *op1 = s^0x80 + str r11, [src], pstep ; store *op1 + eor r8, r8, lr ; *oq1 = s^0x80 + add src, src, pstep, lsl #1 + + mov r7, #0x3f ; 63 + + str r8, [src], pstep ; store *oq1 + + ;roughly 1/7th difference across boundary + mov lr, #0x9 ; 9 + ldr r9, [src] ; load q2 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r12, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r12, #8, r12, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r12, r10, lsl #16 + + sub src, src, pstep + ldr lr, c0x80808080 + + ldr r11, [src] ; load p2 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + eor r9, r9, lr + eor r11, r11, lr + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + + qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) + qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) + eor r8, r8, lr ; *op2 = s^0x80 + str r8, [src], pstep, lsl #2 ; store *op2 + add src, src, pstep + eor r10, r10, lr ; *oq2 = s^0x80 + str r10, [src], pstep, lsl #1 ; store *oq2 + +|mbhskip_filter| + add src, src, #4 + sub src, src, pstep, lsl #3 + subs count, count, #1 + + ldrne r9, [src], pstep ; p3 + ldrne r10, [src], pstep ; p2 + ldrne r11, [src], pstep ; p1 + + bne MBHnext8 + + add sp, sp, #16 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_mbloop_filter_horizontal_edge_armv6| + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_loop_filter_vertical_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, #4 ; move src pointer down by 4 + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r12, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r6, [src], pstep ; load source data + ldr r4, [r2], #4 ; flimit + ldr r7, [src], pstep + ldr r2, [r3], #4 ; limit + ldr r8, [src], pstep + uadd8 r4, r4, r4 ; flimit * 2 + ldr r3, [r12], #4 ; thresh + ldr lr, [src], pstep + mov count, count, lsl #1 ; 4-in-parallel + uadd8 r4, r4, r2 ; flimit * 2 + limit + +|Vnext8| + + ; vp8_filter_mask() function + ; calculate breakout conditions + ; transpose the source data for 4-in-parallel operation + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + uqsub8 r7, r9, r10 ; p3 - p2 + uqsub8 r8, r10, r9 ; p2 - p3 + uqsub8 r9, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + orr r7, r7, r8 ; abs (p3-p2) + orr r10, r9, r10 ; abs (p2-p1) + uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask + uqsub8 r10, r10, r2 ; compare to limit + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr lr, lr, r10 + + uqsub8 r6, r11, r12 ; p1 - p0 + uqsub8 r7, r12, r11 ; p0 - p1 + add src, src, #4 ; move src pointer up by 4 + orr r6, r6, r7 ; abs (p1-p0) + str r11, [sp, #12] ; save p1 + uqsub8 r10, r6, r2 ; compare to limit + uqsub8 r11, r6, r3 ; compare to thresh + orr lr, lr, r10 + + ; transpose uses 8 regs(r6 - r12 and lr). 
Need to save reg value now + ; transpose the source data for 4-in-parallel operation + ldr r6, [src], pstep ; load source data + str r11, [sp] ; push r11 to stack + ldr r7, [src], pstep + str r12, [sp, #4] ; save current reg before load q0 - q3 data + ldr r8, [src], pstep + str lr, [sp, #8] + ldr lr, [src], pstep + + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + ldr lr, [sp, #8] ; load back (f)limit accumulator + + uqsub8 r6, r12, r11 ; q3 - q2 + uqsub8 r7, r11, r12 ; q2 - q3 + uqsub8 r12, r11, r10 ; q2 - q1 + uqsub8 r11, r10, r11 ; q1 - q2 + orr r6, r6, r7 ; abs (q3-q2) + orr r7, r12, r11 ; abs (q2-q1) + uqsub8 r6, r6, r2 ; compare to limit + uqsub8 r7, r7, r2 ; compare to limit + ldr r11, [sp, #4] ; load back p0 + ldr r12, [sp, #12] ; load back p1 + orr lr, lr, r6 + orr lr, lr, r7 + + uqsub8 r6, r11, r9 ; p0 - q0 + uqsub8 r7, r9, r11 ; q0 - p0 + uqsub8 r8, r12, r10 ; p1 - q1 + uqsub8 r11, r10, r12 ; q1 - p1 + orr r6, r6, r7 ; abs (p0-q0) + ldr r7, c0x7F7F7F7F + orr r8, r8, r11 ; abs (p1-q1) + uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 + and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r11, r10, r9 ; q1 - q0 + uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r12, r9, r10 ; q0 - q1 + uqsub8 r6, r6, r4 ; compare to flimit + + orr r9, r11, r12 ; abs (q1-q0) + uqsub8 r8, r9, r2 ; compare to limit + uqsub8 r10, r9, r3 ; compare to thresh + orr lr, lr, r6 + orr lr, lr, r8 + + mvn r11, #0 ; r11 == -1 + mov r12, #0 + + usub8 lr, r12, lr + ldr r9, [sp] ; load the compared result + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq vskip_filter ; skip filtering + + ;vp8_hevmask() function + ;calculate high edge variance + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r9, r9, r10 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + usub8 r9, r12, r9 + sel r6, r12, r11 ; hev mask: r6 + + ;vp8_filter() function + ; load soure data to r6, r11, r12, lr + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + pkhbt r12, r7, r8, lsl #16 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + pkhbt r11, r9, r10, lsl #16 + + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + ; Transpose needs 8 regs(r6 - r12, and lr). 
Save r6 and lr first + str r6, [sp] + str lr, [sp, #4] + + pkhbt r6, r7, r8, lsl #16 + pkhbt lr, r9, r10, lsl #16 + + ;transpose r12, r11, r6, lr to r7, r8, r9, r10 + TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 + + ;load back hev_mask r6 and filter_mask lr + ldr r12, c0x80808080 + ldr r6, [sp] + ldr lr, [sp, #4] + + eor r7, r7, r12 ; p1 offset to convert to a signed value + eor r8, r8, r12 ; p0 offset to convert to a signed value + eor r9, r9, r12 ; q0 offset to convert to a signed value + eor r10, r10, r12 ; q1 offset to convert to a signed value + + str r9, [sp] ; store qs0 temporarily + str r8, [sp, #4] ; store ps0 temporarily + str r10, [sp, #8] ; store qs1 temporarily + str r7, [sp, #12] ; store ps1 temporarily + + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + + and r7, r7, r6 ; vp8_filter (r7) &= hev (r7 : filter) + + qadd8 r7, r7, r8 + ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 + + qadd8 r7, r7, r8 + ldr r10, c0x04040404 + + qadd8 r7, r7, r8 + ;mvn r11, #0 ; r11 == -1 + + and r7, r7, lr ; vp8_filter &= mask + + ;modify code for vp8 -- Filter1 = vp8_filter (r7) + qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) + qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) + + mov r9, #0 + shadd8 r8 , r8 , r9 ; Filter2 >>= 3 + shadd8 r7 , r7 , r9 ; vp8_filter >>= 3 + shadd8 r8 , r8 , r9 + shadd8 r7 , r7 , r9 + shadd8 lr , r8 , r9 ; lr: filter2 + shadd8 r7 , r7 , r9 ; r7: filter + + ;usub8 lr, r8, r10 ; s = (s==4)*-1 + ;sel lr, r11, r9 + ;usub8 r8, r10, r8 + ;sel r8, r11, r9 + ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s + + ;calculate output + ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter) + + ldr r8, [sp] ; load qs0 + ldr r9, [sp, #4] ; load ps0 + + ldr r10, c0x01010101 + + qsub8 r8, r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) + qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2) + ;end of modification for vp8 + + eor r8, r8, r12 + eor r9, r9, r12 + + mov lr, #0 + + sadd8 r7, r7, r10 + shadd8 r7, r7, lr + + ldr r10, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load ps1 + + bic r7, r7, r6 ; r7: vp8_filter + + qsub8 r10 , r10, r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) + qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) + eor r10, r10, r12 + eor r11, r11, r12 + + sub src, src, pstep, lsl #2 + + ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1 + ;output is b0, b1, b2, b3 + ;b0: 03 02 01 00 + ;b1: 13 12 11 10 + ;b2: 23 22 21 20 + ;b3: 33 32 31 30 + ; p1 p0 q0 q1 + ; (a3 a2 a1 a0) + TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr + + strh r6, [src, #-2] ; store the result + mov r6, r6, lsr #16 + strh r6, [src], pstep + + strh r7, [src, #-2] + mov r7, r7, lsr #16 + strh r7, [src], pstep + + strh r12, [src, #-2] + mov r12, r12, lsr #16 + strh r12, [src], pstep + + strh lr, [src, #-2] + mov lr, lr, lsr #16 + strh lr, [src], pstep + +|vskip_filter| + sub src, src, #4 + subs count, count, #1 + + ldrne r6, [src], pstep ; load source data + ldrne r7, [src], pstep + ldrne r8, [src], pstep + ldrne lr, [src], pstep + + bne Vnext8 + + add sp, sp, #16 + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_loop_filter_vertical_edge_armv6| + + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_mbloop_filter_vertical_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, 
src, #4 ; move src pointer down by 4 + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r12, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r6, [src], pstep ; load source data + ldr r4, [r2], #4 ; flimit + ldr r7, [src], pstep + ldr r2, [r3], #4 ; limit + ldr r8, [src], pstep + uadd8 r4, r4, r4 ; flimit * 2 + ldr r3, [r12], #4 ; thresh + ldr lr, [src], pstep + mov count, count, lsl #1 ; 4-in-parallel + uadd8 r4, r4, r2 ; flimit * 2 + limit + +|MBVnext8| + ; vp8_filter_mask() function + ; calculate breakout conditions + ; transpose the source data for 4-in-parallel operation + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + uqsub8 r7, r9, r10 ; p3 - p2 + uqsub8 r8, r10, r9 ; p2 - p3 + uqsub8 r9, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + orr r7, r7, r8 ; abs (p3-p2) + orr r10, r9, r10 ; abs (p2-p1) + uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask + uqsub8 r10, r10, r2 ; compare to limit + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr lr, lr, r10 + + uqsub8 r6, r11, r12 ; p1 - p0 + uqsub8 r7, r12, r11 ; p0 - p1 + add src, src, #4 ; move src pointer up by 4 + orr r6, r6, r7 ; abs (p1-p0) + str r11, [sp, #12] ; save p1 + uqsub8 r10, r6, r2 ; compare to limit + uqsub8 r11, r6, r3 ; compare to thresh + orr lr, lr, r10 + + ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now + ; transpose the source data for 4-in-parallel operation + ldr r6, [src], pstep ; load source data + str r11, [sp] ; push r11 to stack + ldr r7, [src], pstep + str r12, [sp, #4] ; save current reg before load q0 - q3 data + ldr r8, [src], pstep + str lr, [sp, #8] + ldr lr, [src], pstep + + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + ldr lr, [sp, #8] ; load back (f)limit accumulator + + uqsub8 r6, r12, r11 ; q3 - q2 + uqsub8 r7, r11, r12 ; q2 - q3 + uqsub8 r12, r11, r10 ; q2 - q1 + uqsub8 r11, r10, r11 ; q1 - q2 + orr r6, r6, r7 ; abs (q3-q2) + orr r7, r12, r11 ; abs (q2-q1) + uqsub8 r6, r6, r2 ; compare to limit + uqsub8 r7, r7, r2 ; compare to limit + ldr r11, [sp, #4] ; load back p0 + ldr r12, [sp, #12] ; load back p1 + orr lr, lr, r6 + orr lr, lr, r7 + + uqsub8 r6, r11, r9 ; p0 - q0 + uqsub8 r7, r9, r11 ; q0 - p0 + uqsub8 r8, r12, r10 ; p1 - q1 + uqsub8 r11, r10, r12 ; q1 - p1 + orr r6, r6, r7 ; abs (p0-q0) + ldr r7, c0x7F7F7F7F + orr r8, r8, r11 ; abs (p1-q1) + uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 + and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r11, r10, r9 ; q1 - q0 + uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r12, r9, r10 ; q0 - q1 + uqsub8 r6, r6, r4 ; compare to flimit + + orr r9, r11, r12 ; abs (q1-q0) + uqsub8 r8, r9, r2 ; compare to limit + uqsub8 r10, r9, r3 ; compare to thresh + orr lr, lr, r6 + orr lr, lr, r8 + + mvn r11, #0 ; r11 == -1 + mov r12, #0 + + usub8 lr, r12, lr + ldr r9, [sp] ; load the compared result + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq mbvskip_filter ; skip filtering + + + ;vp8_hevmask() function + ;calculate high edge variance + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r9, r9, r10 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + usub8 r9, r12, r9 + sel r6, r12, r11 ; hev mask: r6 + + + ; vp8_mbfilter() function + ; p2, q2 are only needed at the end. Don't need to load them in now. + ; Transpose needs 8 regs(r6 - r12, and lr). 
Save r6 and lr first + ; load soure data to r6, r11, r12, lr + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + pkhbt r12, r7, r8, lsl #16 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + pkhbt r11, r9, r10, lsl #16 + + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + str r6, [sp] ; save r6 + str lr, [sp, #4] ; save lr + + pkhbt r6, r7, r8, lsl #16 + pkhbt lr, r9, r10, lsl #16 + + ;transpose r12, r11, r6, lr to p1, p0, q0, q1 + TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 + + ;load back hev_mask r6 and filter_mask lr + ldr r12, c0x80808080 + ldr r6, [sp] + ldr lr, [sp, #4] + + eor r7, r7, r12 ; ps1 + eor r8, r8, r12 ; ps0 + eor r9, r9, r12 ; qs0 + eor r10, r10, r12 ; qs1 + + qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + str r7, [sp, #12] ; store ps1 temporarily + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + str r10, [sp, #8] ; store qs1 temporarily + qadd8 r7, r7, r12 + str r9, [sp] ; store qs0 temporarily + qadd8 r7, r7, r12 + str r8, [sp, #4] ; store ps0 temporarily + qadd8 r7, r7, r12 ; vp8_filter: r7 + + ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 + ldr r9, c0x04040404 + ;mvn r11, #0 ; r11 == -1 + + and r7, r7, lr ; vp8_filter &= mask (lr is free) + + mov r12, r7 ; Filter2: r12 + and r12, r12, r6 ; Filter2 &= hev + + ;modify code for vp8 + ;save bottom 3 bits so that we round one side +4 and the other +3 + qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) + qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) + + mov r10, #0 + shadd8 r8 , r8 , r10 ; Filter1 >>= 3 + shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + shadd8 r8 , r8 , r10 + shadd8 r12 , r12 , r10 + shadd8 r8 , r8 , r10 ; r8: Filter1 + shadd8 r12 , r12 , r10 ; r12: Filter2 + + ldr r9, [sp] ; load qs0 + ldr r11, [sp, #4] ; load ps0 + + qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) + qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) + + ;save bottom 3 bits so that we round one side +4 and the other +3 + ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) + ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4) + ;mov r10, #0 + ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + ;usub8 lr, r8, r9 ; s = (s==4)*-1 + ;sel lr, r11, r10 + ;shadd8 r12 , r12 , r10 + ;usub8 r8, r9, r8 + ;sel r8, r11, r10 + ;ldr r9, [sp] ; load qs0 + ;ldr r11, [sp, #4] ; load ps0 + ;shadd8 r12 , r12 , r10 + ;and r8, r8, lr ; -1 for each element that equals 4 + ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) + ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) + ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) + + ;end of modification for vp8 + + bic r12, r7, r6 ;vp8_filter &= ~hev ( r6 is free) + ;mov r12, r7 + + ;roughly 3/7th difference across boundary + mov lr, #0x1b ; 27 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r7, r10, lr, r7 + smultb r10, r10, lr + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + add r10, r10, #63 + ssat r7, #8, r7, asr #7 + ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r7, r10, lsl #16 + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) + + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) + qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) + eor r8, r8, lr ; *oq0 = s^0x80 + eor r10, r10, lr ; *op0 = s^0x80 + + strb 
r10, [src, #-1] ; store op0 result + strb r8, [src], pstep ; store oq0 result + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + + ;roughly 2/7th difference across boundary + mov lr, #0x12 ; 18 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r9, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r9, #8, r9, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r9, r10, lsl #16 + + ldr r9, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load ps1 + ldr lr, c0x80808080 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + add src, src, #2 + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) + qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) + eor r8, r8, lr ; *oq1 = s^0x80 + eor r10, r10, lr ; *op1 = s^0x80 + + ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary + strb r10, [src, #-4] ; store op1 + strb r8, [src, #-1] ; store oq1 + ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + + ldrb r6, [src, #-5] + strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + orr r11, r11, r6, lsl #8 + orr r9, r9, r7, lsl #8 + + ldrb r6, [src, #-5] + strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + orr r11, r11, r6, lsl #16 + orr r9, r9, r7, lsl #16 + + ldrb r6, [src, #-5] + strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + orr r11, r11, r6, lsl #24 + orr r9, r9, r7, lsl #24 + + ;roughly 1/7th difference across boundary + eor r9, r9, lr + eor r11, r11, lr + + mov lr, #0x9 ; 9 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r12, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r12, #8, r12, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r12, r10, lsl #16 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + ldr lr, c0x80808080 + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + + qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) + qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) + eor r8, r8, lr ; *op2 = s^0x80 + eor r10, r10, lr ; *oq2 = s^0x80 + + strb r8, [src, #-5] ; store *op2 + strb r10, [src], pstep ; store *oq2 + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + + ;adjust src pointer for next loop + sub src, src, #2 + +|mbvskip_filter| + sub src, src, #4 + subs count, count, #1 + + ldrne r6, [src], pstep ; load source data + ldrne r7, [src], pstep + ldrne r8, [src], pstep + ldrne lr, [src], pstep + + bne MBVnext8 + + add sp, sp, #16 + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_mbloop_filter_vertical_edge_armv6| + +; Constant Pool +c0x80808080 DCD 
0x80808080 +c0x03030303 DCD 0x03030303 +c0x04040404 DCD 0x04040404 +c0x01010101 DCD 0x01010101 +c0x7F7F7F7F DCD 0x7F7F7F7F + + END diff --git a/vp8/common/arm/armv6/recon_v6.asm b/vp8/common/arm/armv6/recon_v6.asm new file mode 100644 index 000000000..085ff80c9 --- /dev/null +++ b/vp8/common/arm/armv6/recon_v6.asm @@ -0,0 +1,280 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_recon_b_armv6| + EXPORT |vp8_recon2b_armv6| + EXPORT |vp8_recon4b_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code +prd RN r0 +dif RN r1 +dst RN r2 +stride RN r3 + +;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride) +; R0 char* pred_ptr +; R1 short * dif_ptr +; R2 char * dst_ptr +; R3 int stride + +; Description: +; Loop through the block adding the Pred and Diff together. Clamp and then +; store back into the Dst. + +; Restrictions : +; all buffers are expected to be 4 byte aligned coming in and +; going out. +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +; +; +; +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_recon_b_armv6| PROC + stmdb sp!, {r4 - r9, lr} + + ;0, 1, 2, 3 + ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 + ldr r6, [dif, #0] ; 1 | 0 + ldr r7, [dif, #4] ; 3 | 2 + + pkhbt r8, r6, r7, lsl #16 ; 2 | 0 + pkhtb r9, r7, r6, asr #16 ; 3 | 1 + + uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 + uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 + + usat16 r8, #8, r8 + usat16 r9, #8, r9 + add dif, dif, #32 + orr r8, r8, r9, lsl #8 + + str r8, [dst], stride + + ;0, 1, 2, 3 + ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 +;; ldr r6, [dif, #8] ; 1 | 0 +;; ldr r7, [dif, #12] ; 3 | 2 + ldr r6, [dif, #0] ; 1 | 0 + ldr r7, [dif, #4] ; 3 | 2 + + pkhbt r8, r6, r7, lsl #16 ; 2 | 0 + pkhtb r9, r7, r6, asr #16 ; 3 | 1 + + uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 + uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 + + usat16 r8, #8, r8 + usat16 r9, #8, r9 + add dif, dif, #32 + orr r8, r8, r9, lsl #8 + + str r8, [dst], stride + + ;0, 1, 2, 3 + ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 +;; ldr r6, [dif, #16] ; 1 | 0 +;; ldr r7, [dif, #20] ; 3 | 2 + ldr r6, [dif, #0] ; 1 | 0 + ldr r7, [dif, #4] ; 3 | 2 + + pkhbt r8, r6, r7, lsl #16 ; 2 | 0 + pkhtb r9, r7, r6, asr #16 ; 3 | 1 + + uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 + uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 + + usat16 r8, #8, r8 + usat16 r9, #8, r9 + add dif, dif, #32 + orr r8, r8, r9, lsl #8 + + str r8, [dst], stride + + ;0, 1, 2, 3 + ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 +;; ldr r6, [dif, #24] ; 1 | 0 +;; ldr r7, [dif, #28] ; 3 | 2 + ldr r6, [dif, #0] ; 1 | 0 + ldr r7, [dif, #4] ; 3 | 2 + + pkhbt r8, r6, r7, lsl #16 ; 2 | 0 + pkhtb r9, r7, r6, asr #16 ; 3 | 1 + + uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 + uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 + + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst], stride + + ldmia sp!, {r4 - r9, pc} + + ENDP ; |recon_b| + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +; +; +; +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +; R0 char *pred_ptr +; R1 short *dif_ptr +; R2 char *dst_ptr +; R3 int stride +|vp8_recon4b_armv6| 
PROC + stmdb sp!, {r4 - r9, lr} + + mov lr, #4 + +recon4b_loop + ;0, 1, 2, 3 + ldr r4, [prd], #4 ; 3 | 2 | 1 | 0 + ldr r6, [dif, #0] ; 1 | 0 + ldr r7, [dif, #4] ; 3 | 2 + + pkhbt r8, r6, r7, lsl #16 ; 2 | 0 + pkhtb r9, r7, r6, asr #16 ; 3 | 1 + + uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 + uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 + + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst] + + ;4, 5, 6, 7 + ldr r4, [prd], #4 +;; ldr r6, [dif, #32] +;; ldr r7, [dif, #36] + ldr r6, [dif, #8] + ldr r7, [dif, #12] + + pkhbt r8, r6, r7, lsl #16 + pkhtb r9, r7, r6, asr #16 + + uxtab16 r8, r8, r4 + uxtab16 r9, r9, r4, ror #8 + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst, #4] + + ;8, 9, 10, 11 + ldr r4, [prd], #4 +;; ldr r6, [dif, #64] +;; ldr r7, [dif, #68] + ldr r6, [dif, #16] + ldr r7, [dif, #20] + + pkhbt r8, r6, r7, lsl #16 + pkhtb r9, r7, r6, asr #16 + + uxtab16 r8, r8, r4 + uxtab16 r9, r9, r4, ror #8 + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst, #8] + + ;12, 13, 14, 15 + ldr r4, [prd], #4 +;; ldr r6, [dif, #96] +;; ldr r7, [dif, #100] + ldr r6, [dif, #24] + ldr r7, [dif, #28] + + pkhbt r8, r6, r7, lsl #16 + pkhtb r9, r7, r6, asr #16 + + uxtab16 r8, r8, r4 + uxtab16 r9, r9, r4, ror #8 + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst, #12] + + add dst, dst, stride +;; add dif, dif, #8 + add dif, dif, #32 + + subs lr, lr, #1 + bne recon4b_loop + + ldmia sp!, {r4 - r9, pc} + + ENDP ; |Recon4B| + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +; +; +; +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +; R0 char *pred_ptr +; R1 short *dif_ptr +; R2 char *dst_ptr +; R3 int stride +|vp8_recon2b_armv6| PROC + stmdb sp!, {r4 - r9, lr} + + mov lr, #4 + +recon2b_loop + ;0, 1, 2, 3 + ldr r4, [prd], #4 + ldr r6, [dif, #0] + ldr r7, [dif, #4] + + pkhbt r8, r6, r7, lsl #16 + pkhtb r9, r7, r6, asr #16 + + uxtab16 r8, r8, r4 + uxtab16 r9, r9, r4, ror #8 + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst] + + ;4, 5, 6, 7 + ldr r4, [prd], #4 +;; ldr r6, [dif, #32] +;; ldr r7, [dif, #36] + ldr r6, [dif, #8] + ldr r7, [dif, #12] + + pkhbt r8, r6, r7, lsl #16 + pkhtb r9, r7, r6, asr #16 + + uxtab16 r8, r8, r4 + uxtab16 r9, r9, r4, ror #8 + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst, #4] + + add dst, dst, stride +;; add dif, dif, #8 + add dif, dif, #16 + + subs lr, lr, #1 + bne recon2b_loop + + ldmia sp!, {r4 - r9, pc} + + ENDP ; |Recon2B| + + END diff --git a/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/vp8/common/arm/armv6/simpleloopfilter_v6.asm new file mode 100644 index 000000000..15c6c7d16 --- /dev/null +++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm @@ -0,0 +1,321 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+;
+
+
+    EXPORT  |vp8_loop_filter_simple_horizontal_edge_armv6|
+    EXPORT  |vp8_loop_filter_simple_vertical_edge_armv6|
+
+    AREA    |.text|, CODE, READONLY             ; name this block of code
+
+    MACRO
+    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+    ; a0: 03 02 01 00
+    ; a1: 13 12 11 10
+    ; a2: 23 22 21 20
+    ; a3: 33 32 31 30
+    ;     b3 b2 b1 b0
+
+    uxtb16  $b1, $a1                            ; xx 12 xx 10
+    uxtb16  $b0, $a0                            ; xx 02 xx 00
+    uxtb16  $b3, $a3                            ; xx 32 xx 30
+    uxtb16  $b2, $a2                            ; xx 22 xx 20
+    orr     $b1, $b0, $b1, lsl #8               ; 12 02 10 00
+    orr     $b3, $b2, $b3, lsl #8               ; 32 22 30 20
+
+    uxtb16  $a1, $a1, ror #8                    ; xx 13 xx 11
+    uxtb16  $a3, $a3, ror #8                    ; xx 33 xx 31
+    uxtb16  $a0, $a0, ror #8                    ; xx 03 xx 01
+    uxtb16  $a2, $a2, ror #8                    ; xx 23 xx 21
+    orr     $a0, $a0, $a1, lsl #8               ; 13 03 11 01
+    orr     $a2, $a2, $a3, lsl #8               ; 33 23 31 21
+
+    pkhtb   $b2, $b3, $b1, asr #16              ; 32 22 12 02 -- p1
+    pkhbt   $b0, $b1, $b3, lsl #16              ; 30 20 10 00 -- p3
+
+    pkhtb   $b3, $a2, $a0, asr #16              ; 33 23 13 03 -- p0
+    pkhbt   $b1, $a0, $a2, lsl #16              ; 31 21 11 01 -- p2
+    MEND
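
For reference, the macro amounts to a plain 4x4 byte transpose. A minimal C model of what it computes, assuming the little-endian packing shown in the comments (the function name is ours, not part of the source):

    #include <stdint.h>

    /* Transpose a 4x4 block of bytes held in four little-endian words.
     * in[r] packs row r as bytes r3|r2|r1|r0 (r0 in the low byte);
     * out[c] packs column c the same way, matching the macro's output. */
    static void transpose_4x4_bytes(const uint32_t in[4], uint32_t out[4])
    {
        int c;
        for (c = 0; c < 4; c++) {
            out[c] = ((in[0] >> (8 * c)) & 0xff)
                   | (((in[1] >> (8 * c)) & 0xff) << 8)
                   | (((in[2] >> (8 * c)) & 0xff) << 16)
                   | (((in[3] >> (8 * c)) & 0xff) << 24);
        }
    }
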
+
+
+src     RN r0
+pstep   RN r1
+
+;r0     unsigned char *src_ptr,
+;r1     int src_pixel_step,
+;r2     const char *flimit,
+;r3     const char *limit,
+;stack  const char *thresh,
+;stack  int count
+
+;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
+;for flimit. The same applies to limit and thresh.
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb   sp!, {r4 - r11, lr}
+
+    sub     src, src, pstep, lsl #1             ; move src pointer down by 2 lines
+
+    ldr     r12, [r3], #4                       ; limit
+    ldr     r3, [src], pstep                    ; p1
+
+    ldr     r9, [sp, #36]                       ; count for 8-in-parallel
+    ldr     r4, [src], pstep                    ; p0
+
+    ldr     r7, [r2], #4                        ; flimit
+    ldr     r5, [src], pstep                    ; q0
+    ldr     r2, c0x80808080
+
+    ldr     r6, [src]                           ; q1
+
+    uadd8   r7, r7, r7                          ; flimit * 2
+    mov     r9, r9, lsl #1                      ; 4-in-parallel
+    uadd8   r12, r7, r12                        ; flimit * 2 + limit
+
+|simple_hnext8|
+    ; vp8_simple_filter_mask() function
+
+    uqsub8  r7, r3, r6                          ; p1 - q1
+    uqsub8  r8, r6, r3                          ; q1 - p1
+    uqsub8  r10, r4, r5                         ; p0 - q0
+    uqsub8  r11, r5, r4                         ; q0 - p0
+    orr     r8, r8, r7                          ; abs(p1 - q1)
+    ldr     lr, c0x7F7F7F7F                     ; 01111111 mask
+    orr     r10, r10, r11                       ; abs(p0 - q0)
+    and     r8, lr, r8, lsr #1                  ; abs(p1 - q1) / 2
+    uqadd8  r10, r10, r10                       ; abs(p0 - q0) * 2
+    mvn     lr, #0                              ; lr == -1
+    uqadd8  r10, r10, r8                        ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+    ; STALL waiting on r10 :(
+    uqsub8  r10, r10, r12                       ; compare to flimit
+    mov     r8, #0
+
+    usub8   r10, r8, r10                        ; use usub8 instead of ssub8
+    ; STALL (maybe?) when are flags set? :/
+    sel     r10, lr, r8                         ; filter mask: r10
+
+    cmp     r10, #0
+    beq     simple_hskip_filter                 ; skip filtering
+
+    ;vp8_simple_filter() function
+
+    eor     r3, r3, r2                          ; p1 offset to convert to a signed value
+    eor     r6, r6, r2                          ; q1 offset to convert to a signed value
+    eor     r4, r4, r2                          ; p0 offset to convert to a signed value
+    eor     r5, r5, r2                          ; q0 offset to convert to a signed value
+
+    qsub8   r3, r3, r6                          ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1)
+    qsub8   r6, r5, r4                          ; vp8_signed_char_clamp(q0 - p0)
+
+    qadd8   r3, r3, r6
+    ldr     r8, c0x03030303                     ; r8 = 3
+
+    qadd8   r3, r3, r6
+    ldr     r7, c0x04040404
+
+    qadd8   r3, r3, r6                          ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (q0 - p0))
+    and     r3, r3, r10                         ; vp8_filter &= mask
+
+    ;save bottom 3 bits so that we round one side +4 and the other +3
+    qadd8   r8, r3, r8                          ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+    qadd8   r3, r3, r7                          ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)
+
+    mov     r7, #0
+    shadd8  r8, r8, r7                          ; Filter2 >>= 3
+    shadd8  r3, r3, r7                          ; Filter1 >>= 3
+    shadd8  r8, r8, r7
+    shadd8  r3, r3, r7
+    shadd8  r8, r8, r7                          ; r8: Filter2
+    shadd8  r3, r3, r7                          ; r3: Filter1
+
+    ;calculate output
+    sub     src, src, pstep, lsl #1
+
+    qadd8   r4, r4, r8                          ; u = vp8_signed_char_clamp(p0 + Filter2)
+    qsub8   r5, r5, r3                          ; u = vp8_signed_char_clamp(q0 - Filter1)
+    eor     r4, r4, r2                          ; *op0 = u^0x80
+    str     r4, [src], pstep                    ; store op0 result
+    eor     r5, r5, r2                          ; *oq0 = u^0x80
+    str     r5, [src], pstep                    ; store oq0 result
+
+|simple_hskip_filter|
+    add     src, src, #4
+    sub     src, src, pstep
+    sub     src, src, pstep, lsl #1
+
+    subs    r9, r9, #1
+
+    ;pld [src]
+    ;pld [src, pstep]
+    ;pld [src, pstep, lsl #1]
+
+    ldrne   r3, [src], pstep                    ; p1
+    ldrne   r4, [src], pstep                    ; p0
+    ldrne   r5, [src], pstep                    ; q0
+    ldrne   r6, [src]                           ; q1
+
+    bne     simple_hnext8
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_loop_filter_simple_horizontal_edge_armv6|
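
Between the two edge procedures, it may help to see the scalar math that each 4-in-parallel instruction stream implements. A per-pixel C sketch based on our reading of the comments above (the helper names and signature are ours, not libvpx's):

    #include <stdint.h>
    #include <stdlib.h>

    static int8_t clamp8(int v)                /* vp8_signed_char_clamp */
    {
        return (int8_t)(v < -128 ? -128 : (v > 127 ? 127 : v));
    }

    /* One pixel position; the asm runs four of these per instruction. */
    static void simple_filter(uint8_t *p1, uint8_t *p0,
                              uint8_t *q0, uint8_t *q1,
                              int flimit2_plus_limit)
    {
        /* vp8_simple_filter_mask(): breakout test */
        if (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > flimit2_plus_limit)
            return;

        /* the eor with 0x80 biases pixels into signed values */
        int ps1 = (int8_t)(*p1 ^ 0x80), ps0 = (int8_t)(*p0 ^ 0x80);
        int qs0 = (int8_t)(*q0 ^ 0x80), qs1 = (int8_t)(*q1 ^ 0x80);

        int d = clamp8(qs0 - ps0);
        int f = clamp8(ps1 - qs1);
        f = clamp8(f + d);                     /* f += 3 * (q0 - p0) */
        f = clamp8(f + d);
        f = clamp8(f + d);

        /* +4/+3 rounding; >> is arithmetic, like the shadd8 chains */
        int filter1 = clamp8(f + 4) >> 3;
        int filter2 = clamp8(f + 3) >> 3;

        *q0 = (uint8_t)(clamp8(qs0 - filter1) ^ 0x80);
        *p0 = (uint8_t)(clamp8(ps0 + filter2) ^ 0x80);
    }
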
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_simple_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r12, [r2], #4                       ; r12: flimit
+    ldr     r2, c0x80808080
+    ldr     r7, [r3], #4                        ; limit
+
+    ; load source data to r7, r8, r9, r10
+    ldrh    r3, [src, #-2]
+    ldrh    r4, [src], pstep
+    uadd8   r12, r12, r12                       ; flimit * 2
+
+    ldrh    r5, [src, #-2]
+    ldrh    r6, [src], pstep
+    uadd8   r12, r12, r7                        ; flimit * 2 + limit
+
+    pkhbt   r7, r3, r4, lsl #16
+
+    ldrh    r3, [src, #-2]
+    ldrh    r4, [src], pstep
+    ldr     r11, [sp, #40]                      ; count (r11) for 8-in-parallel
+
+    pkhbt   r8, r5, r6, lsl #16
+
+    ldrh    r5, [src, #-2]
+    ldrh    r6, [src], pstep
+    mov     r11, r11, lsl #1                    ; 4-in-parallel
+
+|simple_vnext8|
+    ; vp8_simple_filter_mask() function
+    pkhbt   r9, r3, r4, lsl #16
+    pkhbt   r10, r5, r6, lsl #16
+
+    ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
+    TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
+
+    uqsub8  r7, r3, r6                          ; p1 - q1
+    uqsub8  r8, r6, r3                          ; q1 - p1
+    uqsub8  r9, r4, r5                          ; p0 - q0
+    uqsub8  r10, r5, r4                         ; q0 - p0
+    orr     r7, r7, r8                          ; abs(p1 - q1)
+    orr     r9, r9, r10                         ; abs(p0 - q0)
+    ldr     lr, c0x7F7F7F7F                     ; 0111 1111 mask
+    uqadd8  r9, r9, r9                          ; abs(p0 - q0) * 2
+    and     r7, lr, r7, lsr #1                  ; abs(p1 - q1) / 2
+    mov     r8, #0
+    uqadd8  r7, r7, r9                          ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+    mvn     r10, #0                             ; r10 == -1
+    uqsub8  r7, r7, r12                         ; compare to flimit
+
+    usub8   r7, r8, r7
+    sel     lr, r10, r8                         ; filter mask: lr
+
+    cmp     lr, #0
+    beq     simple_vskip_filter                 ; skip filtering
+
+    ;vp8_simple_filter() function
+    eor     r3, r3, r2                          ; p1 offset to convert to a signed value
+    eor     r6, r6, r2                          ; q1 offset to convert to a signed value
+    eor     r4, r4, r2                          ; p0 offset to convert to a signed value
+    eor     r5, r5, r2                          ; q0 offset to convert to a signed value
+
+    qsub8   r3, r3, r6                          ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1)
+    qsub8   r6, r5, r4                          ; vp8_signed_char_clamp(q0 - p0)
+
+    qadd8   r3, r3, r6
+    ldr     r8, c0x03030303                     ; r8 = 3
+
+    qadd8   r3, r3, r6
+    ldr     r7, c0x04040404
+
+    qadd8   r3, r3, r6                          ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (q0 - p0))
+    and     r3, r3, lr                          ; vp8_filter &= mask
+
+    ;save bottom 3 bits so that we round one side +4 and the other +3
+    qadd8   r8, r3, r8                          ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+    qadd8   r3, r3, r7                          ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)
+
+    mov     r7, #0
+    shadd8  r8, r8, r7                          ; Filter2 >>= 3
+    shadd8  r3, r3, r7                          ; Filter1 >>= 3
+    shadd8  r8, r8, r7
+    shadd8  r3, r3, r7
+    shadd8  r8, r8, r7                          ; r8: Filter2
+    shadd8  r3, r3, r7                          ; r3: Filter1
+
+    ;calculate output
+    sub     src, src, pstep, lsl #2
+
+    qadd8   r4, r4, r8                          ; u = vp8_signed_char_clamp(p0 + Filter2)
+    qsub8   r5, r5, r3                          ; u = vp8_signed_char_clamp(q0 - Filter1)
+    eor     r4, r4, r2                          ; *op0 = u^0x80
+    eor     r5, r5, r2                          ; *oq0 = u^0x80
+
+    strb    r4, [src, #-1]                      ; store the result
+    mov     r4, r4, lsr #8
+    strb    r5, [src], pstep
+    mov     r5, r5, lsr #8
+
+    strb    r4, [src, #-1]
+    mov     r4, r4, lsr #8
+    strb    r5, [src], pstep
+    mov     r5, r5, lsr #8
+
+    strb    r4, [src, #-1]
+    mov     r4, r4, lsr #8
+    strb    r5, [src], pstep
+    mov     r5, r5, lsr #8
+
+    strb    r4, [src, #-1]
+    strb    r5, [src], pstep
+
+|simple_vskip_filter|
+    subs    r11, r11, #1
+
+    ;pld [src]
+    ;pld [src, pstep]
+    ;pld [src, pstep, lsl #1]
+
+    ; load source data to r7, r8, r9, r10
+    ldrneh  r3, [src, #-2]
+    ldrneh  r4, [src], pstep
+
+    ldrneh  r5, [src, #-2]
+    ldrneh  r6, [src], pstep
+
+    pkhbt   r7, r3, r4, lsl #16
+
+    ldrneh  r3, [src, #-2]
+    ldrneh  r4, [src], pstep
+
+    pkhbt   r8, r5, r6, lsl #16
+
+    ldrneh  r5, [src, #-2]
+    ldrneh  r6, [src], pstep
+
+    bne     simple_vnext8
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_loop_filter_simple_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD     0x80808080
+c0x03030303 DCD     0x03030303
+c0x04040404 DCD     0x04040404
+c0x01010101 DCD     0x01010101
+c0x7F7F7F7F DCD     0x7F7F7F7F
+
+    END
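
Both procedures in this file, and the full loop filters above, build their breakout masks from the same per-byte saturating-subtract idiom. A minimal C model of the UQSUB8 building block and the absolute-difference trick (names are ours, shown only for reference):

    #include <stdint.h>

    /* C analogue of the ARMv6 UQSUB8 instruction:
     * each byte lane computes max(a - b, 0). */
    static uint32_t uqsub8(uint32_t a, uint32_t b)
    {
        uint32_t r = 0;
        int i;
        for (i = 0; i < 32; i += 8) {
            int d = (int)((a >> i) & 0xff) - (int)((b >> i) & 0xff);
            r |= (uint32_t)(d < 0 ? 0 : d) << i;
        }
        return r;
    }

    /* Per-byte |a - b|: one of the two saturating differences is
     * always zero, so or-ing them gives the absolute value. */
    static uint32_t absdiff8(uint32_t a, uint32_t b)
    {
        return uqsub8(a, b) | uqsub8(b, a);
    }

uqsub8(absdiff8(a, b), limits) is then non-zero only in byte lanes that exceed their limit, which is exactly how the "compare to flimit/limit/thresh" lines accumulate a filter mask.
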
diff --git a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
new file mode 100644
index 000000000..551d863e9
--- /dev/null
+++ b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -0,0 +1,277 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sixtap_predict8x4_armv6|
+
+    AREA    |.text|, CODE, READONLY             ; name this block of code
+;-------------------------------------
+; r0    unsigned char *src_ptr,
+; r1    int src_pixels_per_line,
+; r2    int xoffset,
+; r3    int yoffset,
+; stack unsigned char *dst_ptr,
+; stack int dst_pitch
+;-------------------------------------
+;Note: the first pass stores its result transposed (8 lines x 9 columns) on the
+;stack. The temporary buffer is 184 bytes; each line takes 20 bytes, i.e. 9
+;shorts plus 2 bytes of padding for 4-byte alignment. The second pass loads the
+;data back from the stack and stores its result transposed to the destination.
+|vp8_sixtap_predict8x4_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+    sub     sp, sp, #184                        ;reserve space on stack for temporary storage: 20x(8+1) +4
+
+    cmp     r2, #0                              ;skip first_pass filter if xoffset=0
+    str     r3, [sp], #4                        ;store yoffset
+    beq     skip_firstpass_filter
+
+;first-pass filter
+    ldr     r12, _filter8_coeff_
+    sub     r0, r0, r1, lsl #1
+
+    add     r2, r12, r2, lsl #4                 ;calculate filter location
+    add     r0, r0, #3                          ;adjust src only for loading convenience
+
+    ldr     r3, [r2]                            ; load up packed filter coefficients
+    ldr     r4, [r2, #4]
+    ldr     r5, [r2, #8]
+
+    mov     r2, #0x90000                        ; height=9 is top part of counter
+
+    sub     r1, r1, #8
+    mov     lr, #20
+
+|first_pass_hloop_v6|
+    ldrb    r6, [r0, #-5]                       ; load source data
+    ldrb    r7, [r0, #-4]
+    ldrb    r8, [r0, #-3]
+    ldrb    r9, [r0, #-2]
+    ldrb    r10, [r0, #-1]
+
+    orr     r2, r2, #0x4                        ; construct loop counter. width=8=4x2
+
+    pkhbt   r6, r6, r7, lsl #16                 ; r7 | r6
+    pkhbt   r7, r7, r8, lsl #16                 ; r8 | r7
+
+    pkhbt   r8, r8, r9, lsl #16                 ; r9 | r8
+    pkhbt   r9, r9, r10, lsl #16                ; r10 | r9
+
+|first_pass_wloop_v6|
+    smuad   r11, r6, r3                         ; vp8_filter[0], vp8_filter[1]
+    smuad   r12, r7, r3
+
+    ldrb    r6, [r0], #1
+
+    smlad   r11, r8, r4, r11                    ; vp8_filter[2], vp8_filter[3]
+    ldrb    r7, [r0], #1
+    smlad   r12, r9, r4, r12
+
+    pkhbt   r10, r10, r6, lsl #16               ; r6 | r10
+    pkhbt   r6, r6, r7, lsl #16                 ; r7 | r6
+    smlad   r11, r10, r5, r11                   ; vp8_filter[4], vp8_filter[5]
+    smlad   r12, r6, r5, r12
+
+    sub     r2, r2, #1
+
+    add     r11, r11, #0x40                     ; round_shift_and_clamp
+    tst     r2, #0xff                           ; test loop counter
+    usat    r11, #8, r11, asr #7
+    add     r12, r12, #0x40
+    strh    r11, [sp], lr                       ; result is transposed and stored
+    usat    r12, #8, r12, asr #7
+
+    strh    r12, [sp], lr
+
+    movne   r11, r6
+    movne   r12, r7
+
+    movne   r6, r8
+    movne   r7, r9
+    movne   r8, r10
+    movne   r9, r11
+    movne   r10, r12
+
+    bne     first_pass_wloop_v6
+
+    ;;add  r9, ppl, #30                         ; attempt to load 2 adjacent cache lines
+    ;;IF ARCHITECTURE=6
+    ;pld    [src, ppl]
+    ;;pld   [src, r9]
+    ;;ENDIF
+
+    subs    r2, r2, #0x10000
+
+    mov     r6, #158
+    sub     sp, sp, r6
+
+    add     r0, r0, r1                          ; move to next input line
+
+    bne     first_pass_hloop_v6
+
+;second pass filter
+secondpass_filter
+    mov     r1, #18
+    sub     sp, sp, r1                          ; 18+4
+
+    ldr     r3, [sp, #-4]                       ; load back yoffset
+    ldr     r0, [sp, #216]                      ; load dst address from stack 180+36
+    ldr     r1, [sp, #220]                      ; load dst stride from stack 180+40
+
+    cmp     r3, #0
+    beq     skip_secondpass_filter
+
+    ldr     r12, _filter8_coeff_
+    add     lr, r12, r3, lsl #4                 ;calculate filter location
+
+    mov     r2, #0x00080000
+
+    ldr     r3, [lr]                            ; load up packed filter coefficients
+    ldr     r4, [lr, #4]
+    ldr     r5, [lr, #8]
+
+    pkhbt   r12, r4, r3                         ; pack the filter differently
+    pkhbt   r11, r5, r4
+
+second_pass_hloop_v6
+    ldr     r6, [sp]                            ; load the data
+    ldr     r7, [sp, #4]
+
+    orr     r2, r2, #2                          ; loop counter
+
+second_pass_wloop_v6
+    smuad   lr, r3, r6                          ; apply filter
+    smulbt  r10, r3, r6
+
+    ldr     r8, [sp, #8]
+
+    smlad   lr, r4, r7, lr
+    smladx  r10, r12, r7, r10
+
+    ldrh    r9, [sp, #12]
+
+    smlad   lr, r5, r8, lr
+    smladx  r10, r11, r8, r10
+
+    add     sp, sp, #4
+    smlatb  r10, r5, r9, r10
+
+    sub     r2, r2, #1
+
+    add     lr, lr, #0x40                       ; round_shift_and_clamp
+    tst     r2, #0xff
+    usat    lr, #8, lr, asr #7
+    add     r10, r10, #0x40
+    strb    lr, [r0], r1                        ; the result is transposed back and stored
+    usat    r10, #8, r10, asr #7
+
+    strb    r10, [r0], r1
+
+    movne   r6, r7
+    movne   r7, r8
+
+    bne     second_pass_wloop_v6
+
+    subs    r2, r2, #0x10000
+    add     sp, sp, #12                         ; update src for next loop (20-8)
+    sub     r0, r0, r1, lsl #2
+    add     r0, r0, #1
+
+    bne     second_pass_hloop_v6
+
+    add     sp, sp, #20
+    ldmia   sp!, {r4 - r11, pc}
+
+;--------------------
+skip_firstpass_filter
+    sub     r0, r0, r1, lsl #1
+    sub     r1, r1, #8
+    mov     r2, #9
+    mov     r3, #20
+
+skip_firstpass_hloop
+    ldrb    r4, [r0], #1                        ; load data
+    subs    r2, r2, #1
+    ldrb    r5, [r0], #1
+    strh    r4, [sp], r3                        ; store it to immediate buffer
+    ldrb    r6, [r0], #1                        ; load data
+    strh    r5, [sp], r3
+    ldrb    r7, [r0], #1
+    strh    r6, [sp], r3
+    ldrb    r8, [r0], #1
+    strh    r7, [sp], r3
+    ldrb    r9, [r0], #1
+    strh    r8, [sp], r3
+    ldrb    r10, [r0], #1
+    strh    r9, [sp], r3
+    ldrb    r11, [r0], #1
+    strh    r10, [sp], r3
+    add     r0, r0, r1                          ; move to next input line
+    strh    r11, [sp], r3
+
+    mov     r4, #158
+    sub     sp, sp, r4                          ; move over to next column
+    bne     skip_firstpass_hloop
+
+    b       secondpass_filter
+
+;--------------------
+skip_secondpass_filter
+    mov     r2, #8
+    add     sp, sp, #4                          ;start from src[0] instead of src[-2]
+
+skip_secondpass_hloop
+    ldr     r6, [sp], #4
+    subs    r2, r2, #1
+    ldr     r8, [sp], #4
+
+    mov     r7, r6, lsr #16                     ; unpack
+    strb    r6, [r0], r1
+    mov     r9, r8, lsr #16
+    strb    r7, [r0], r1
+    add     sp, sp, #12                         ; 20-8
+    strb    r8, [r0], r1
+    strb    r9, [r0], r1
+
+    sub     r0, r0, r1, lsl #2
+    add     r0, r0, #1
+
+    bne     skip_secondpass_hloop
+
+    add     sp, sp, #16                         ; 180 - (160 +4)
+
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+;-----------------
+    AREA    subpelfilters8_dat, DATA, READWRITE ;read/write by default
+;This data section holds the packed six-tap filter coefficients, one word per
+;DCD entry and four words per sub-pixel position. The table is reached through
+;the pointer _filter8_coeff_; individual entries live at filter8_coeff,
+;filter8_coeff+4, filter8_coeff+8 ...
+_filter8_coeff_
+    DCD     filter8_coeff
+filter8_coeff
+    DCD     0x00000000, 0x00000080, 0x00000000, 0x00000000
+    DCD     0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000
+    DCD     0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000
+    DCD     0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000
+    DCD     0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000
+    DCD     0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000
+    DCD     0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000
+    DCD     0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000
+
+    ;DCD    0,   0, 128,   0,   0, 0
+    ;DCD    0,  -6, 123,  12,  -1, 0
+    ;DCD    2, -11, 108,  36,  -8, 1
+    ;DCD    0,  -9,  93,  50,  -6, 0
+    ;DCD    3, -16,  77,  77, -16, 3
+    ;DCD    0,  -6,  50,  93,  -9, 0
+    ;DCD    1,  -8,  36, 108, -11, 2
+    ;DCD    0,  -1,  12, 123,  -6, 0
+
+    END
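
As a closing reference: the commented-out decimal rows above are the six taps for each of the eight sub-pixel positions, and every row sums to 128, which is why both passes round with +64 and shift right by 7. A simplified, unoptimized C sketch of the two-pass prediction under that reading (names are ours; the real code keeps the intermediate buffer transposed, clamps with usat, and skips a pass entirely when its offset is 0):

    #include <stdint.h>

    static const int16_t kSixTap[8][6] = {     /* per xoffset/yoffset */
        { 0,   0, 128,   0,   0, 0 },
        { 0,  -6, 123,  12,  -1, 0 },
        { 2, -11, 108,  36,  -8, 1 },
        { 0,  -9,  93,  50,  -6, 0 },
        { 3, -16,  77,  77, -16, 3 },
        { 0,  -6,  50,  93,  -9, 0 },
        { 1,  -8,  36, 108, -11, 2 },
        { 0,  -1,  12, 123,  -6, 0 },
    };

    static uint8_t clamp255(int v)
    {
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    static void sixtap_predict8x4(const uint8_t *src, int src_stride,
                                  int xoffset, int yoffset,
                                  uint8_t *dst, int dst_stride)
    {
        int tmp[9][8]; /* 4 output rows + 5 extra rows for the vertical taps */
        int r, c, t;

        /* first pass: horizontal six-tap over the 9 source rows -2..6 */
        for (r = 0; r < 9; r++)
            for (c = 0; c < 8; c++) {
                const uint8_t *s = src + (r - 2) * src_stride + (c - 2);
                int sum = 64;                  /* round_shift_and_clamp */
                for (t = 0; t < 6; t++)
                    sum += s[t] * kSixTap[xoffset][t];
                tmp[r][c] = clamp255(sum >> 7);
            }

        /* second pass: vertical six-tap producing the 8x4 block */
        for (r = 0; r < 4; r++)
            for (c = 0; c < 8; c++) {
                int sum = 64;
                for (t = 0; t < 6; t++)
                    sum += tmp[r + t][c] * kSixTap[yoffset][t];
                dst[r * dst_stride + c] = clamp255(sum >> 7);
            }
    }
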