path: root/vp8/common/arm/armv6
author    John Koleszar <jkoleszar@google.com>  2010-05-18 11:58:33 -0400
committer John Koleszar <jkoleszar@google.com>  2010-05-18 11:58:33 -0400
commit    0ea50ce9cb4b65eee6afa1d041fe8beb5abda667 (patch)
tree      1f3b9019f28bc56fd3156f96e5a9653a983ee61b /vp8/common/arm/armv6
Initial WebM release
Diffstat (limited to 'vp8/common/arm/armv6')
-rw-r--r--  vp8/common/arm/armv6/bilinearfilter_v6.asm      237
-rw-r--r--  vp8/common/arm/armv6/copymem16x16_v6.asm        181
-rw-r--r--  vp8/common/arm/armv6/copymem8x4_v6.asm          127
-rw-r--r--  vp8/common/arm/armv6/copymem8x8_v6.asm          127
-rw-r--r--  vp8/common/arm/armv6/filter_v6.asm              383
-rw-r--r--  vp8/common/arm/armv6/idct_v6.asm                376
-rw-r--r--  vp8/common/arm/armv6/iwalsh_v6.asm              151
-rw-r--r--  vp8/common/arm/armv6/loopfilter_v6.asm         1263
-rw-r--r--  vp8/common/arm/armv6/recon_v6.asm               280
-rw-r--r--  vp8/common/arm/armv6/simpleloopfilter_v6.asm    321
-rw-r--r--  vp8/common/arm/armv6/sixtappredict8x4_v6.asm    277
11 files changed, 3723 insertions, 0 deletions
diff --git a/vp8/common/arm/armv6/bilinearfilter_v6.asm b/vp8/common/arm/armv6/bilinearfilter_v6.asm
new file mode 100644
index 000000000..4428cf8ff
--- /dev/null
+++ b/vp8/common/arm/armv6/bilinearfilter_v6.asm
@@ -0,0 +1,237 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_filter_block2d_bil_first_pass_armv6|
+ EXPORT |vp8_filter_block2d_bil_second_pass_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;-------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 unsigned short *output_ptr,
+; r2 unsigned int src_pixels_per_line,
+; r3 unsigned int output_height,
+; stack unsigned int output_width,
+; stack const short *vp8_filter
+;-------------------------------------
+; The output is transposed and stored in the output array to make it easy for second pass filtering.
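+; For reference, a minimal C model of this pass (an illustrative sketch, not
+; code from this tree): a 2-tap horizontal filter whose 16-bit results are
+; written transposed, using the (output_height*2 + 2)-byte column stride
+; computed below.
+;
+;   void bil_first_pass_model(const unsigned char *src, unsigned short *dst,
+;                             unsigned int src_pixels_per_line,
+;                             unsigned int output_height,
+;                             unsigned int output_width,
+;                             const short *vp8_filter)
+;   {
+;       unsigned int i, j;
+;       for (i = 0; i < output_height; i++) {
+;           for (j = 0; j < output_width; j++) {
+;               int sum = src[j] * vp8_filter[0] + src[j + 1] * vp8_filter[1];
+;               /* transposed store: column stride is output_height + 1 shorts */
+;               dst[j * (output_height + 1) + i] = (unsigned short)((sum + 64) >> 7);
+;           }
+;           src += src_pixels_per_line;
+;       }
+;   }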
+|vp8_filter_block2d_bil_first_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r4, [sp, #36] ; output width
+
+ mov r12, r3 ; outer-loop counter
+ sub r2, r2, r4 ; src increment for height loop
+
+ ;;IF ARCHITECTURE=6
+ pld [r0]
+ ;;ENDIF
+
+ ldr r5, [r11] ; load up filter coefficients
+
+ mov r3, r3, lsl #1 ; output_height*2
+ add r3, r3, #2 ; plus 2 to make the output buffer 4-byte aligned, since height is actually (height+1)
+
+ mov r11, r1 ; save output_ptr for each row
+
+ cmp r5, #128 ; if filter coef = 128, then skip the filter
+ beq bil_null_1st_filter
+
+|bil_height_loop_1st_v6|
+ ldrb r6, [r0] ; load source data
+ ldrb r7, [r0, #1]
+ ldrb r8, [r0, #2]
+ mov lr, r4, lsr #2 ; 4-in-parallel loop counter
+
+|bil_width_loop_1st_v6|
+ ldrb r9, [r0, #3]
+ ldrb r10, [r0, #4]
+
+ pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
+ pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
+
+ smuad r6, r6, r5 ; apply the filter
+ pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
+ smuad r7, r7, r5
+ pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
+
+ smuad r8, r8, r5
+ smuad r9, r9, r5
+
+ add r0, r0, #4
+ subs lr, lr, #1
+
+ add r6, r6, #0x40 ; round_shift_and_clamp
+ add r7, r7, #0x40
+ usat r6, #16, r6, asr #7
+ usat r7, #16, r7, asr #7
+
+ strh r6, [r1], r3 ; result is transposed and stored
+
+ add r8, r8, #0x40 ; round_shift_and_clamp
+ strh r7, [r1], r3
+ add r9, r9, #0x40
+ usat r8, #16, r8, asr #7
+ usat r9, #16, r9, asr #7
+
+ strh r8, [r1], r3 ; result is transposed and stored
+
+ ldrneb r6, [r0] ; load source data
+ strh r9, [r1], r3
+
+ ldrneb r7, [r0, #1]
+ ldrneb r8, [r0, #2]
+
+ bne bil_width_loop_1st_v6
+
+ add r0, r0, r2 ; move to next input row
+ subs r12, r12, #1
+
+ ;;IF ARCHITECTURE=6
+ pld [r0]
+ ;;ENDIF
+
+ add r11, r11, #2 ; move over to next column
+ mov r1, r11
+
+ bne bil_height_loop_1st_v6
+
+ ldmia sp!, {r4 - r11, pc}
+
+|bil_null_1st_filter|
+|bil_height_loop_null_1st|
+ mov lr, r4, lsr #2 ; loop counter
+
+|bil_width_loop_null_1st|
+ ldrb r6, [r0] ; load data
+ ldrb r7, [r0, #1]
+ ldrb r8, [r0, #2]
+ ldrb r9, [r0, #3]
+
+ strh r6, [r1], r3 ; store it to the intermediate buffer
+ add r0, r0, #4
+ strh r7, [r1], r3
+ subs lr, lr, #1
+ strh r8, [r1], r3
+ strh r9, [r1], r3
+
+ bne bil_width_loop_null_1st
+
+ subs r12, r12, #1
+ add r0, r0, r2 ; move to next input line
+ add r11, r11, #2 ; move over to next column
+ mov r1, r11
+
+ bne bil_height_loop_null_1st
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP ; |vp8_filter_block2d_bil_first_pass_armv6|
+
+
+;---------------------------------
+; r0 unsigned short *src_ptr,
+; r1 unsigned char *output_ptr,
+; r2 int output_pitch,
+; r3 unsigned int output_height,
+; stack unsigned int output_width,
+; stack const short *vp8_filter
+;---------------------------------
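+; For reference, a minimal C model of this pass (an illustrative sketch, not
+; code from this tree): it walks the transposed 16-bit data one column at a
+; time (the column stride works out to output_height + 2 halfwords: the inner
+; loop consumes output_height of them and the add #4 after it skips two more),
+; applies the 2-tap filter and writes clamped bytes back in raster order.
+;
+;   void bil_second_pass_model(const unsigned short *src, unsigned char *dst,
+;                              int output_pitch, unsigned int output_height,
+;                              unsigned int output_width,
+;                              const short *vp8_filter)
+;   {
+;       unsigned int i, j;
+;       for (j = 0; j < output_width; j++) {
+;           for (i = 0; i < output_height; i++) {
+;               int sum = src[i] * vp8_filter[0] + src[i + 1] * vp8_filter[1];
+;               sum = (sum + 64) >> 7;
+;               if (sum > 255) sum = 255;              /* usat #8 below */
+;               dst[i * output_pitch + j] = (unsigned char)sum;
+;           }
+;           src += output_height + 2;                  /* next transposed column */
+;       }
+;   }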
+|vp8_filter_block2d_bil_second_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r4, [sp, #36] ; output width
+
+ ldr r5, [r11] ; load up filter coefficients
+ mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
+ mov r11, r1
+
+ cmp r5, #128 ; if filter coef = 128, then skip the filter
+ beq bil_null_2nd_filter
+
+|bil_height_loop_2nd|
+ ldr r6, [r0] ; load the data
+ ldr r8, [r0, #4]
+ ldrh r10, [r0, #8]
+ mov lr, r3, lsr #2 ; loop counter
+
+|bil_width_loop_2nd|
+ pkhtb r7, r6, r8 ; src[1] | src[2]
+ pkhtb r9, r8, r10 ; src[3] | src[4]
+
+ smuad r6, r6, r5 ; apply filter
+ smuad r8, r8, r5 ; apply filter
+
+ subs lr, lr, #1
+
+ smuadx r7, r7, r5 ; apply filter
+ smuadx r9, r9, r5 ; apply filter
+
+ add r0, r0, #8
+
+ add r6, r6, #0x40 ; round_shift_and_clamp
+ add r7, r7, #0x40
+ usat r6, #8, r6, asr #7
+ usat r7, #8, r7, asr #7
+ strb r6, [r1], r2 ; the result is transposed back and stored
+
+ add r8, r8, #0x40 ; round_shift_and_clamp
+ strb r7, [r1], r2
+ add r9, r9, #0x40
+ usat r8, #8, r8, asr #7
+ usat r9, #8, r9, asr #7
+ strb r8, [r1], r2 ; the result is transposed back and stored
+
+ ldrne r6, [r0] ; load data
+ strb r9, [r1], r2
+ ldrne r8, [r0, #4]
+ ldrneh r10, [r0, #8]
+
+ bne bil_width_loop_2nd
+
+ subs r12, r12, #1
+ add r0, r0, #4 ; update src for next row
+ add r11, r11, #1
+ mov r1, r11
+
+ bne bil_height_loop_2nd
+ ldmia sp!, {r4 - r11, pc}
+
+|bil_null_2nd_filter|
+|bil_height_loop_null_2nd|
+ mov lr, r3, lsr #2
+
+|bil_width_loop_null_2nd|
+ ldr r6, [r0], #4 ; load data
+ subs lr, lr, #1
+ ldr r8, [r0], #4
+
+ strb r6, [r1], r2 ; store data
+ mov r7, r6, lsr #16
+ strb r7, [r1], r2
+ mov r9, r8, lsr #16
+ strb r8, [r1], r2
+ strb r9, [r1], r2
+
+ bne bil_width_loop_null_2nd
+
+ subs r12, r12, #1
+ add r0, r0, #4
+ add r11, r11, #1
+ mov r1, r11
+
+ bne bil_height_loop_null_2nd
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_filter_block2d_bil_second_pass_armv6|
+
+ END
diff --git a/vp8/common/arm/armv6/copymem16x16_v6.asm b/vp8/common/arm/armv6/copymem16x16_v6.asm
new file mode 100644
index 000000000..00e97397c
--- /dev/null
+++ b/vp8/common/arm/armv6/copymem16x16_v6.asm
@@ -0,0 +1,181 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem16x16_v6|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
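+; For reference, the routine is equivalent to this C model (an illustrative
+; sketch, not code from this tree); the ARMv6 code below merely selects a
+; byte, word, doubleword or 16-byte copy path based on the source alignment.
+;
+;   #include <string.h>
+;
+;   void copy_mem16x16_model(unsigned char *src, int src_stride,
+;                            unsigned char *dst, int dst_stride)
+;   {
+;       int r;
+;       for (r = 0; r < 16; r++) {
+;           memcpy(dst, src, 16);
+;           src += src_stride;
+;           dst += dst_stride;
+;       }
+;   }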
+|vp8_copy_mem16x16_v6| PROC
+ stmdb sp!, {r4 - r7}
+ ;push {r4-r7}
+
+ ;preload
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ ands r4, r0, #15
+ beq copy_mem16x16_fast
+
+ ands r4, r0, #7
+ beq copy_mem16x16_8
+
+ ands r4, r0, #3
+ beq copy_mem16x16_4
+
+ ;copy one byte each time
+ ldrb r4, [r0]
+ ldrb r5, [r0, #1]
+ ldrb r6, [r0, #2]
+ ldrb r7, [r0, #3]
+
+ mov r12, #16
+
+copy_mem16x16_1_loop
+ strb r4, [r2]
+ strb r5, [r2, #1]
+ strb r6, [r2, #2]
+ strb r7, [r2, #3]
+
+ ldrb r4, [r0, #4]
+ ldrb r5, [r0, #5]
+ ldrb r6, [r0, #6]
+ ldrb r7, [r0, #7]
+
+ subs r12, r12, #1
+
+ strb r4, [r2, #4]
+ strb r5, [r2, #5]
+ strb r6, [r2, #6]
+ strb r7, [r2, #7]
+
+ ldrb r4, [r0, #8]
+ ldrb r5, [r0, #9]
+ ldrb r6, [r0, #10]
+ ldrb r7, [r0, #11]
+
+ strb r4, [r2, #8]
+ strb r5, [r2, #9]
+ strb r6, [r2, #10]
+ strb r7, [r2, #11]
+
+ ldrb r4, [r0, #12]
+ ldrb r5, [r0, #13]
+ ldrb r6, [r0, #14]
+ ldrb r7, [r0, #15]
+
+ add r0, r0, r1
+
+ strb r4, [r2, #12]
+ strb r5, [r2, #13]
+ strb r6, [r2, #14]
+ strb r7, [r2, #15]
+
+ add r2, r2, r3
+
+ ldrneb r4, [r0]
+ ldrneb r5, [r0, #1]
+ ldrneb r6, [r0, #2]
+ ldrneb r7, [r0, #3]
+
+ bne copy_mem16x16_1_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+;copy 4 bytes each time
+copy_mem16x16_4
+ ldr r4, [r0]
+ ldr r5, [r0, #4]
+ ldr r6, [r0, #8]
+ ldr r7, [r0, #12]
+
+ mov r12, #16
+
+copy_mem16x16_4_loop
+ subs r12, r12, #1
+ add r0, r0, r1
+
+ str r4, [r2]
+ str r5, [r2, #4]
+ str r6, [r2, #8]
+ str r7, [r2, #12]
+
+ add r2, r2, r3
+
+ ldrne r4, [r0]
+ ldrne r5, [r0, #4]
+ ldrne r6, [r0, #8]
+ ldrne r7, [r0, #12]
+
+ bne copy_mem16x16_4_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+;copy 8 bytes each time
+copy_mem16x16_8
+ sub r1, r1, #16
+ sub r3, r3, #16
+
+ mov r12, #16
+
+copy_mem16x16_8_loop
+ ldmia r0!, {r4-r5}
+ ;ldm r0, {r4-r5}
+ ldmia r0!, {r6-r7}
+
+ add r0, r0, r1
+
+ stmia r2!, {r4-r5}
+ subs r12, r12, #1
+ ;stm r2, {r4-r5}
+ stmia r2!, {r6-r7}
+
+ add r2, r2, r3
+
+ bne copy_mem16x16_8_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+;copy 16 bytes each time
+copy_mem16x16_fast
+ ;sub r1, r1, #16
+ ;sub r3, r3, #16
+
+ mov r12, #16
+
+copy_mem16x16_fast_loop
+ ldmia r0, {r4-r7}
+ ;ldm r0, {r4-r7}
+ add r0, r0, r1
+
+ subs r12, r12, #1
+ stmia r2, {r4-r7}
+ ;stm r2, {r4-r7}
+ add r2, r2, r3
+
+ bne copy_mem16x16_fast_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem16x16_v6|
+
+ END
diff --git a/vp8/common/arm/armv6/copymem8x4_v6.asm b/vp8/common/arm/armv6/copymem8x4_v6.asm
new file mode 100644
index 000000000..94473ca65
--- /dev/null
+++ b/vp8/common/arm/armv6/copymem8x4_v6.asm
@@ -0,0 +1,127 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem8x4_v6|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void vp8_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x4_v6| PROC
+ ;push {r4-r5}
+ stmdb sp!, {r4-r5}
+
+ ;preload
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ ands r4, r0, #7
+ beq copy_mem8x4_fast
+
+ ands r4, r0, #3
+ beq copy_mem8x4_4
+
+ ;copy 1 byte each time
+ ldrb r4, [r0]
+ ldrb r5, [r0, #1]
+
+ mov r12, #4
+
+copy_mem8x4_1_loop
+ strb r4, [r2]
+ strb r5, [r2, #1]
+
+ ldrb r4, [r0, #2]
+ ldrb r5, [r0, #3]
+
+ subs r12, r12, #1
+
+ strb r4, [r2, #2]
+ strb r5, [r2, #3]
+
+ ldrb r4, [r0, #4]
+ ldrb r5, [r0, #5]
+
+ strb r4, [r2, #4]
+ strb r5, [r2, #5]
+
+ ldrb r4, [r0, #6]
+ ldrb r5, [r0, #7]
+
+ add r0, r0, r1
+
+ strb r4, [r2, #6]
+ strb r5, [r2, #7]
+
+ add r2, r2, r3
+
+ ldrneb r4, [r0]
+ ldrneb r5, [r0, #1]
+
+ bne copy_mem8x4_1_loop
+
+ ldmia sp!, {r4 - r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 4 bytes each time
+copy_mem8x4_4
+ ldr r4, [r0]
+ ldr r5, [r0, #4]
+
+ mov r12, #4
+
+copy_mem8x4_4_loop
+ subs r12, r12, #1
+ add r0, r0, r1
+
+ str r4, [r2]
+ str r5, [r2, #4]
+
+ add r2, r2, r3
+
+ ldrne r4, [r0]
+ ldrne r5, [r0, #4]
+
+ bne copy_mem8x4_4_loop
+
+ ldmia sp!, {r4-r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 8 bytes each time
+copy_mem8x4_fast
+ ;sub r1, r1, #8
+ ;sub r3, r3, #8
+
+ mov r12, #4
+
+copy_mem8x4_fast_loop
+ ldmia r0, {r4-r5}
+ ;ldm r0, {r4-r5}
+ add r0, r0, r1
+
+ subs r12, r12, #1
+ stmia r2, {r4-r5}
+ ;stm r2, {r4-r5}
+ add r2, r2, r3
+
+ bne copy_mem8x4_fast_loop
+
+ ldmia sp!, {r4-r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem8x4_v6|
+
+ END
diff --git a/vp8/common/arm/armv6/copymem8x8_v6.asm b/vp8/common/arm/armv6/copymem8x8_v6.asm
new file mode 100644
index 000000000..7cfa53389
--- /dev/null
+++ b/vp8/common/arm/armv6/copymem8x8_v6.asm
@@ -0,0 +1,127 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem8x8_v6|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x8_v6| PROC
+ ;push {r4-r5}
+ stmdb sp!, {r4-r5}
+
+ ;preload
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ ands r4, r0, #7
+ beq copy_mem8x8_fast
+
+ ands r4, r0, #3
+ beq copy_mem8x8_4
+
+ ;copy 1 byte each time
+ ldrb r4, [r0]
+ ldrb r5, [r0, #1]
+
+ mov r12, #8
+
+copy_mem8x8_1_loop
+ strb r4, [r2]
+ strb r5, [r2, #1]
+
+ ldrb r4, [r0, #2]
+ ldrb r5, [r0, #3]
+
+ subs r12, r12, #1
+
+ strb r4, [r2, #2]
+ strb r5, [r2, #3]
+
+ ldrb r4, [r0, #4]
+ ldrb r5, [r0, #5]
+
+ strb r4, [r2, #4]
+ strb r5, [r2, #5]
+
+ ldrb r4, [r0, #6]
+ ldrb r5, [r0, #7]
+
+ add r0, r0, r1
+
+ strb r4, [r2, #6]
+ strb r5, [r2, #7]
+
+ add r2, r2, r3
+
+ ldrneb r4, [r0]
+ ldrneb r5, [r0, #1]
+
+ bne copy_mem8x8_1_loop
+
+ ldmia sp!, {r4 - r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 4 bytes each time
+copy_mem8x8_4
+ ldr r4, [r0]
+ ldr r5, [r0, #4]
+
+ mov r12, #8
+
+copy_mem8x8_4_loop
+ subs r12, r12, #1
+ add r0, r0, r1
+
+ str r4, [r2]
+ str r5, [r2, #4]
+
+ add r2, r2, r3
+
+ ldrne r4, [r0]
+ ldrne r5, [r0, #4]
+
+ bne copy_mem8x8_4_loop
+
+ ldmia sp!, {r4 - r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 8 bytes each time
+copy_mem8x8_fast
+ ;sub r1, r1, #8
+ ;sub r3, r3, #8
+
+ mov r12, #8
+
+copy_mem8x8_fast_loop
+ ldmia r0, {r4-r5}
+ ;ldm r0, {r4-r5}
+ add r0, r0, r1
+
+ subs r12, r12, #1
+ stmia r2, {r4-r5}
+ ;stm r2, {r4-r5}
+ add r2, r2, r3
+
+ bne copy_mem8x8_fast_loop
+
+ ldmia sp!, {r4-r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem8x8_v6|
+
+ END
diff --git a/vp8/common/arm/armv6/filter_v6.asm b/vp8/common/arm/armv6/filter_v6.asm
new file mode 100644
index 000000000..a7863fc94
--- /dev/null
+++ b/vp8/common/arm/armv6/filter_v6.asm
@@ -0,0 +1,383 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_filter_block2d_first_pass_armv6|
+ EXPORT |vp8_filter_block2d_second_pass_armv6|
+ EXPORT |vp8_filter_block2d_first_pass_only_armv6|
+ EXPORT |vp8_filter_block2d_second_pass_only_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+;-------------------------------------
+; r0 unsigned char *src_ptr
+; r1 short *output_ptr
+; r2 unsigned int src_pixels_per_line
+; r3 unsigned int output_width
+; stack unsigned int output_height
+; stack const short *vp8_filter
+;-------------------------------------
+; Apply the 6-tap FIR filter to the input and put the result in the output array,
+; the output being a 2-byte value and the input being a 1-byte value.
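+;
+; For reference, a minimal C model of this pass (an illustrative sketch, not
+; code from this tree): a 6-tap FIR over src[-2..3], rounded, clamped to
+; 0..255 and stored transposed as 16-bit values with a column stride of
+; output_width + 8 halfwords (the "square off the output" adjustment below).
+;
+;   void sixtap_first_pass_model(const unsigned char *src, short *dst,
+;                                unsigned int src_pixels_per_line,
+;                                unsigned int output_width,
+;                                unsigned int output_height,
+;                                const short *vp8_filter)
+;   {
+;       int i, j, k, sum;
+;       for (i = 0; i < (int)output_height; i++) {
+;           for (j = 0; j < (int)output_width; j++) {
+;               sum = 0;
+;               for (k = -2; k <= 3; k++)
+;                   sum += src[j + k] * vp8_filter[k + 2];
+;               sum = (sum + 64) >> 7;
+;               if (sum < 0) sum = 0;
+;               if (sum > 255) sum = 255;              /* usat #8 below */
+;               dst[j * ((int)output_width + 8) + i] = (short)sum;
+;           }
+;           src += src_pixels_per_line;
+;       }
+;   }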
+|vp8_filter_block2d_first_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r7, [sp, #36] ; output height
+
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
+ add r12, r3, #16 ; square off the output
+ sub sp, sp, #4
+
+ ;;IF ARCHITECTURE=6
+ ;pld [r0, #-2]
+ ;;pld [r0, #30]
+ ;;ENDIF
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r1, [sp] ; push destination to stack
+ mov r7, r7, lsl #16 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_6|
+ ldrb r8, [r0, #-2] ; load source data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+ orr r7, r7, r3, lsr #2 ; construct loop counter
+
+|width_loop_1st_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+ smuad lr, lr, r4 ; apply the filter
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ sub r7, r7, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r11, r10, r6, r8
+
+ ands r10, r7, #0xff ; test loop counter
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r11, r11, #0x40
+ ldrneb r9, [r0, #-1]
+ usat r11, #8, r11, asr #7
+
+ strh lr, [r1], r12 ; result is transposed and stored, which
+ ; will make second pass filtering easier.
+ ldrneb r10, [r0], #2
+ strh r11, [r1], r12
+
+ bne width_loop_1st_6
+
+ ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
+ ;;IF ARCHITECTURE=6
+ ;pld [r0, r2]
+ ;;pld [r0, r9]
+ ;;ENDIF
+
+ ldr r1, [sp] ; load and update dst address
+ subs r7, r7, #0x10000
+ add r0, r0, r2 ; move to next input line
+ add r1, r1, #2 ; move over to next column
+ str r1, [sp]
+
+ bne height_loop_1st_6
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;---------------------------------
+; r0 short *src_ptr,
+; r1 unsigned char *output_ptr,
+; r2 unsigned int output_pitch,
+; r3 unsigned int cnt,
+; stack const short *vp8_filter
+;---------------------------------
+|vp8_filter_block2d_second_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #36] ; vp8_filter address
+ sub sp, sp, #4
+ mov r7, r3, lsl #16 ; height is top part of counter
+ str r1, [sp] ; push destination to stack
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ pkhbt r12, r5, r4 ; pack the filter differently
+ pkhbt r11, r6, r5
+
+ sub r0, r0, #4 ; offset input buffer
+
+|height_loop_2nd|
+ ldr r8, [r0] ; load the data
+ ldr r9, [r0, #4]
+ orr r7, r7, r3, lsr #1 ; loop counter
+
+|width_loop_2nd|
+ smuad lr, r4, r8 ; apply filter
+ sub r7, r7, #1
+ smulbt r8, r4, r8
+
+ ldr r10, [r0, #8]
+
+ smlad lr, r5, r9, lr
+ smladx r8, r12, r9, r8
+
+ ldrh r9, [r0, #12]
+
+ smlad lr, r6, r10, lr
+ smladx r8, r11, r10, r8
+
+ add r0, r0, #4
+ smlatb r10, r6, r9, r8
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ands r8, r7, #0xff
+ usat lr, #8, lr, asr #7
+ add r10, r10, #0x40
+ strb lr, [r1], r2 ; the result is transposed back and stored
+ usat r10, #8, r10, asr #7
+
+ ldrne r8, [r0] ; load data for next loop
+ ldrne r9, [r0, #4]
+ strb r10, [r1], r2
+
+ bne width_loop_2nd
+
+ ldr r1, [sp] ; update dst for next loop
+ subs r7, r7, #0x10000
+ add r0, r0, #16 ; update src for next loop
+ add r1, r1, #1
+ str r1, [sp]
+
+ bne height_loop_2nd
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;------------------------------------
+; r0 unsigned char *src_ptr
+; r1 unsigned char *output_ptr,
+; r2 unsigned int src_pixels_per_line
+; r3 unsigned int cnt,
+; stack unsigned int output_pitch,
+; stack const short *vp8_filter
+;------------------------------------
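+; For reference, a minimal C model of this routine (an illustrative sketch,
+; not code from this tree): the same 6-tap horizontal filter, but rounded,
+; clamped and written straight out as bytes; cnt is both the width and the
+; height of the (square) block.
+;
+;   void sixtap_first_pass_only_model(const unsigned char *src,
+;                                     unsigned char *dst,
+;                                     unsigned int src_pixels_per_line,
+;                                     unsigned int cnt,
+;                                     unsigned int output_pitch,
+;                                     const short *vp8_filter)
+;   {
+;       int i, j, k, sum;
+;       for (i = 0; i < (int)cnt; i++) {
+;           for (j = 0; j < (int)cnt; j++) {
+;               sum = 64;                  /* rounding folded in, as with r2 below */
+;               for (k = -2; k <= 3; k++)
+;                   sum += src[j + k] * vp8_filter[k + 2];
+;               sum >>= 7;
+;               if (sum < 0) sum = 0;
+;               if (sum > 255) sum = 255;
+;               dst[j] = (unsigned char)sum;
+;           }
+;           src += src_pixels_per_line;
+;           dst += output_pitch;
+;       }
+;   }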
+|vp8_filter_block2d_first_pass_only_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r4, [sp, #36] ; output pitch
+ ldr r11, [sp, #40] ; HFilter address
+ sub sp, sp, #8
+
+ mov r7, r3
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ sub r4, r4, r3
+ str r4, [sp] ; save modified output pitch
+ str r2, [sp, #4]
+
+ mov r2, #0x40
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+; six tap filter
+|height_loop_1st_only_6|
+ ldrb r8, [r0, #-2] ; load data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+
+ mov r12, r3, lsr #1 ; loop counter
+
+|width_loop_1st_only_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+;; smuad lr, lr, r4
+ smlad lr, lr, r4, r2
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+;; smuad r8, r8, r4
+ smlad r8, r8, r4, r2
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ subs r12, r12, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r10, r10, r6, r8
+
+;; add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+;; add r10, r10, #0x40
+ strb lr, [r1], #1 ; store the result
+ usat r10, #8, r10, asr #7
+
+ ldrneb r9, [r0, #-1]
+ strb r10, [r1], #1
+ ldrneb r10, [r0], #2
+
+ bne width_loop_1st_only_6
+
+ ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
+ ;;IF ARCHITECTURE=6
+ ;pld [r0, r2]
+ ;;pld [r0, r9]
+ ;;ENDIF
+
+ ldr lr, [sp] ; load back output pitch
+ ldr r12, [sp, #4] ; load back adjusted src pitch
+ subs r7, r7, #1
+ add r0, r0, r12 ; update src for next loop
+ add r1, r1, lr ; update dst for next loop
+
+ bne height_loop_1st_only_6
+
+ add sp, sp, #8
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_filter_block2d_first_pass_only_armv6|
+
+
+;------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 unsigned char *output_ptr,
+; r2 unsigned int src_pixels_per_line
+; r3 unsigned int cnt,
+; stack unsigned int output_pitch,
+; stack const short *vp8_filter
+;------------------------------------
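+; For reference, a minimal C model of this routine (an illustrative sketch,
+; not code from this tree): the 6-tap filter applied vertically, one column
+; per outer iteration, starting two source rows above the pointer just as
+; the code below does.
+;
+;   void sixtap_second_pass_only_model(const unsigned char *src,
+;                                      unsigned char *dst,
+;                                      unsigned int src_pixels_per_line,
+;                                      unsigned int cnt,
+;                                      unsigned int output_pitch,
+;                                      const short *vp8_filter)
+;   {
+;       int i, j, k, sum, stride = (int)src_pixels_per_line;
+;       for (j = 0; j < (int)cnt; j++) {
+;           for (i = 0; i < (int)cnt; i++) {
+;               sum = 0;
+;               for (k = -2; k <= 3; k++)
+;                   sum += src[(i + k) * stride + j] * vp8_filter[k + 2];
+;               sum = (sum + 64) >> 7;
+;               if (sum < 0) sum = 0;
+;               if (sum > 255) sum = 255;
+;               dst[i * (int)output_pitch + j] = (unsigned char)sum;
+;           }
+;       }
+;   }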
+|vp8_filter_block2d_second_pass_only_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; VFilter address
+ ldr r12, [sp, #36] ; output pitch
+
+ mov r7, r3, lsl #16 ; height is top part of counter
+ sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after
+
+ sub sp, sp, #8
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r0, [sp] ; save r0 to stack
+ str r1, [sp, #4] ; save dst to stack
+
+; six tap filter
+|width_loop_2nd_only_6|
+ ldrb r8, [r0], r2 ; load data
+ orr r7, r7, r3 ; loop counter
+ ldrb r9, [r0], r2
+ ldrb r10, [r0], r2
+
+|height_loop_2nd_only_6|
+ ; filter the first column in this inner loop, then move to the next column.
+ ldrb r11, [r0], r2
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0], r2
+
+ smuad lr, lr, r4
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0], r2
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0]
+
+ sub r7, r7, #2
+ sub r0, r0, r2, lsl #2
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r10, r10, r6, r8
+
+ ands r9, r7, #0xff
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0], r2 ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r10, r10, #0x40
+ strb lr, [r1], r12 ; store the result for the column
+ usat r10, #8, r10, asr #7
+
+ ldrneb r9, [r0], r2
+ strb r10, [r1], r12
+ ldrneb r10, [r0], r2
+
+ bne height_loop_2nd_only_6
+
+ ldr r0, [sp]
+ ldr r1, [sp, #4]
+ subs r7, r7, #0x10000
+ add r0, r0, #1 ; move to filter next column
+ str r0, [sp]
+ add r1, r1, #1
+ str r1, [sp, #4]
+
+ bne width_loop_2nd_only_6
+
+ add sp, sp, #8
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_filter_block2d_second_pass_only_armv6|
+
+ END
diff --git a/vp8/common/arm/armv6/idct_v6.asm b/vp8/common/arm/armv6/idct_v6.asm
new file mode 100644
index 000000000..25c5165ec
--- /dev/null
+++ b/vp8/common/arm/armv6/idct_v6.asm
@@ -0,0 +1,376 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14
+ EXPORT |vp8_short_idct4x4llm_1_v6|
+ EXPORT |vp8_short_idct4x4llm_v6|
+ EXPORT |vp8_short_idct4x4llm_v6_scott|
+ EXPORT |vp8_short_idct4x4llm_v6_dual|
+
+ EXPORT |vp8_dc_only_idct_armv6|
+
+ AREA |.text|, CODE, READONLY
+
+;********************************************************************************
+;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
+;* r0 INT16 * input
+;* r1 INT16 * output
+;* r2 INT32 pitch
+;* bench: 3/5
+;********************************************************************************
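+;
+; For reference, a minimal C model (an illustrative sketch, not code from
+; this tree): only the DC coefficient is used, and the whole 4x4 block is
+; filled with (input[0] + 4) >> 3.  pitch is a byte offset, as in the strd
+; addressing below.
+;
+;   void idct4x4llm_1_model(const short *input, short *output, int pitch)
+;   {
+;       int i, j;
+;       short a1 = (short)((input[0] + 4) >> 3);
+;       for (i = 0; i < 4; i++) {
+;           short *op = (short *)((char *)output + i * pitch);
+;           for (j = 0; j < 4; j++)
+;               op[j] = a1;
+;       }
+;   }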
+
+|vp8_short_idct4x4llm_1_v6| PROC ; cycles in out pit
+ ;
+ ldrsh r0, [r0] ; load input[0] 1, r0 un 2
+ add r0, r0, #4 ; 1 +4
+ stmdb sp!, {r4, r5, lr} ; make room for wide writes 1 backup
+ mov r0, r0, asr #3 ; (input[0] + 4) >> 3 1, r0 req`d ^1 >> 3
+ pkhbt r4, r0, r0, lsl #16 ; pack r0 into r4 1, r0 req`d ^1 pack
+ mov r5, r4 ; expand expand
+
+ strd r4, [r1], r2 ; *output = r0, post inc 1
+ strd r4, [r1], r2 ; 1
+ strd r4, [r1], r2 ; 1
+ strd r4, [r1] ; 1
+ ;
+ ldmia sp!, {r4, r5, pc} ; replace vars, return restore
+ ENDP ; |vp8_short_idct4x4llm_1_v6|
+;********************************************************************************
+;********************************************************************************
+;********************************************************************************
+
+;********************************************************************************
+;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
+;* r0 INT16 * input
+;* r1 INT16 * output
+;* r2 INT32 pitch
+;* bench:
+;********************************************************************************
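+;
+; For reference, the 1-D transform applied by both loops below (columns
+; first, then rows with a +4 round and >>3) is, in C (an illustrative
+; sketch, not code from this tree); the constants match the 0x4E7B and
+; 0x8A8C values loaded at the top of the routine:
+;
+;   static const int cospi8sqrt2minus1 = 20091;   /* 0x4E7B */
+;   static const int sinpi8sqrt2       = 35468;   /* 0x8A8C */
+;
+;   static void idct4_1d_model(int x0, int x1, int x2, int x3,
+;                              int *o0, int *o1, int *o2, int *o3)
+;   {
+;       int a1, b1, c1, d1, temp1, temp2;
+;       a1 = x0 + x2;
+;       b1 = x0 - x2;
+;       temp1 = (x1 * sinpi8sqrt2) >> 16;
+;       temp2 = x3 + ((x3 * cospi8sqrt2minus1) >> 16);
+;       c1 = temp1 - temp2;
+;       temp1 = x1 + ((x1 * cospi8sqrt2minus1) >> 16);
+;       temp2 = (x3 * sinpi8sqrt2) >> 16;
+;       d1 = temp1 + temp2;
+;       *o0 = a1 + d1;
+;       *o1 = b1 + c1;
+;       *o2 = b1 - c1;
+;       *o3 = a1 - d1;
+;   }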
+
+|vp8_short_idct4x4llm_v6| PROC ; cycles in out pit
+ ;
+ stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
+ ;
+ mov r4, #0x00004E00 ; 1 cst
+ orr r4, r4, #0x0000007B ; cospi8sqrt2minus1
+ mov r5, #0x00008A00 ; 1 cst
+ orr r5, r5, #0x0000008C ; sinpi8sqrt2
+ ;
+ mov r6, #4 ; i=4 1 i
+loop1 ;
+ ldrsh r12, [r0, #8] ; input[4] 1, r12 unavail 2 [4]
+ ldrsh r3, [r0, #24] ; input[12] 1, r3 unavail 2 [12]
+ ldrsh r8, [r0, #16] ; input[8] 1, r8 unavail 2 [8]
+ ldrsh r7, [r0], #0x2 ; input[0] 1, r7 unavail 2 ++ [0]
+ smulwb r10, r5, r12 ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1 t1
+ smulwb r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16 1, r11 un 2, r3/r4 ^1 t2
+ add r9, r7, r8 ; a1 = [0] + [8] 1 a1
+ sub r7, r7, r8 ; b1 = [0] - [8] 1 b1
+ add r11, r3, r11 ; temp2 1
+ rsb r11, r11, r10 ; c1 = temp1 - temp2 1 c1
+ smulwb r3, r5, r3 ; ([12] * sinpi8sqrt2) >> 16 1, r3 un 2, r3/r5 ^ 1 t2
+ smulwb r10, r4, r12 ; ([4] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r12/r4 ^1 t1
+ add r8, r7, r11 ; b1 + c1 1 b+c
+ strh r8, [r1, r2] ; out[pitch] = b1+c1 1
+ sub r7, r7, r11 ; b1 - c1 1 b-c
+ add r10, r12, r10 ; temp1 1
+ add r3, r10, r3 ; d1 = temp1 + temp2 1 d1
+ add r10, r9, r3 ; a1 + d1 1 a+d
+ sub r3, r9, r3 ; a1 - d1 1 a-d
+ add r8, r2, r2 ; pitch * 2 1 p*2
+ strh r7, [r1, r8] ; out[pitch*2] = b1-c1 1
+ add r7, r2, r2, lsl #1 ; pitch * 3 1 p*3
+ strh r3, [r1, r7] ; out[pitch*3] = a1-d1 1
+ subs r6, r6, #1 ; i-- 1 --
+ strh r10, [r1], #0x2 ; out[0] = a1+d1 1 ++
+ bne loop1 ; if i>0, continue
+ ;
+ sub r1, r1, #8 ; set up out for next loop 1 -4
+ ; for this iteration, input=prev output
+ mov r6, #4 ; i=4 1 i
+; b returnfull
+loop2 ;
+ ldrsh r11, [r1, #2] ; input[1] 1, r11 un 2 [1]
+ ldrsh r8, [r1, #6] ; input[3] 1, r8 un 2 [3]
+ ldrsh r3, [r1, #4] ; input[2] 1, r3 un 2 [2]
+ ldrsh r0, [r1] ; input[0] 1, r0 un 2 [0]
+ smulwb r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1 t1
+ smulwb r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r8 ^1 t2
+ add r7, r0, r3 ; a1 = [0] + [2] 1 a1
+ sub r0, r0, r3 ; b1 = [0] - [2] 1 b1
+ add r10, r8, r10 ; temp2 1
+ rsb r9, r10, r9 ; c1 = temp1 - temp2 1 c1
+ smulwb r8, r5, r8 ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1 t2
+ smulwb r10, r4, r11 ; ([1] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r11 ^1 t1
+ add r3, r0, r9 ; b1+c1 1 b+c
+ add r3, r3, #4 ; b1+c1+4 1 +4
+ add r10, r11, r10 ; temp1 1
+ mov r3, r3, asr #3 ; b1+c1+4 >> 3 1, r3 ^1 >>3
+ strh r3, [r1, #2] ; out[1] = b1+c1 1
+ add r10, r10, r8 ; d1 = temp1 + temp2 1 d1
+ add r3, r7, r10 ; a1+d1 1 a+d
+ add r3, r3, #4 ; a1+d1+4 1 +4
+ sub r7, r7, r10 ; a1-d1 1 a-d
+ add r7, r7, #4 ; a1-d1+4 1 +4
+ mov r3, r3, asr #3 ; a1+d1+4 >> 3 1, r3 ^1 >>3
+ mov r7, r7, asr #3 ; a1-d1+4 >> 3 1, r7 ^1 >>3
+ strh r7, [r1, #6] ; out[3] = a1-d1 1
+ sub r0, r0, r9 ; b1-c1 1 b-c
+ add r0, r0, #4 ; b1-c1+4 1 +4
+ subs r6, r6, #1 ; i-- 1 --
+ mov r0, r0, asr #3 ; b1-c1+4 >> 3 1, r0 ^1 >>3
+ strh r0, [r1, #4] ; out[2] = b1-c1 1
+ strh r3, [r1], r2 ; out[0] = a1+d1 1
+; add r1, r1, r2 ; out += pitch 1 ++
+ bne loop2 ; if i>0, continue
+returnfull ;
+ ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
+ ENDP
+
+;********************************************************************************
+;********************************************************************************
+;********************************************************************************
+
+;********************************************************************************
+;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
+;* r0 INT16 * input
+;* r1 INT16 * output
+;* r2 INT32 pitch
+;* bench:
+;********************************************************************************
+
+|vp8_short_idct4x4llm_v6_scott| PROC ; cycles in out pit
+; mov r0, #0 ;
+; ldr r0, [r0] ;
+ stmdb sp!, {r4 - r11, lr} ; backup registers 1 backup
+ ;
+ mov r3, #0x00004E00 ; cos
+ orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
+ mov r4, #0x00008A00 ; sin
+ orr r4, r4, #0x0000008C ; sinpi8sqrt2
+ ;
+ mov r5, #0x2 ; i i
+ ;
+short_idct4x4llm_v6_scott_loop1 ;
+ ldr r10, [r0, #(4*2)] ; i5 | i4 5,4
+ ldr r11, [r0, #(12*2)] ; i13 | i12 13,12
+ ;
+ smulwb r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16) lt1
+ smulwb r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16) lt2
+ ;
+ smulwb r12, r3, r10 ; ((ip[4] * cospi8sqrt2minus1) >> 16) l2t2
+ smulwb r14, r4, r11 ; ((ip[12] * sinpi8sqrt2) >> 16) l2t1
+ ;
+ add r6, r6, r7 ; partial c1 lt1-lt2
+ add r12, r12, r14 ; partial d1 l2t2+l2t1
+ ;
+ smulwt r14, r4, r10 ; ((ip[5] * sinpi8sqrt2) >> 16) ht1
+ smulwt r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16) ht2
+ ;
+ smulwt r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16) h2t1
+ smulwt r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16) h2t2
+ ;
+ add r7, r14, r7 ; partial c1_2 ht1+ht2
+ sub r8, r8, r9 ; partial d1_2 h2t1-h2t2
+ ;
+ pkhbt r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1 pack
+ pkhbt r12, r12, r8, lsl #16 ; partial d1_2 | partial d1_1 pack
+ ;
+ usub16 r6, r6, r10 ; c1_2 | c1_1 c
+ uadd16 r12, r12, r11 ; d1_2 | d1_1 d
+ ;
+ ldr r10, [r0, #0] ; i1 | i0 1,0
+ ldr r11, [r0, #(8*2)] ; i9 | i8 9,8
+ ;
+;;;;;; add r0, r0, #0x4 ; +4
+;;;;;; add r1, r1, #0x4 ; +4
+ ;
+ uadd16 r8, r10, r11 ; i1 + i9 | i0 + i8 aka a1 a
+ usub16 r9, r10, r11 ; i1 - i9 | i0 - i8 aka b1 b
+ ;
+ uadd16 r7, r8, r12 ; a1 + d1 pair a+d
+ usub16 r14, r8, r12 ; a1 - d1 pair a-d
+ ;
+ str r7, [r1] ; op[0] = a1 + d1
+ str r14, [r1, r2] ; op[pitch*3] = a1 - d1
+ ;
+ add r0, r0, #0x4 ; op[pitch] = b1 + c1 ++
+ add r1, r1, #0x4 ; op[pitch*2] = b1 - c1 ++
+ ;
+ subs r5, r5, #0x1 ; --
+ bne short_idct4x4llm_v6_scott_loop1 ;
+ ;
+ sub r1, r1, #16 ; reset output ptr
+ mov r5, #0x4 ;
+ mov r0, r1 ; input = output
+ ;
+short_idct4x4llm_v6_scott_loop2 ;
+ ;
+ subs r5, r5, #0x1 ;
+ bne short_idct4x4llm_v6_scott_loop2 ;
+ ;
+ ldmia sp!, {r4 - r11, pc} ;
+ ENDP ;
+ ;
+;********************************************************************************
+;********************************************************************************
+;********************************************************************************
+
+;********************************************************************************
+;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
+;* r0 INT16 * input
+;* r1 INT16 * output
+;* r2 INT32 pitch
+;* bench:
+;********************************************************************************
+
+|vp8_short_idct4x4llm_v6_dual| PROC ; cycles in out pit
+ ;
+ stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
+ mov r3, #0x00004E00 ; cos
+ orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
+ mov r4, #0x00008A00 ; sin
+ orr r4, r4, #0x0000008C ; sinpi8sqrt2
+ mov r5, #0x2 ; i=2 i
+loop1_dual
+ ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
+ ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
+ ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
+
+ smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
+ smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
+ smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
+ smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
+ pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
+ smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
+ pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
+ uadd16 r6, r6, r7 ; 5c+5 | 4c+4
+ smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
+ smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
+ smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
+ subs r5, r5, #0x1 ; i-- --
+ pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
+ ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
+ pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
+ uadd16 r7, r12, r9 ; 13c+13 | 12c+12
+ usub16 r7, r8, r7 ; c c
+ uadd16 r6, r6, r10 ; d d
+ uadd16 r10, r11, r14 ; a a
+ usub16 r8, r11, r14 ; b b
+ uadd16 r9, r10, r6 ; a+d a+d
+ usub16 r10, r10, r6 ; a-d a-d
+ uadd16 r6, r8, r7 ; b+c b+c
+ usub16 r7, r8, r7 ; b-c b-c
+ str r6, [r1, r2] ; o5 | o4
+ add r6, r2, r2 ; pitch * 2 p2
+ str r7, [r1, r6] ; o9 | o8
+ add r6, r6, r2 ; pitch * 3 p3
+ str r10, [r1, r6] ; o13 | o12
+ str r9, [r1], #0x4 ; o1 | o0 ++
+ bne loop1_dual ;
+ mov r5, #0x2 ; i=2 i
+ sub r0, r1, #8 ; reset input/output i/o
+loop2_dual
+ ldr r6, [r0, r2] ; i5 | i4 5|4
+ ldr r1, [r0] ; i1 | i0 1|0
+ ldr r12, [r0, #0x4] ; i3 | i2 3|2
+ add r14, r2, #0x4 ; pitch + 2 p+2
+ ldr r14, [r0, r14] ; i7 | i6 7|6
+ smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
+ smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
+ smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
+ smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
+ pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
+ pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
+ pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 (c) tc1
+ pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
+ uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
+ pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
+ uadd16 r10, r11, r9 ; a a
+ usub16 r9, r11, r9 ; b b
+ pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
+ subs r5, r5, #0x1 ; i-- --
+ smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
+ smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
+ smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
+ smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
+
+ pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
+ pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
+ uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
+ usub16 r12, r8, r6 ; c (o1 | o5) c
+ uadd16 r6, r11, r1 ; d (o3 | o7) d
+ uadd16 r7, r10, r6 ; a+d a+d
+ mov r8, #0x4 ; set up 4's 4
+ orr r8, r8, #0x40000 ; 4|4
+ usub16 r6, r10, r6 ; a-d a-d
+ uadd16 r6, r6, r8 ; a-d+4 3|7
+ uadd16 r7, r7, r8 ; a+d+4 0|4
+ uadd16 r10, r9, r12 ; b+c b+c
+ usub16 r1, r9, r12 ; b-c b-c
+ uadd16 r10, r10, r8 ; b+c+4 1|5
+ uadd16 r1, r1, r8 ; b-c+4 2|6
+ mov r8, r10, asr #19 ; o1 >> 3
+ strh r8, [r0, #2] ; o1
+ mov r8, r1, asr #19 ; o2 >> 3
+ strh r8, [r0, #4] ; o2
+ mov r8, r6, asr #19 ; o3 >> 3
+ strh r8, [r0, #6] ; o3
+ mov r8, r7, asr #19 ; o0 >> 3
+ strh r8, [r0], r2 ; o0 +p
+ sxth r10, r10 ;
+ mov r8, r10, asr #3 ; o5 >> 3
+ strh r8, [r0, #2] ; o5
+ sxth r1, r1 ;
+ mov r8, r1, asr #3 ; o6 >> 3
+ strh r8, [r0, #4] ; o6
+ sxth r6, r6 ;
+ mov r8, r6, asr #3 ; o7 >> 3
+ strh r8, [r0, #6] ; o7
+ sxth r7, r7 ;
+ mov r8, r7, asr #3 ; o4 >> 3
+ strh r8, [r0], r2 ; o4 +p
+;;;;; subs r5, r5, #0x1 ; i-- --
+ bne loop2_dual ;
+ ;
+ ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
+ ENDP
+
+
+; sjl added 10/17/08
+;void dc_only_idct_armv6(short input_dc, short *output, int pitch)
+|vp8_dc_only_idct_armv6| PROC
+ stmdb sp!, {r4 - r6, lr}
+
+ add r0, r0, #0x4
+ add r4, r1, r2 ; output + shortpitch
+ mov r0, r0, ASR #0x3 ;aka a1
+ add r5, r1, r2, LSL #1 ; output + shortpitch * 2
+ pkhbt r0, r0, r0, lsl #16 ; a1 | a1
+ add r6, r5, r2 ; output + shortpitch * 3
+
+ str r0, [r1, #0]
+ str r0, [r1, #4]
+
+ str r0, [r4, #0]
+ str r0, [r4, #4]
+
+ str r0, [r5, #0]
+ str r0, [r5, #4]
+
+ str r0, [r6, #0]
+ str r0, [r6, #4]
+
+
+ ldmia sp!, {r4 - r6, pc}
+
+ ENDP ; |vp8_dc_only_idct_armv6|
+
+ END
diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm
new file mode 100644
index 000000000..87475681f
--- /dev/null
+++ b/vp8/common/arm/armv6/iwalsh_v6.asm
@@ -0,0 +1,151 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+ EXPORT |vp8_short_inv_walsh4x4_armv6|
+ EXPORT |vp8_short_inv_walsh4x4_1_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;short vp8_short_inv_walsh4x4_armv6(short *input, short *output)
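+;
+; For reference, a minimal C model (an illustrative sketch, not code from
+; this tree); the code below performs the same two passes four values at a
+; time with the packed qadd16/qsub16/qaddsubx arithmetic.
+;
+;   void inv_walsh4x4_model(const short *ip, short *op)
+;   {
+;       int i, a1, b1, c1, d1;
+;       int tmp[16];
+;       for (i = 0; i < 4; i++) {                      /* columns */
+;           a1 = ip[i] + ip[12 + i];
+;           b1 = ip[4 + i] + ip[8 + i];
+;           c1 = ip[4 + i] - ip[8 + i];
+;           d1 = ip[i] - ip[12 + i];
+;           tmp[i]      = a1 + b1;
+;           tmp[4 + i]  = c1 + d1;
+;           tmp[8 + i]  = a1 - b1;
+;           tmp[12 + i] = d1 - c1;
+;       }
+;       for (i = 0; i < 4; i++) {                      /* rows */
+;           a1 = tmp[4 * i]     + tmp[4 * i + 3];
+;           b1 = tmp[4 * i + 1] + tmp[4 * i + 2];
+;           c1 = tmp[4 * i + 1] - tmp[4 * i + 2];
+;           d1 = tmp[4 * i]     - tmp[4 * i + 3];
+;           op[4 * i]     = (short)((a1 + b1 + 3) >> 3);
+;           op[4 * i + 1] = (short)((c1 + d1 + 3) >> 3);
+;           op[4 * i + 2] = (short)((a1 - b1 + 3) >> 3);
+;           op[4 * i + 3] = (short)((d1 - c1 + 3) >> 3);
+;       }
+;   }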
+|vp8_short_inv_walsh4x4_armv6| PROC
+
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r2, [r0], #4 ; [1 | 0]
+ ldr r3, [r0], #4 ; [3 | 2]
+ ldr r4, [r0], #4 ; [5 | 4]
+ ldr r5, [r0], #4 ; [7 | 6]
+ ldr r6, [r0], #4 ; [9 | 8]
+ ldr r7, [r0], #4 ; [11 | 10]
+ ldr r8, [r0], #4 ; [13 | 12]
+ ldr r9, [r0] ; [15 | 14]
+
+ qadd16 r10, r2, r8 ; a1 [1+13 | 0+12]
+ qadd16 r11, r4, r6 ; b1 [5+9 | 4+8]
+ qsub16 r12, r4, r6 ; c1 [5-9 | 4-8]
+ qsub16 lr, r2, r8 ; d1 [1-13 | 0-12]
+
+ qadd16 r2, r10, r11 ; a1 + b1 [1 | 0]
+ qadd16 r4, r12, lr ; c1 + d1 [5 | 4]
+ qsub16 r6, r10, r11 ; a1 - b1 [9 | 8]
+ qsub16 r8, lr, r12 ; d1 - c1 [13 | 12]
+
+ qadd16 r10, r3, r9 ; a1 [3+15 | 2+14]
+ qadd16 r11, r5, r7 ; b1 [7+11 | 6+10]
+ qsub16 r12, r5, r7 ; c1 [7-11 | 6-10]
+ qsub16 lr, r3, r9 ; d1 [3-15 | 2-14]
+
+ qadd16 r3, r10, r11 ; a1 + b1 [3 | 2]
+ qadd16 r5, r12, lr ; c1 + d1 [7 | 6]
+ qsub16 r7, r10, r11 ; a1 - b1 [11 | 10]
+ qsub16 r9, lr, r12 ; d1 - c1 [15 | 14]
+
+ ; first transform complete
+
+ qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3]
+ qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3]
+ qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7]
+ qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7]
+
+ qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1]
+ ldr r10, c0x00030003
+ qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1]
+
+ qadd16 r2, r2, r10 ; [b2+3|c2+3]
+ qadd16 r3, r3, r10 ; [a2+3|d2+3]
+ qadd16 r4, r4, r10 ; [b2+3|c2+3]
+ qadd16 r5, r5, r10 ; [a2+3|d2+3]
+
+ asr r12, r2, #3 ; [1 | x]
+ pkhtb r12, r12, r3, asr #19; [1 | 0]
+ lsl lr, r3, #16 ; [~3 | x]
+ lsl r2, r2, #16 ; [~2 | x]
+ asr lr, lr, #3 ; [3 | x]
+ pkhtb lr, lr, r2, asr #19 ; [3 | 2]
+
+ asr r2, r4, #3 ; [5 | x]
+ pkhtb r2, r2, r5, asr #19 ; [5 | 4]
+ lsl r3, r5, #16 ; [~7 | x]
+ lsl r4, r4, #16 ; [~6 | x]
+ asr r3, r3, #3 ; [7 | x]
+ pkhtb r3, r3, r4, asr #19 ; [7 | 6]
+
+ str r12, [r1], #4
+ str lr, [r1], #4
+ str r2, [r1], #4
+ str r3, [r1], #4
+
+ qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11]
+ qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11]
+ qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15]
+ qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15]
+
+ qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1]
+ qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1]
+
+ qadd16 r6, r6, r10 ; [b2+3|c2+3]
+ qadd16 r7, r7, r10 ; [a2+3|d2+3]
+ qadd16 r8, r8, r10 ; [b2+3|c2+3]
+ qadd16 r9, r9, r10 ; [a2+3|d2+3]
+
+ asr r2, r6, #3 ; [9 | x]
+ pkhtb r2, r2, r7, asr #19 ; [9 | 8]
+ lsl r3, r7, #16 ; [~11| x]
+ lsl r4, r6, #16 ; [~10| x]
+ asr r3, r3, #3 ; [11 | x]
+ pkhtb r3, r3, r4, asr #19 ; [11 | 10]
+
+ asr r4, r8, #3 ; [13 | x]
+ pkhtb r4, r4, r9, asr #19 ; [13 | 12]
+ lsl r5, r9, #16 ; [~15| x]
+ lsl r6, r8, #16 ; [~14| x]
+ asr r5, r5, #3 ; [15 | x]
+ pkhtb r5, r5, r6, asr #19 ; [15 | 14]
+
+ str r2, [r1], #4
+ str r3, [r1], #4
+ str r4, [r1], #4
+ str r5, [r1]
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_short_inv_walsh4x4_armv6|
+
+
+;short vp8_short_inv_walsh4x4_1_armv6(short *input, short *output)
+|vp8_short_inv_walsh4x4_1_armv6| PROC
+
+ ldrsh r2, [r0] ; [0]
+ add r2, r2, #3 ; [0] + 3
+ asr r2, r2, #3 ; a1 ([0]+3) >> 3
+ lsl r2, r2, #16 ; [a1 | x]
+ orr r2, r2, r2, lsr #16 ; [a1 | a1]
+
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1]
+
+ bx lr
+ ENDP ; |vp8_short_inv_walsh4x4_1_armv6|
+
+; Constant Pool
+c0x00030003 DCD 0x00030003
+ END
diff --git a/vp8/common/arm/armv6/loopfilter_v6.asm b/vp8/common/arm/armv6/loopfilter_v6.asm
new file mode 100644
index 000000000..c2b02dc0a
--- /dev/null
+++ b/vp8/common/arm/armv6/loopfilter_v6.asm
@@ -0,0 +1,1263 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_horizontal_edge_armv6|
+ EXPORT |vp8_mbloop_filter_horizontal_edge_armv6|
+ EXPORT |vp8_loop_filter_vertical_edge_armv6|
+ EXPORT |vp8_mbloop_filter_vertical_edge_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+ MACRO
+ TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+ ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+ ; a0: 03 02 01 00
+ ; a1: 13 12 11 10
+ ; a2: 23 22 21 20
+ ; a3: 33 32 31 30
+ ; b3 b2 b1 b0
+
+ uxtb16 $b1, $a1 ; xx 12 xx 10
+ uxtb16 $b0, $a0 ; xx 02 xx 00
+ uxtb16 $b3, $a3 ; xx 32 xx 30
+ uxtb16 $b2, $a2 ; xx 22 xx 20
+ orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
+ orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
+
+ uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
+ uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
+ uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
+ uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
+ orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
+ orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
+
+ pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
+ pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
+
+ pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
+ pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
+ MEND
+
+
+src RN r0
+pstep RN r1
+count RN r5
+
+;r0 unsigned char *src_ptr,
+;r1 int src_pixel_step,
+;r2 const char *flimit,
+;r3 const char *limit,
+;stack const char *thresh,
+;stack int count
+
+;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
+;for flimit. The same applies to limit and thresh.
+
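+; For reference, the per-pixel mask evaluated at the top of each filter below
+; is, in C (an illustrative sketch, not code from this tree; abs() is from
+; <stdlib.h>).  The SIMD code computes it for four pixels at once with
+; uqsub8/orr and converts it to a 0x00/0xFF byte mask via usub8/sel.
+;
+;   static int filter_mask_model(int limit, int flimit,
+;                                int p3, int p2, int p1, int p0,
+;                                int q0, int q1, int q2, int q3)
+;   {
+;       return abs(p3 - p2) <= limit && abs(p2 - p1) <= limit
+;           && abs(p1 - p0) <= limit && abs(q1 - q0) <= limit
+;           && abs(q2 - q1) <= limit && abs(q3 - q2) <= limit
+;           && abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit;
+;   }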
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r6, [sp, #36] ; load thresh address
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r9, [src], pstep ; p3
+ ldr r4, [r2], #4 ; flimit
+ ldr r10, [src], pstep ; p2
+ ldr r2, [r3], #4 ; limit
+ ldr r11, [src], pstep ; p1
+ uadd8 r4, r4, r4 ; flimit * 2
+ ldr r3, [r6], #4 ; thresh
+ mov count, count, lsl #1 ; 4-in-parallel
+ uadd8 r4, r4, r2 ; flimit * 2 + limit
+
+|Hnext8|
+ ; vp8_filter_mask() function
+ ; calculate breakout conditions
+ ldr r12, [src], pstep ; p0
+
+ uqsub8 r6, r9, r10 ; p3 - p2
+ uqsub8 r7, r10, r9 ; p2 - p3
+ uqsub8 r8, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+
+ orr r6, r6, r7 ; abs (p3-p2)
+ orr r8, r8, r10 ; abs (p2-p1)
+ uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask
+ uqsub8 r8, r8, r2 ; compare to limit
+ uqsub8 r6, r11, r12 ; p1 - p0
+ orr lr, lr, r8
+ uqsub8 r7, r12, r11 ; p0 - p1
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src], pstep ; q1
+ orr r6, r6, r7 ; abs (p1-p0)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r10 ; p1 - q1
+ uqsub8 r7, r10, r11 ; q1 - p1
+ uqsub8 r11, r12, r9 ; p0 - q0
+ uqsub8 r12, r9, r12 ; q0 - p0
+ orr r6, r6, r7 ; abs (p1-q1)
+ ldr r7, c0x7F7F7F7F
+ orr r12, r11, r12 ; abs (p0-q0)
+ ldr r11, [src], pstep ; q2
+ uqadd8 r12, r12, r12 ; abs (p0-q0) * 2
+ and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r7, r9, r10 ; q0 - q1
+ uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r6, r10, r9 ; q1 - q0
+ uqsub8 r12, r12, r4 ; compare to flimit
+ uqsub8 r9, r11, r10 ; q2 - q1
+
+ orr lr, lr, r12
+
+ ldr r12, [src], pstep ; q3
+ uqsub8 r10, r10, r11 ; q1 - q2
+ orr r6, r7, r6 ; abs (q1-q0)
+ orr r10, r9, r10 ; abs (q2-q1)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r10, r10, r2 ; compare to limit
+ uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later
+ orr lr, lr, r7
+ orr lr, lr, r10
+
+ uqsub8 r10, r12, r11 ; q3 - q2
+ uqsub8 r9, r11, r12 ; q2 - q3
+
+ mvn r11, #0 ; r11 == -1
+
+ orr r10, r10, r9 ; abs (q3-q2)
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ mov r12, #0
+ orr lr, lr, r10
+ sub src, src, pstep, lsl #2
+
+ usub8 lr, r12, lr ; use usub8 instead of ssub8
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq hskip_filter ; skip filtering
+
+ sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+ orr r10, r6, r8 ; calculate vp8_hevmask
+
+ ldr r7, [src], pstep ; p1
+
+ usub8 r10, r12, r10 ; use usub8 instead of ssub8
+ sel r6, r12, r11 ; obtain vp8_hevmask: r6
+
+ ;vp8_filter() function
+ ldr r8, [src], pstep ; p0
+ ldr r12, c0x80808080
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src], pstep ; q1
+
+ eor r7, r7, r12 ; p1 offset to convert to a signed value
+ eor r8, r8, r12 ; p0 offset to convert to a signed value
+ eor r9, r9, r12 ; q0 offset to convert to a signed value
+ eor r10, r10, r12 ; q1 offset to convert to a signed value
+
+ str r9, [sp] ; store qs0 temporarily
+ str r8, [sp, #4] ; store ps0 temporarily
+ str r10, [sp, #8] ; store qs1 temporarily
+ str r7, [sp, #12] ; store ps1 temporarily
+
+ qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
+ qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+
+ and r7, r7, r6 ; vp8_filter (r7) &= hev
+
+ qadd8 r7, r7, r8
+ ldr r9, c0x03030303 ; r9 = 3 --modified for vp8
+
+ qadd8 r7, r7, r8
+ ldr r10, c0x04040404
+
+ qadd8 r7, r7, r8
+ and r7, r7, lr ; vp8_filter &= mask;
+
+ ;modify code for vp8 -- Filter1 = vp8_filter (r7)
+ qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+ qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
+
+ mov r9, #0
+ shadd8 r8 , r8 , r9 ; Filter2 >>= 3
+ shadd8 r7 , r7 , r9 ; vp8_filter >>= 3
+ shadd8 r8 , r8 , r9
+ shadd8 r7 , r7 , r9
+ shadd8 lr , r8 , r9 ; lr: Filter2
+ shadd8 r7 , r7 , r9 ; r7: filter
+
+ ;usub8 lr, r8, r10 ; s = (s==4)*-1
+ ;sel lr, r11, r9
+ ;usub8 r8, r10, r8
+ ;sel r8, r11, r9
+ ;and r8, r8, lr ; -1 for each element that equals 4
+
+ ;calculate output
+ ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter)
+
+ ldr r8, [sp] ; load qs0
+ ldr r9, [sp, #4] ; load ps0
+
+ ldr r10, c0x01010101
+
+ qsub8 r8 ,r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
+ qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2)
+
+ ;end of modification for vp8
+
+ mov lr, #0
+ sadd8 r7, r7 , r10 ; vp8_filter += 1
+ shadd8 r7, r7, lr ; vp8_filter >>= 1
+
+ ldr r11, [sp, #12] ; load ps1
+ ldr r10, [sp, #8] ; load qs1
+
+ bic r7, r7, r6 ; vp8_filter &= ~hev
+ sub src, src, pstep, lsl #2
+
+ qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
+ qsub8 r10, r10,r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
+
+ eor r11, r11, r12 ; *op1 = u^0x80
+ str r11, [src], pstep ; store op1
+ eor r9, r9, r12 ; *op0 = u^0x80
+ str r9, [src], pstep ; store op0 result
+ eor r8, r8, r12 ; *oq0 = u^0x80
+ str r8, [src], pstep ; store oq0 result
+ eor r10, r10, r12 ; *oq1 = u^0x80
+ str r10, [src], pstep ; store oq1
+
+ sub src, src, pstep, lsl #1
+
+|hskip_filter|
+ add src, src, #4
+ sub src, src, pstep, lsl #2
+
+ subs count, count, #1
+
+ ;pld [src]
+ ;pld [src, pstep]
+ ;pld [src, pstep, lsl #1]
+ ;pld [src, pstep, lsl #2]
+ ;pld [src, pstep, lsl #3]
+
+ ldrne r9, [src], pstep ; p3
+ ldrne r10, [src], pstep ; p2
+ ldrne r11, [src], pstep ; p1
+
+ bne Hnext8
+
+ add sp, sp, #16
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_loop_filter_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_mbloop_filter_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r6, [sp, #36] ; load thresh address
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r9, [src], pstep ; p3
+ ldr r4, [r2], #4 ; flimit
+ ldr r10, [src], pstep ; p2
+ ldr r2, [r3], #4 ; limit
+ ldr r11, [src], pstep ; p1
+ uadd8 r4, r4, r4 ; flimit * 2
+ ldr r3, [r6], #4 ; thresh
+ mov count, count, lsl #1 ; 4-in-parallel
+ uadd8 r4, r4, r2 ; flimit * 2 + limit
+
+|MBHnext8|
+
+ ; vp8_filter_mask() function
+ ; calculate breakout conditions
+ ldr r12, [src], pstep ; p0
+
+ uqsub8 r6, r9, r10 ; p3 - p2
+ uqsub8 r7, r10, r9 ; p2 - p3
+ uqsub8 r8, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+
+ orr r6, r6, r7 ; abs (p3-p2)
+ orr r8, r8, r10 ; abs (p2-p1)
+ uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask
+ uqsub8 r8, r8, r2 ; compare to limit
+
+ uqsub8 r6, r11, r12 ; p1 - p0
+ orr lr, lr, r8
+ uqsub8 r7, r12, r11 ; p0 - p1
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src], pstep ; q1
+ orr r6, r6, r7 ; abs (p1-p0)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r10 ; p1 - q1
+ uqsub8 r7, r10, r11 ; q1 - p1
+ uqsub8 r11, r12, r9 ; p0 - q0
+ uqsub8 r12, r9, r12 ; q0 - p0
+ orr r6, r6, r7 ; abs (p1-q1)
+ ldr r7, c0x7F7F7F7F
+ orr r12, r11, r12 ; abs (p0-q0)
+ ldr r11, [src], pstep ; q2
+ uqadd8 r12, r12, r12 ; abs (p0-q0) * 2
+ and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r7, r9, r10 ; q0 - q1
+ uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r6, r10, r9 ; q1 - q0
+ uqsub8 r12, r12, r4 ; compare to flimit
+ uqsub8 r9, r11, r10 ; q2 - q1
+
+ orr lr, lr, r12
+
+ ldr r12, [src], pstep ; q3
+
+ uqsub8 r10, r10, r11 ; q1 - q2
+ orr r6, r7, r6 ; abs (q1-q0)
+ orr r10, r9, r10 ; abs (q2-q1)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r10, r10, r2 ; compare to limit
+ uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later
+ orr lr, lr, r7
+ orr lr, lr, r10
+
+ uqsub8 r10, r12, r11 ; q3 - q2
+ uqsub8 r9, r11, r12 ; q2 - q3
+
+ mvn r11, #0 ; r11 == -1
+
+ orr r10, r10, r9 ; abs (q3-q2)
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ mov r12, #0
+
+ orr lr, lr, r10
+
+ usub8 lr, r12, lr ; use usub8 instead of ssub8
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq mbhskip_filter ; skip filtering
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+ sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines
+ sub src, src, pstep, lsl #1
+
+ orr r10, r6, r8
+ ldr r7, [src], pstep ; p1
+
+ usub8 r10, r12, r10
+ sel r6, r12, r11 ; hev mask: r6
+
+ ;vp8_mbfilter() function
+ ;p2, q2 are only needed at the end. Don't need to load them in now.
+ ldr r8, [src], pstep ; p0
+ ldr r12, c0x80808080
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src] ; q1
+
+ eor r7, r7, r12 ; ps1
+ eor r8, r8, r12 ; ps0
+ eor r9, r9, r12 ; qs0
+ eor r10, r10, r12 ; qs1
+
+ qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ str r7, [sp, #12] ; store ps1 temporarily
+ qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
+ str r10, [sp, #8] ; store qs1 temporarily
+ qadd8 r7, r7, r12
+ str r9, [sp] ; store qs0 temporarily
+ qadd8 r7, r7, r12
+ str r8, [sp, #4] ; store ps0 temporarily
+ qadd8 r7, r7, r12 ; vp8_filter: r7
+
+ ldr r10, c0x03030303 ; r10 = 3 --modified for vp8
+ ldr r9, c0x04040404
+
+ and r7, r7, lr ; vp8_filter &= mask (lr is free)
+
+ mov r12, r7 ; Filter2: r12
+ and r12, r12, r6 ; Filter2 &= hev
+
+ ;modify code for vp8
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
+ qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
+
+ mov r10, #0
+ shadd8 r8 , r8 , r10 ; Filter1 >>= 3
+ shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ shadd8 r8 , r8 , r10
+ shadd8 r12 , r12 , r10
+ shadd8 r8 , r8 , r10 ; r8: Filter1
+ shadd8 r12 , r12 , r10 ; r12: Filter2
+
+ ldr r9, [sp] ; load qs0
+ ldr r11, [sp, #4] ; load ps0
+
+ qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
+ qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8)
+ ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
+ ;mov r10, #0
+ ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ ;usub8 lr, r8, r9 ; s = (s==4)*-1
+ ;sel lr, r11, r10
+ ;shadd8 r12 , r12 , r10
+ ;usub8 r8, r9, r8
+ ;sel r8, r11, r10
+ ;ldr r9, [sp] ; load qs0
+ ;ldr r11, [sp, #4] ; load ps0
+ ;shadd8 r12 , r12 , r10
+ ;and r8, r8, lr ; -1 for each element that equals 4
+ ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2)
+ ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
+ ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u)
+
+ ;end of modification for vp8
+
+ bic r12, r7, r6 ; vp8_filter &= ~hev ( r6 is free)
+ ;mov r12, r7
+
+ ;roughly 3/7th difference across boundary
+ mov lr, #0x1b ; 27
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r7, r10, lr, r7
+ smultb r10, r10, lr
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ add r10, r10, #63
+ ssat r7, #8, r7, asr #7
+ ssat r10, #8, r10, asr #7
+
+ ldr lr, c0x80808080
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r7, r10, lsl #16
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ sub src, src, pstep
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+
+ qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u)
+ qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u)
+ eor r8, r8, lr ; *oq0 = s^0x80
+ str r8, [src] ; store *oq0
+ sub src, src, pstep
+ eor r10, r10, lr ; *op0 = s^0x80
+ str r10, [src] ; store *op0
+
+ ;roughly 2/7th difference across boundary
+ mov lr, #0x12 ; 18
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r9, r10, lr, r7
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r9, #8, r9, asr #7
+ ssat r10, #8, r10, asr #7
+
+ ldr lr, c0x80808080
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r9, r10, lsl #16
+
+ ldr r9, [sp, #8] ; load qs1
+ ldr r11, [sp, #12] ; load ps1
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ sub src, src, pstep
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+
+ qadd8 r11, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u)
+ qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u)
+ eor r11, r11, lr ; *op1 = s^0x80
+ str r11, [src], pstep ; store *op1
+ eor r8, r8, lr ; *oq1 = s^0x80
+ add src, src, pstep, lsl #1
+
+ mov r7, #0x3f ; 63
+
+ str r8, [src], pstep ; store *oq1
+
+ ;roughly 1/7th difference across boundary
+ mov lr, #0x9 ; 9
+ ldr r9, [src] ; load q2
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r12, r10, lr, r7
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r12, #8, r12, asr #7
+ ssat r10, #8, r10, asr #7
+
+ sub src, src, pstep, lsl #2
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r12, r10, lsl #16
+
+ sub src, src, pstep
+ ldr lr, c0x80808080
+
+ ldr r11, [src] ; load p2
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ eor r9, r9, lr
+ eor r11, r11, lr
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+ qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u)
+ qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u)
+ eor r8, r8, lr ; *op2 = s^0x80
+ str r8, [src], pstep, lsl #2 ; store *op2
+ add src, src, pstep
+ eor r10, r10, lr ; *oq2 = s^0x80
+ str r10, [src], pstep, lsl #1 ; store *oq2
+
+|mbhskip_filter|
+ add src, src, #4
+ sub src, src, pstep, lsl #3
+ subs count, count, #1
+
+ ldrne r9, [src], pstep ; p3
+ ldrne r10, [src], pstep ; p2
+ ldrne r11, [src], pstep ; p1
+
+ bne MBHnext8
+
+ add sp, sp, #16
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_mbloop_filter_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, #4 ; move src pointer down by 4
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r12, [sp, #36] ; load thresh address
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r6, [src], pstep ; load source data
+ ldr r4, [r2], #4 ; flimit
+ ldr r7, [src], pstep
+ ldr r2, [r3], #4 ; limit
+ ldr r8, [src], pstep
+ uadd8 r4, r4, r4 ; flimit * 2
+ ldr r3, [r12], #4 ; thresh
+ ldr lr, [src], pstep
+ mov count, count, lsl #1 ; 4-in-parallel
+ uadd8 r4, r4, r2 ; flimit * 2 + limit
+
+|Vnext8|
+
+ ; vp8_filter_mask() function
+ ; calculate breakout conditions
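+    ; Rough scalar sketch of the breakout test (illustration only; the code
+    ; below evaluates it for four pixels at once, one per byte lane):
+    ;     mask = (|p3-p2| > limit) | (|p2-p1| > limit) | (|p1-p0| > limit)
+    ;          | (|q3-q2| > limit) | (|q2-q1| > limit) | (|q1-q0| > limit)
+    ;          | (|p0-q0|*2 + |p1-q1|/2 > flimit*2 + limit);
+    ; a lane is filtered only when none of the comparisons fire.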
+ ; transpose the source data for 4-in-parallel operation
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ uqsub8 r7, r9, r10 ; p3 - p2
+ uqsub8 r8, r10, r9 ; p2 - p3
+ uqsub8 r9, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+ orr r7, r7, r8 ; abs (p3-p2)
+ orr r10, r9, r10 ; abs (p2-p1)
+ uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr lr, lr, r10
+
+ uqsub8 r6, r11, r12 ; p1 - p0
+ uqsub8 r7, r12, r11 ; p0 - p1
+ add src, src, #4 ; move src pointer up by 4
+ orr r6, r6, r7 ; abs (p1-p0)
+ str r11, [sp, #12] ; save p1
+ uqsub8 r10, r6, r2 ; compare to limit
+ uqsub8 r11, r6, r3 ; compare to thresh
+ orr lr, lr, r10
+
+ ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
+ ; transpose the source data for 4-in-parallel operation
+ ldr r6, [src], pstep ; load source data
+ str r11, [sp] ; push r11 to stack
+ ldr r7, [src], pstep
+ str r12, [sp, #4] ; save current reg before load q0 - q3 data
+ ldr r8, [src], pstep
+ str lr, [sp, #8]
+ ldr lr, [src], pstep
+
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ ldr lr, [sp, #8] ; load back (f)limit accumulator
+
+ uqsub8 r6, r12, r11 ; q3 - q2
+ uqsub8 r7, r11, r12 ; q2 - q3
+ uqsub8 r12, r11, r10 ; q2 - q1
+ uqsub8 r11, r10, r11 ; q1 - q2
+ orr r6, r6, r7 ; abs (q3-q2)
+ orr r7, r12, r11 ; abs (q2-q1)
+ uqsub8 r6, r6, r2 ; compare to limit
+ uqsub8 r7, r7, r2 ; compare to limit
+ ldr r11, [sp, #4] ; load back p0
+ ldr r12, [sp, #12] ; load back p1
+ orr lr, lr, r6
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r9 ; p0 - q0
+ uqsub8 r7, r9, r11 ; q0 - p0
+ uqsub8 r8, r12, r10 ; p1 - q1
+ uqsub8 r11, r10, r12 ; q1 - p1
+ orr r6, r6, r7 ; abs (p0-q0)
+ ldr r7, c0x7F7F7F7F
+ orr r8, r8, r11 ; abs (p1-q1)
+ uqadd8 r6, r6, r6 ; abs (p0-q0) * 2
+ and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r11, r10, r9 ; q1 - q0
+ uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r12, r9, r10 ; q0 - q1
+ uqsub8 r6, r6, r4 ; compare to flimit
+
+ orr r9, r11, r12 ; abs (q1-q0)
+ uqsub8 r8, r9, r2 ; compare to limit
+ uqsub8 r10, r9, r3 ; compare to thresh
+ orr lr, lr, r6
+ orr lr, lr, r8
+
+ mvn r11, #0 ; r11 == -1
+ mov r12, #0
+
+ usub8 lr, r12, lr
+ ldr r9, [sp] ; load the compared result
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq vskip_filter ; skip filtering
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
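+    ; Rough per-lane sketch (illustration only):
+    ;     hev = ((|p1-p0| > thresh) || (|q1-q0| > thresh)) ? 0xff : 0x00;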
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr r9, r9, r10
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ usub8 r9, r12, r9
+ sel r6, r12, r11 ; hev mask: r6
+
+ ;vp8_filter() function
+    ; load source data to r6, r11, r12, lr
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ pkhbt r12, r7, r8, lsl #16
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ pkhbt r11, r9, r10, lsl #16
+
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
+ str r6, [sp]
+ str lr, [sp, #4]
+
+ pkhbt r6, r7, r8, lsl #16
+ pkhbt lr, r9, r10, lsl #16
+
+ ;transpose r12, r11, r6, lr to r7, r8, r9, r10
+ TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
+
+ ;load back hev_mask r6 and filter_mask lr
+ ldr r12, c0x80808080
+ ldr r6, [sp]
+ ldr lr, [sp, #4]
+
+ eor r7, r7, r12 ; p1 offset to convert to a signed value
+ eor r8, r8, r12 ; p0 offset to convert to a signed value
+ eor r9, r9, r12 ; q0 offset to convert to a signed value
+ eor r10, r10, r12 ; q1 offset to convert to a signed value
+
+ str r9, [sp] ; store qs0 temporarily
+ str r8, [sp, #4] ; store ps0 temporarily
+ str r10, [sp, #8] ; store qs1 temporarily
+ str r7, [sp, #12] ; store ps1 temporarily
+
+ qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
+ qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+
+ and r7, r7, r6 ; vp8_filter (r7) &= hev (r7 : filter)
+
+ qadd8 r7, r7, r8
+ ldr r9, c0x03030303 ; r9 = 3 --modified for vp8
+
+ qadd8 r7, r7, r8
+ ldr r10, c0x04040404
+
+ qadd8 r7, r7, r8
+ ;mvn r11, #0 ; r11 == -1
+
+ and r7, r7, lr ; vp8_filter &= mask
+
+ ;modify code for vp8 -- Filter1 = vp8_filter (r7)
+ qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+ qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
+
+ mov r9, #0
+ shadd8 r8 , r8 , r9 ; Filter2 >>= 3
+ shadd8 r7 , r7 , r9 ; vp8_filter >>= 3
+ shadd8 r8 , r8 , r9
+ shadd8 r7 , r7 , r9
+ shadd8 lr , r8 , r9 ; lr: filter2
+ shadd8 r7 , r7 , r9 ; r7: filter
+
+ ;usub8 lr, r8, r10 ; s = (s==4)*-1
+ ;sel lr, r11, r9
+ ;usub8 r8, r10, r8
+ ;sel r8, r11, r9
+ ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s
+
+ ;calculate output
+ ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter)
+
+ ldr r8, [sp] ; load qs0
+ ldr r9, [sp, #4] ; load ps0
+
+ ldr r10, c0x01010101
+
+ qsub8 r8, r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
+ qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2)
+ ;end of modification for vp8
+
+ eor r8, r8, r12
+ eor r9, r9, r12
+
+ mov lr, #0
+
+ sadd8 r7, r7, r10
+ shadd8 r7, r7, lr
+
+ ldr r10, [sp, #8] ; load qs1
+ ldr r11, [sp, #12] ; load ps1
+
+ bic r7, r7, r6 ; r7: vp8_filter
+
+ qsub8 r10 , r10, r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
+ qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
+ eor r10, r10, r12
+ eor r11, r11, r12
+
+ sub src, src, pstep, lsl #2
+
+ ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
+ ;output is b0, b1, b2, b3
+ ;b0: 03 02 01 00
+ ;b1: 13 12 11 10
+ ;b2: 23 22 21 20
+ ;b3: 33 32 31 30
+ ; p1 p0 q0 q1
+ ; (a3 a2 a1 a0)
+ TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr
+
+ strh r6, [src, #-2] ; store the result
+ mov r6, r6, lsr #16
+ strh r6, [src], pstep
+
+ strh r7, [src, #-2]
+ mov r7, r7, lsr #16
+ strh r7, [src], pstep
+
+ strh r12, [src, #-2]
+ mov r12, r12, lsr #16
+ strh r12, [src], pstep
+
+ strh lr, [src, #-2]
+ mov lr, lr, lsr #16
+ strh lr, [src], pstep
+
+|vskip_filter|
+ sub src, src, #4
+ subs count, count, #1
+
+ ldrne r6, [src], pstep ; load source data
+ ldrne r7, [src], pstep
+ ldrne r8, [src], pstep
+ ldrne lr, [src], pstep
+
+ bne Vnext8
+
+ add sp, sp, #16
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_loop_filter_vertical_edge_armv6|
+
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_mbloop_filter_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, #4 ; move src pointer down by 4
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r12, [sp, #36] ; load thresh address
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r6, [src], pstep ; load source data
+ ldr r4, [r2], #4 ; flimit
+ ldr r7, [src], pstep
+ ldr r2, [r3], #4 ; limit
+ ldr r8, [src], pstep
+ uadd8 r4, r4, r4 ; flimit * 2
+ ldr r3, [r12], #4 ; thresh
+ ldr lr, [src], pstep
+ mov count, count, lsl #1 ; 4-in-parallel
+ uadd8 r4, r4, r2 ; flimit * 2 + limit
+
+|MBVnext8|
+ ; vp8_filter_mask() function
+ ; calculate breakout conditions
+ ; transpose the source data for 4-in-parallel operation
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ uqsub8 r7, r9, r10 ; p3 - p2
+ uqsub8 r8, r10, r9 ; p2 - p3
+ uqsub8 r9, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+ orr r7, r7, r8 ; abs (p3-p2)
+ orr r10, r9, r10 ; abs (p2-p1)
+ uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr lr, lr, r10
+
+ uqsub8 r6, r11, r12 ; p1 - p0
+ uqsub8 r7, r12, r11 ; p0 - p1
+ add src, src, #4 ; move src pointer up by 4
+ orr r6, r6, r7 ; abs (p1-p0)
+ str r11, [sp, #12] ; save p1
+ uqsub8 r10, r6, r2 ; compare to limit
+ uqsub8 r11, r6, r3 ; compare to thresh
+ orr lr, lr, r10
+
+ ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
+ ; transpose the source data for 4-in-parallel operation
+ ldr r6, [src], pstep ; load source data
+ str r11, [sp] ; push r11 to stack
+ ldr r7, [src], pstep
+ str r12, [sp, #4] ; save current reg before load q0 - q3 data
+ ldr r8, [src], pstep
+ str lr, [sp, #8]
+ ldr lr, [src], pstep
+
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ ldr lr, [sp, #8] ; load back (f)limit accumulator
+
+ uqsub8 r6, r12, r11 ; q3 - q2
+ uqsub8 r7, r11, r12 ; q2 - q3
+ uqsub8 r12, r11, r10 ; q2 - q1
+ uqsub8 r11, r10, r11 ; q1 - q2
+ orr r6, r6, r7 ; abs (q3-q2)
+ orr r7, r12, r11 ; abs (q2-q1)
+ uqsub8 r6, r6, r2 ; compare to limit
+ uqsub8 r7, r7, r2 ; compare to limit
+ ldr r11, [sp, #4] ; load back p0
+ ldr r12, [sp, #12] ; load back p1
+ orr lr, lr, r6
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r9 ; p0 - q0
+ uqsub8 r7, r9, r11 ; q0 - p0
+ uqsub8 r8, r12, r10 ; p1 - q1
+ uqsub8 r11, r10, r12 ; q1 - p1
+ orr r6, r6, r7 ; abs (p0-q0)
+ ldr r7, c0x7F7F7F7F
+ orr r8, r8, r11 ; abs (p1-q1)
+ uqadd8 r6, r6, r6 ; abs (p0-q0) * 2
+ and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r11, r10, r9 ; q1 - q0
+ uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r12, r9, r10 ; q0 - q1
+ uqsub8 r6, r6, r4 ; compare to flimit
+
+ orr r9, r11, r12 ; abs (q1-q0)
+ uqsub8 r8, r9, r2 ; compare to limit
+ uqsub8 r10, r9, r3 ; compare to thresh
+ orr lr, lr, r6
+ orr lr, lr, r8
+
+ mvn r11, #0 ; r11 == -1
+ mov r12, #0
+
+ usub8 lr, r12, lr
+ ldr r9, [sp] ; load the compared result
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq mbvskip_filter ; skip filtering
+
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr r9, r9, r10
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ usub8 r9, r12, r9
+ sel r6, r12, r11 ; hev mask: r6
+
+
+ ; vp8_mbfilter() function
+ ; p2, q2 are only needed at the end. Don't need to load them in now.
+ ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
+    ; load source data to r6, r11, r12, lr
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ pkhbt r12, r7, r8, lsl #16
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ pkhbt r11, r9, r10, lsl #16
+
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ str r6, [sp] ; save r6
+ str lr, [sp, #4] ; save lr
+
+ pkhbt r6, r7, r8, lsl #16
+ pkhbt lr, r9, r10, lsl #16
+
+ ;transpose r12, r11, r6, lr to p1, p0, q0, q1
+ TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
+
+ ;load back hev_mask r6 and filter_mask lr
+ ldr r12, c0x80808080
+ ldr r6, [sp]
+ ldr lr, [sp, #4]
+
+ eor r7, r7, r12 ; ps1
+ eor r8, r8, r12 ; ps0
+ eor r9, r9, r12 ; qs0
+ eor r10, r10, r12 ; qs1
+
+ qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ str r7, [sp, #12] ; store ps1 temporarily
+ qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
+ str r10, [sp, #8] ; store qs1 temporarily
+ qadd8 r7, r7, r12
+ str r9, [sp] ; store qs0 temporarily
+ qadd8 r7, r7, r12
+ str r8, [sp, #4] ; store ps0 temporarily
+ qadd8 r7, r7, r12 ; vp8_filter: r7
+
+ ldr r10, c0x03030303 ; r10 = 3 --modified for vp8
+ ldr r9, c0x04040404
+ ;mvn r11, #0 ; r11 == -1
+
+ and r7, r7, lr ; vp8_filter &= mask (lr is free)
+
+ mov r12, r7 ; Filter2: r12
+ and r12, r12, r6 ; Filter2 &= hev
+
+ ;modify code for vp8
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
+ qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
+
+ mov r10, #0
+ shadd8 r8 , r8 , r10 ; Filter1 >>= 3
+ shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ shadd8 r8 , r8 , r10
+ shadd8 r12 , r12 , r10
+ shadd8 r8 , r8 , r10 ; r8: Filter1
+ shadd8 r12 , r12 , r10 ; r12: Filter2
+
+ ldr r9, [sp] ; load qs0
+ ldr r11, [sp, #4] ; load ps0
+
+ qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
+ qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8)
+ ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
+ ;mov r10, #0
+ ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ ;usub8 lr, r8, r9 ; s = (s==4)*-1
+ ;sel lr, r11, r10
+ ;shadd8 r12 , r12 , r10
+ ;usub8 r8, r9, r8
+ ;sel r8, r11, r10
+ ;ldr r9, [sp] ; load qs0
+ ;ldr r11, [sp, #4] ; load ps0
+ ;shadd8 r12 , r12 , r10
+ ;and r8, r8, lr ; -1 for each element that equals 4
+ ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2)
+ ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
+ ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u)
+
+ ;end of modification for vp8
+
+ bic r12, r7, r6 ;vp8_filter &= ~hev ( r6 is free)
+ ;mov r12, r7
+
+ ;roughly 3/7th difference across boundary
+ mov lr, #0x1b ; 27
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r7, r10, lr, r7
+ smultb r10, r10, lr
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ add r10, r10, #63
+ ssat r7, #8, r7, asr #7
+ ssat r10, #8, r10, asr #7
+
+ ldr lr, c0x80808080
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r7, r10, lsl #16
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+
+ qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u)
+ qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u)
+ eor r8, r8, lr ; *oq0 = s^0x80
+ eor r10, r10, lr ; *op0 = s^0x80
+
+ strb r10, [src, #-1] ; store op0 result
+ strb r8, [src], pstep ; store oq0 result
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ strb r10, [src, #-1]
+ strb r8, [src], pstep
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ strb r10, [src, #-1]
+ strb r8, [src], pstep
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ strb r10, [src, #-1]
+ strb r8, [src], pstep
+
+ ;roughly 2/7th difference across boundary
+ mov lr, #0x12 ; 18
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r9, r10, lr, r7
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r9, #8, r9, asr #7
+ ssat r10, #8, r10, asr #7
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r9, r10, lsl #16
+
+ ldr r9, [sp, #8] ; load qs1
+ ldr r11, [sp, #12] ; load ps1
+ ldr lr, c0x80808080
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ add src, src, #2
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+
+ qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u)
+ qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u)
+ eor r8, r8, lr ; *oq1 = s^0x80
+ eor r10, r10, lr ; *op1 = s^0x80
+
+ ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary
+ strb r10, [src, #-4] ; store op1
+ strb r8, [src, #-1] ; store oq1
+ ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary
+
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+
+ ldrb r6, [src, #-5]
+ strb r10, [src, #-4]
+ strb r8, [src, #-1]
+ ldrb r7, [src], pstep
+
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ orr r11, r11, r6, lsl #8
+ orr r9, r9, r7, lsl #8
+
+ ldrb r6, [src, #-5]
+ strb r10, [src, #-4]
+ strb r8, [src, #-1]
+ ldrb r7, [src], pstep
+
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ orr r11, r11, r6, lsl #16
+ orr r9, r9, r7, lsl #16
+
+ ldrb r6, [src, #-5]
+ strb r10, [src, #-4]
+ strb r8, [src, #-1]
+ ldrb r7, [src], pstep
+ orr r11, r11, r6, lsl #24
+ orr r9, r9, r7, lsl #24
+
+ ;roughly 1/7th difference across boundary
+ eor r9, r9, lr
+ eor r11, r11, lr
+
+ mov lr, #0x9 ; 9
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r12, r10, lr, r7
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r12, #8, r12, asr #7
+ ssat r10, #8, r10, asr #7
+
+ sub src, src, pstep, lsl #2
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r12, r10, lsl #16
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ ldr lr, c0x80808080
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+ qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u)
+ qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u)
+ eor r8, r8, lr ; *op2 = s^0x80
+ eor r10, r10, lr ; *oq2 = s^0x80
+
+ strb r8, [src, #-5] ; store *op2
+ strb r10, [src], pstep ; store *oq2
+ mov r8, r8, lsr #8
+ mov r10, r10, lsr #8
+ strb r8, [src, #-5]
+ strb r10, [src], pstep
+ mov r8, r8, lsr #8
+ mov r10, r10, lsr #8
+ strb r8, [src, #-5]
+ strb r10, [src], pstep
+ mov r8, r8, lsr #8
+ mov r10, r10, lsr #8
+ strb r8, [src, #-5]
+ strb r10, [src], pstep
+
+ ;adjust src pointer for next loop
+ sub src, src, #2
+
+|mbvskip_filter|
+ sub src, src, #4
+ subs count, count, #1
+
+ ldrne r6, [src], pstep ; load source data
+ ldrne r7, [src], pstep
+ ldrne r8, [src], pstep
+ ldrne lr, [src], pstep
+
+ bne MBVnext8
+
+ add sp, sp, #16
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_mbloop_filter_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD 0x80808080
+c0x03030303 DCD 0x03030303
+c0x04040404 DCD 0x04040404
+c0x01010101 DCD 0x01010101
+c0x7F7F7F7F DCD 0x7F7F7F7F
+
+ END
diff --git a/vp8/common/arm/armv6/recon_v6.asm b/vp8/common/arm/armv6/recon_v6.asm
new file mode 100644
index 000000000..085ff80c9
--- /dev/null
+++ b/vp8/common/arm/armv6/recon_v6.asm
@@ -0,0 +1,280 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_recon_b_armv6|
+ EXPORT |vp8_recon2b_armv6|
+ EXPORT |vp8_recon4b_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+prd RN r0
+dif RN r1
+dst RN r2
+stride RN r3
+
+;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride)
+; R0 char* pred_ptr
+; R1 short * dif_ptr
+; R2 char * dst_ptr
+; R3 int stride
+
+; Description:
+; Loop through the block adding the Pred and Diff together. Clamp and then
+; store back into the Dst.
+
+; Restrictions:
+; all buffers are expected to be 4-byte aligned coming in and
+; going out.
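+;
+; A rough C-style sketch of the operation (illustration only; the variable
+; names here are descriptive, not the project's reference code):
+;     for (r = 0; r < 4; r++) {
+;         for (c = 0; c < 4; c++)
+;             dst[c] = clamp_to_uchar(pred[c] + diff[c]);
+;         pred += 16;          /* prediction rows are 16 bytes apart        */
+;         diff += 16;          /* diff rows are 16 shorts (32 bytes) apart  */
+;         dst  += stride;
+;     }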
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+;
+;
+;
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_recon_b_armv6| PROC
+ stmdb sp!, {r4 - r9, lr}
+
+ ;0, 1, 2, 3
+ ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
+ ldr r6, [dif, #0] ; 1 | 0
+ ldr r7, [dif, #4] ; 3 | 2
+
+ pkhbt r8, r6, r7, lsl #16 ; 2 | 0
+ pkhtb r9, r7, r6, asr #16 ; 3 | 1
+
+ uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
+ uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
+
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ add dif, dif, #32
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst], stride
+
+ ;0, 1, 2, 3
+ ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
+;; ldr r6, [dif, #8] ; 1 | 0
+;; ldr r7, [dif, #12] ; 3 | 2
+ ldr r6, [dif, #0] ; 1 | 0
+ ldr r7, [dif, #4] ; 3 | 2
+
+ pkhbt r8, r6, r7, lsl #16 ; 2 | 0
+ pkhtb r9, r7, r6, asr #16 ; 3 | 1
+
+ uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
+ uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
+
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ add dif, dif, #32
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst], stride
+
+ ;0, 1, 2, 3
+ ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
+;; ldr r6, [dif, #16] ; 1 | 0
+;; ldr r7, [dif, #20] ; 3 | 2
+ ldr r6, [dif, #0] ; 1 | 0
+ ldr r7, [dif, #4] ; 3 | 2
+
+ pkhbt r8, r6, r7, lsl #16 ; 2 | 0
+ pkhtb r9, r7, r6, asr #16 ; 3 | 1
+
+ uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
+ uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
+
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ add dif, dif, #32
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst], stride
+
+ ;0, 1, 2, 3
+ ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
+;; ldr r6, [dif, #24] ; 1 | 0
+;; ldr r7, [dif, #28] ; 3 | 2
+ ldr r6, [dif, #0] ; 1 | 0
+ ldr r7, [dif, #4] ; 3 | 2
+
+ pkhbt r8, r6, r7, lsl #16 ; 2 | 0
+ pkhtb r9, r7, r6, asr #16 ; 3 | 1
+
+ uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
+ uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
+
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst], stride
+
+ ldmia sp!, {r4 - r9, pc}
+
+ ENDP ; |recon_b|
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+;
+;
+;
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+; R0 char *pred_ptr
+; R1 short *dif_ptr
+; R2 char *dst_ptr
+; R3 int stride
+|vp8_recon4b_armv6| PROC
+ stmdb sp!, {r4 - r9, lr}
+
+ mov lr, #4
+
+recon4b_loop
+ ;0, 1, 2, 3
+ ldr r4, [prd], #4 ; 3 | 2 | 1 | 0
+ ldr r6, [dif, #0] ; 1 | 0
+ ldr r7, [dif, #4] ; 3 | 2
+
+ pkhbt r8, r6, r7, lsl #16 ; 2 | 0
+ pkhtb r9, r7, r6, asr #16 ; 3 | 1
+
+ uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
+ uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
+
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst]
+
+ ;4, 5, 6, 7
+ ldr r4, [prd], #4
+;; ldr r6, [dif, #32]
+;; ldr r7, [dif, #36]
+ ldr r6, [dif, #8]
+ ldr r7, [dif, #12]
+
+ pkhbt r8, r6, r7, lsl #16
+ pkhtb r9, r7, r6, asr #16
+
+ uxtab16 r8, r8, r4
+ uxtab16 r9, r9, r4, ror #8
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst, #4]
+
+ ;8, 9, 10, 11
+ ldr r4, [prd], #4
+;; ldr r6, [dif, #64]
+;; ldr r7, [dif, #68]
+ ldr r6, [dif, #16]
+ ldr r7, [dif, #20]
+
+ pkhbt r8, r6, r7, lsl #16
+ pkhtb r9, r7, r6, asr #16
+
+ uxtab16 r8, r8, r4
+ uxtab16 r9, r9, r4, ror #8
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst, #8]
+
+ ;12, 13, 14, 15
+ ldr r4, [prd], #4
+;; ldr r6, [dif, #96]
+;; ldr r7, [dif, #100]
+ ldr r6, [dif, #24]
+ ldr r7, [dif, #28]
+
+ pkhbt r8, r6, r7, lsl #16
+ pkhtb r9, r7, r6, asr #16
+
+ uxtab16 r8, r8, r4
+ uxtab16 r9, r9, r4, ror #8
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst, #12]
+
+ add dst, dst, stride
+;; add dif, dif, #8
+ add dif, dif, #32
+
+ subs lr, lr, #1
+ bne recon4b_loop
+
+ ldmia sp!, {r4 - r9, pc}
+
+ ENDP ; |Recon4B|
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+;
+;
+;
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+; R0 char *pred_ptr
+; R1 short *dif_ptr
+; R2 char *dst_ptr
+; R3 int stride
+|vp8_recon2b_armv6| PROC
+ stmdb sp!, {r4 - r9, lr}
+
+ mov lr, #4
+
+recon2b_loop
+ ;0, 1, 2, 3
+ ldr r4, [prd], #4
+ ldr r6, [dif, #0]
+ ldr r7, [dif, #4]
+
+ pkhbt r8, r6, r7, lsl #16
+ pkhtb r9, r7, r6, asr #16
+
+ uxtab16 r8, r8, r4
+ uxtab16 r9, r9, r4, ror #8
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst]
+
+ ;4, 5, 6, 7
+ ldr r4, [prd], #4
+;; ldr r6, [dif, #32]
+;; ldr r7, [dif, #36]
+ ldr r6, [dif, #8]
+ ldr r7, [dif, #12]
+
+ pkhbt r8, r6, r7, lsl #16
+ pkhtb r9, r7, r6, asr #16
+
+ uxtab16 r8, r8, r4
+ uxtab16 r9, r9, r4, ror #8
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst, #4]
+
+ add dst, dst, stride
+;; add dif, dif, #8
+ add dif, dif, #16
+
+ subs lr, lr, #1
+ bne recon2b_loop
+
+ ldmia sp!, {r4 - r9, pc}
+
+ ENDP ; |Recon2B|
+
+ END
diff --git a/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
new file mode 100644
index 000000000..15c6c7d16
--- /dev/null
+++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
@@ -0,0 +1,321 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6|
+ EXPORT |vp8_loop_filter_simple_vertical_edge_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+ MACRO
+ TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+ ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+ ; a0: 03 02 01 00
+ ; a1: 13 12 11 10
+ ; a2: 23 22 21 20
+ ; a3: 33 32 31 30
+ ; b3 b2 b1 b0
+
+ uxtb16 $b1, $a1 ; xx 12 xx 10
+ uxtb16 $b0, $a0 ; xx 02 xx 00
+ uxtb16 $b3, $a3 ; xx 32 xx 30
+ uxtb16 $b2, $a2 ; xx 22 xx 20
+ orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
+ orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
+
+ uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
+ uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
+ uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
+ uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
+ orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
+ orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
+
+ pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
+ pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
+
+ pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
+ pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
+ MEND
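+    ; Net effect: a 4x4 byte-matrix transpose, so each output word collects
+    ; one byte column of the four input words; the p3..p0 labels above
+    ; reflect the loop filter's particular use of the macro.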
+
+
+src RN r0
+pstep RN r1
+
+;r0 unsigned char *src_ptr,
+;r1 int src_pixel_step,
+;r2 const char *flimit,
+;r3 const char *limit,
+;stack const char *thresh,
+;stack int count
+
+;Note: all 16 elements in flimit are equal, so only one load is needed for flimit.
+;The same applies to limit and thresh.
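+;Rough per-pixel sketch of what each pass computes (illustration only; the code
+;below works on four bytes at a time and biases pixels by 0x80 so they can be
+;treated as signed values):
+;    mask = (|p0-q0|*2 + |p1-q1|/2 <= flimit*2 + limit) ? 0xff : 0;
+;    f    = clamp(clamp(p1 - q1) + 3*(q0 - p0)) & mask;
+;    *op0 = clamp(p0 + (clamp(f + 3) >> 3));
+;    *oq0 = clamp(q0 - (clamp(f + 4) >> 3));
+;where clamp() saturates to the signed char range.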
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, pstep, lsl #1 ; move src pointer down by 2 lines
+
+ ldr r12, [r3], #4 ; limit
+ ldr r3, [src], pstep ; p1
+
+ ldr r9, [sp, #36] ; count for 8-in-parallel
+ ldr r4, [src], pstep ; p0
+
+ ldr r7, [r2], #4 ; flimit
+ ldr r5, [src], pstep ; q0
+ ldr r2, c0x80808080
+
+ ldr r6, [src] ; q1
+
+ uadd8 r7, r7, r7 ; flimit * 2
+ mov r9, r9, lsl #1 ; 4-in-parallel
+ uadd8 r12, r7, r12 ; flimit * 2 + limit
+
+|simple_hnext8|
+ ; vp8_simple_filter_mask() function
+
+ uqsub8 r7, r3, r6 ; p1 - q1
+ uqsub8 r8, r6, r3 ; q1 - p1
+ uqsub8 r10, r4, r5 ; p0 - q0
+ uqsub8 r11, r5, r4 ; q0 - p0
+ orr r8, r8, r7 ; abs(p1 - q1)
+ ldr lr, c0x7F7F7F7F ; 01111111 mask
+ orr r10, r10, r11 ; abs(p0 - q0)
+ and r8, lr, r8, lsr #1 ; abs(p1 - q1) / 2
+ uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2
+    mvn         lr, #0                      ; lr == -1
+ uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+ ; STALL waiting on r10 :(
+ uqsub8 r10, r10, r12 ; compare to flimit
+ mov r8, #0
+
+ usub8 r10, r8, r10 ; use usub8 instead of ssub8
+ ; STALL (maybe?) when are flags set? :/
+    sel         r10, lr, r8                 ; filter mask: r10
+
+ cmp r10, #0
+ beq simple_hskip_filter ; skip filtering
+
+ ;vp8_simple_filter() function
+
+ eor r3, r3, r2 ; p1 offset to convert to a signed value
+ eor r6, r6, r2 ; q1 offset to convert to a signed value
+ eor r4, r4, r2 ; p0 offset to convert to a signed value
+ eor r5, r5, r2 ; q0 offset to convert to a signed value
+
+ qsub8 r3, r3, r6 ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1)
+ qsub8 r6, r5, r4 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0))
+
+ qadd8 r3, r3, r6
+ ldr r8, c0x03030303 ; r8 = 3
+
+ qadd8 r3, r3, r6
+ ldr r7, c0x04040404
+
+ qadd8 r3, r3, r6
+    and         r3, r3, r10                 ; vp8_filter &= mask;
+
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ qadd8 r8 , r3 , r8 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+ qadd8 r3 , r3 , r7 ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)
+
+ mov r7, #0
+ shadd8 r8 , r8 , r7 ; Filter2 >>= 3
+ shadd8 r3 , r3 , r7 ; Filter1 >>= 3
+ shadd8 r8 , r8 , r7
+ shadd8 r3 , r3 , r7
+ shadd8 r8 , r8 , r7 ; r8: Filter2
+    shadd8      r3 , r3 , r7                ; r3: Filter1
+
+ ;calculate output
+ sub src, src, pstep, lsl #1
+
+ qadd8 r4, r4, r8 ; u = vp8_signed_char_clamp(p0 + Filter2)
+ qsub8 r5 ,r5, r3 ; u = vp8_signed_char_clamp(q0 - Filter1)
+ eor r4, r4, r2 ; *op0 = u^0x80
+ str r4, [src], pstep ; store op0 result
+ eor r5, r5, r2 ; *oq0 = u^0x80
+ str r5, [src], pstep ; store oq0 result
+
+|simple_hskip_filter|
+ add src, src, #4
+ sub src, src, pstep
+ sub src, src, pstep, lsl #1
+
+ subs r9, r9, #1
+
+ ;pld [src]
+ ;pld [src, pstep]
+ ;pld [src, pstep, lsl #1]
+
+ ldrne r3, [src], pstep ; p1
+ ldrne r4, [src], pstep ; p0
+ ldrne r5, [src], pstep ; q0
+ ldrne r6, [src] ; q1
+
+ bne simple_hnext8
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_loop_filter_simple_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_simple_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r12, [r2], #4 ; r12: flimit
+ ldr r2, c0x80808080
+ ldr r7, [r3], #4 ; limit
+
+    ; load source data to r7, r8, r9, r10
+ ldrh r3, [src, #-2]
+ ldrh r4, [src], pstep
+ uadd8 r12, r12, r12 ; flimit * 2
+
+ ldrh r5, [src, #-2]
+ ldrh r6, [src], pstep
+ uadd8 r12, r12, r7 ; flimit * 2 + limit
+
+ pkhbt r7, r3, r4, lsl #16
+
+ ldrh r3, [src, #-2]
+ ldrh r4, [src], pstep
+ ldr r11, [sp, #40] ; count (r11) for 8-in-parallel
+
+ pkhbt r8, r5, r6, lsl #16
+
+ ldrh r5, [src, #-2]
+ ldrh r6, [src], pstep
+ mov r11, r11, lsl #1 ; 4-in-parallel
+
+|simple_vnext8|
+ ; vp8_simple_filter_mask() function
+ pkhbt r9, r3, r4, lsl #16
+ pkhbt r10, r5, r6, lsl #16
+
+ ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
+ TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
+
+ uqsub8 r7, r3, r6 ; p1 - q1
+ uqsub8 r8, r6, r3 ; q1 - p1
+ uqsub8 r9, r4, r5 ; p0 - q0
+ uqsub8 r10, r5, r4 ; q0 - p0
+ orr r7, r7, r8 ; abs(p1 - q1)
+ orr r9, r9, r10 ; abs(p0 - q0)
+ ldr lr, c0x7F7F7F7F ; 0111 1111 mask
+ uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2
+ and r7, lr, r7, lsr #1 ; abs(p1 - q1) / 2
+ mov r8, #0
+ uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+ mvn r10, #0 ; r10 == -1
+ uqsub8 r7, r7, r12 ; compare to flimit
+
+ usub8 r7, r8, r7
+    sel         lr, r10, r8                 ; filter mask: lr
+
+ cmp lr, #0
+ beq simple_vskip_filter ; skip filtering
+
+ ;vp8_simple_filter() function
+ eor r3, r3, r2 ; p1 offset to convert to a signed value
+ eor r6, r6, r2 ; q1 offset to convert to a signed value
+ eor r4, r4, r2 ; p0 offset to convert to a signed value
+ eor r5, r5, r2 ; q0 offset to convert to a signed value
+
+ qsub8 r3, r3, r6 ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1)
+ qsub8 r6, r5, r4 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0))
+
+ qadd8 r3, r3, r6
+ ldr r8, c0x03030303 ; r8 = 3
+
+ qadd8 r3, r3, r6
+ ldr r7, c0x04040404
+
+ qadd8 r3, r3, r6
+ and r3, r3, lr ; vp8_filter &= mask
+
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ qadd8 r8 , r3 , r8 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+ qadd8 r3 , r3 , r7 ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)
+
+ mov r7, #0
+ shadd8 r8 , r8 , r7 ; Filter2 >>= 3
+ shadd8 r3 , r3 , r7 ; Filter1 >>= 3
+ shadd8 r8 , r8 , r7
+ shadd8 r3 , r3 , r7
+ shadd8 r8 , r8 , r7 ; r8: filter2
+    shadd8      r3 , r3 , r7                ; r3: Filter1
+
+ ;calculate output
+ sub src, src, pstep, lsl #2
+
+ qadd8 r4, r4, r8 ; u = vp8_signed_char_clamp(p0 + Filter2)
+ qsub8 r5, r5, r3 ; u = vp8_signed_char_clamp(q0 - Filter1)
+ eor r4, r4, r2 ; *op0 = u^0x80
+ eor r5, r5, r2 ; *oq0 = u^0x80
+
+ strb r4, [src, #-1] ; store the result
+ mov r4, r4, lsr #8
+ strb r5, [src], pstep
+ mov r5, r5, lsr #8
+
+ strb r4, [src, #-1]
+ mov r4, r4, lsr #8
+ strb r5, [src], pstep
+ mov r5, r5, lsr #8
+
+ strb r4, [src, #-1]
+ mov r4, r4, lsr #8
+ strb r5, [src], pstep
+ mov r5, r5, lsr #8
+
+ strb r4, [src, #-1]
+ strb r5, [src], pstep
+
+|simple_vskip_filter|
+ subs r11, r11, #1
+
+ ;pld [src]
+ ;pld [src, pstep]
+ ;pld [src, pstep, lsl #1]
+
+    ; load source data to r7, r8, r9, r10
+ ldrneh r3, [src, #-2]
+ ldrneh r4, [src], pstep
+
+ ldrneh r5, [src, #-2]
+ ldrneh r6, [src], pstep
+
+ pkhbt r7, r3, r4, lsl #16
+
+ ldrneh r3, [src, #-2]
+ ldrneh r4, [src], pstep
+
+ pkhbt r8, r5, r6, lsl #16
+
+ ldrneh r5, [src, #-2]
+ ldrneh r6, [src], pstep
+
+ bne simple_vnext8
+
+ ldmia sp!, {r4 - r12, pc}
+ ENDP ; |vp8_loop_filter_simple_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD 0x80808080
+c0x03030303 DCD 0x03030303
+c0x04040404 DCD 0x04040404
+c0x01010101 DCD 0x01010101
+c0x7F7F7F7F DCD 0x7F7F7F7F
+
+ END
diff --git a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
new file mode 100644
index 000000000..551d863e9
--- /dev/null
+++ b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -0,0 +1,277 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict8x4_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+;-------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack unsigned char *dst_ptr,
+; stack int dst_pitch
+;-------------------------------------
+;note: In the first pass, the result is stored transposed (8 lines x 9 columns) on the stack. Temporary stack size is 184 bytes.
+;Line width is 20 bytes: 9 short values (18 bytes) plus 2 padding bytes for 4-byte alignment. In the second pass, data is loaded
+;from the stack and the result is stored transposed back to dst.
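+;A rough sketch of the two passes (illustration only):
+;    first pass : tmp[c][r] = clamp8((6-tap horizontal filter of src row r around column c,
+;                 selected by xoffset, + 64) >> 7) for 8 columns x 9 rows, written transposed
+;                 onto the stack;
+;    second pass: dst[r][c] = clamp8((6-tap filter of 6 consecutive tmp values, selected by
+;                 yoffset, + 64) >> 7), transposed back as it is stored.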
+|vp8_sixtap_predict8x4_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+ sub sp, sp, #184 ;reserve space on stack for temporary storage: 20x(8+1) +4
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ str r3, [sp], #4 ;store yoffset
+ beq skip_firstpass_filter
+
+;first-pass filter
+ ldr r12, _filter8_coeff_
+ sub r0, r0, r1, lsl #1
+
+ add r2, r12, r2, lsl #4 ;calculate filter location
+    add     r0, r0, #3                      ;adjust src only for loading convenience
+
+ ldr r3, [r2] ; load up packed filter coefficients
+ ldr r4, [r2, #4]
+ ldr r5, [r2, #8]
+
+ mov r2, #0x90000 ; height=9 is top part of counter
+
+ sub r1, r1, #8
+ mov lr, #20
+
+|first_pass_hloop_v6|
+ ldrb r6, [r0, #-5] ; load source data
+ ldrb r7, [r0, #-4]
+ ldrb r8, [r0, #-3]
+ ldrb r9, [r0, #-2]
+ ldrb r10, [r0, #-1]
+
+ orr r2, r2, #0x4 ; construct loop counter. width=8=4x2
+
+ pkhbt r6, r6, r7, lsl #16 ; r7 | r6
+ pkhbt r7, r7, r8, lsl #16 ; r8 | r7
+
+ pkhbt r8, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+
+|first_pass_wloop_v6|
+ smuad r11, r6, r3 ; vp8_filter[0], vp8_filter[1]
+ smuad r12, r7, r3
+
+ ldrb r6, [r0], #1
+
+ smlad r11, r8, r4, r11 ; vp8_filter[2], vp8_filter[3]
+ ldrb r7, [r0], #1
+ smlad r12, r9, r4, r12
+
+ pkhbt r10, r10, r6, lsl #16 ; r10 | r9
+ pkhbt r6, r6, r7, lsl #16 ; r11 | r10
+ smlad r11, r10, r5, r11 ; vp8_filter[4], vp8_filter[5]
+ smlad r12, r6, r5, r12
+
+ sub r2, r2, #1
+
+ add r11, r11, #0x40 ; round_shift_and_clamp
+ tst r2, #0xff ; test loop counter
+ usat r11, #8, r11, asr #7
+ add r12, r12, #0x40
+    strh        r11, [sp], lr               ; result is transposed and stored for the second pass
+ usat r12, #8, r12, asr #7
+
+ strh r12, [sp], lr
+
+ movne r11, r6
+ movne r12, r7
+
+ movne r6, r8
+ movne r7, r9
+ movne r8, r10
+ movne r9, r11
+ movne r10, r12
+
+ bne first_pass_wloop_v6
+
+ ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines
+ ;;IF ARCHITECTURE=6
+ ;pld [src, ppl]
+ ;;pld [src, r9]
+ ;;ENDIF
+
+ subs r2, r2, #0x10000
+
+ mov r6, #158
+ sub sp, sp, r6
+
+ add r0, r0, r1 ; move to next input line
+
+ bne first_pass_hloop_v6
+
+;second pass filter
+secondpass_filter
+ mov r1, #18
+ sub sp, sp, r1 ; 18+4
+
+ ldr r3, [sp, #-4] ; load back yoffset
+ ldr r0, [sp, #216] ; load dst address from stack 180+36
+ ldr r1, [sp, #220] ; load dst stride from stack 180+40
+
+ cmp r3, #0
+ beq skip_secondpass_filter
+
+ ldr r12, _filter8_coeff_
+ add lr, r12, r3, lsl #4 ;calculate filter location
+
+ mov r2, #0x00080000
+
+ ldr r3, [lr] ; load up packed filter coefficients
+ ldr r4, [lr, #4]
+ ldr r5, [lr, #8]
+
+ pkhbt r12, r4, r3 ; pack the filter differently
+ pkhbt r11, r5, r4
+
+second_pass_hloop_v6
+ ldr r6, [sp] ; load the data
+ ldr r7, [sp, #4]
+
+ orr r2, r2, #2 ; loop counter
+
+second_pass_wloop_v6
+ smuad lr, r3, r6 ; apply filter
+ smulbt r10, r3, r6
+
+ ldr r8, [sp, #8]
+
+ smlad lr, r4, r7, lr
+ smladx r10, r12, r7, r10
+
+ ldrh r9, [sp, #12]
+
+ smlad lr, r5, r8, lr
+ smladx r10, r11, r8, r10
+
+ add sp, sp, #4
+ smlatb r10, r5, r9, r10
+
+ sub r2, r2, #1
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ tst r2, #0xff
+ usat lr, #8, lr, asr #7
+ add r10, r10, #0x40
+ strb lr, [r0], r1 ; the result is transposed back and stored
+ usat r10, #8, r10, asr #7
+
+ strb r10, [r0],r1
+
+ movne r6, r7
+ movne r7, r8
+
+ bne second_pass_wloop_v6
+
+ subs r2, r2, #0x10000
+    add         sp, sp, #12                 ; update src for next loop (20-8)
+ sub r0, r0, r1, lsl #2
+ add r0, r0, #1
+
+ bne second_pass_hloop_v6
+
+ add sp, sp, #20
+ ldmia sp!, {r4 - r11, pc}
+
+;--------------------
+skip_firstpass_filter
+ sub r0, r0, r1, lsl #1
+ sub r1, r1, #8
+ mov r2, #9
+ mov r3, #20
+
+skip_firstpass_hloop
+ ldrb r4, [r0], #1 ; load data
+ subs r2, r2, #1
+ ldrb r5, [r0], #1
+ strh r4, [sp], r3 ; store it to immediate buffer
+ ldrb r6, [r0], #1 ; load data
+ strh r5, [sp], r3
+ ldrb r7, [r0], #1
+ strh r6, [sp], r3
+ ldrb r8, [r0], #1
+ strh r7, [sp], r3
+ ldrb r9, [r0], #1
+ strh r8, [sp], r3
+ ldrb r10, [r0], #1
+ strh r9, [sp], r3
+ ldrb r11, [r0], #1
+ strh r10, [sp], r3
+ add r0, r0, r1 ; move to next input line
+ strh r11, [sp], r3
+
+ mov r4, #158
+ sub sp, sp, r4 ; move over to next column
+ bne skip_firstpass_hloop
+
+ b secondpass_filter
+
+;--------------------
+skip_secondpass_filter
+ mov r2, #8
+ add sp, sp, #4 ;start from src[0] instead of src[-2]
+
+skip_secondpass_hloop
+ ldr r6, [sp], #4
+ subs r2, r2, #1
+ ldr r8, [sp], #4
+
+ mov r7, r6, lsr #16 ; unpack
+ strb r6, [r0], r1
+ mov r9, r8, lsr #16
+ strb r7, [r0], r1
+ add sp, sp, #12 ; 20-8
+ strb r8, [r0], r1
+ strb r9, [r0], r1
+
+ sub r0, r0, r1, lsl #2
+ add r0, r0, #1
+
+ bne skip_secondpass_hloop
+
+ add sp, sp, #16 ; 180 - (160 +4)
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;-----------------
+ AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default
+;A data section named subpelfilters8_dat is defined. The DCD directives reserve one word of memory per entry,
+;packing the 6-tap filter coefficients for the 8 sub-pel positions. The label filter8_coeff can be used to
+;access the data. Data addresses: filter8_coeff, filter8_coeff+4, filter8_coeff+8, ...
+_filter8_coeff_
+ DCD filter8_coeff
+filter8_coeff
+ DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000
+ DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000
+ DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000
+ DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000
+ DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000
+ DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000
+ DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000
+ DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000
+
+ ;DCD 0, 0, 128, 0, 0, 0
+ ;DCD 0, -6, 123, 12, -1, 0
+ ;DCD 2, -11, 108, 36, -8, 1
+ ;DCD 0, -9, 93, 50, -6, 0
+ ;DCD 3, -16, 77, 77, -16, 3
+ ;DCD 0, -6, 50, 93, -9, 0
+ ;DCD 1, -8, 36, 108, -11, 2
+ ;DCD 0, -1, 12, 123, -6, 0
+
+ END