summaryrefslogtreecommitdiff
path: root/vp8/common
diff options
context:
space:
mode:
authorJohann <johannkoenig@google.com>2016-07-29 12:31:40 -0700
committerJohann <johannkoenig@google.com>2016-08-04 12:55:06 -0700
commitd55724fae9cb27e070add7952394fc0427ef2061 (patch)
tree9adfc4144df76b6192e7b2ad9c59d58d58dd0804 /vp8/common
parent476e8fc8558592f5535ec2bcdfc6798d35f65f12 (diff)
downloadlibvpx-d55724fae9cb27e070add7952394fc0427ef2061.tar
libvpx-d55724fae9cb27e070add7952394fc0427ef2061.tar.gz
libvpx-d55724fae9cb27e070add7952394fc0427ef2061.tar.bz2
libvpx-d55724fae9cb27e070add7952394fc0427ef2061.zip
Remove armv6 target
Change-Id: I1fa81cc9cabf362a185fc3a53f1e58de533a41e5
Diffstat (limited to 'vp8/common')
-rw-r--r--vp8/common/arm/armv6/bilinearfilter_v6.asm237
-rw-r--r--vp8/common/arm/armv6/copymem16x16_v6.asm186
-rw-r--r--vp8/common/arm/armv6/copymem8x4_v6.asm128
-rw-r--r--vp8/common/arm/armv6/copymem8x8_v6.asm128
-rw-r--r--vp8/common/arm/armv6/dc_only_idct_add_v6.asm70
-rw-r--r--vp8/common/arm/armv6/dequant_idct_v6.asm190
-rw-r--r--vp8/common/arm/armv6/dequantize_v6.asm69
-rw-r--r--vp8/common/arm/armv6/filter_v6.asm624
-rw-r--r--vp8/common/arm/armv6/idct_blk_v6.c100
-rw-r--r--vp8/common/arm/armv6/idct_v6.asm202
-rw-r--r--vp8/common/arm/armv6/iwalsh_v6.asm136
-rw-r--r--vp8/common/arm/armv6/loopfilter_v6.asm1282
-rw-r--r--vp8/common/arm/armv6/simpleloopfilter_v6.asm286
-rw-r--r--vp8/common/arm/armv6/sixtappredict8x4_v6.asm273
-rw-r--r--vp8/common/arm/bilinearfilter_arm.c87
-rw-r--r--vp8/common/arm/bilinearfilter_arm.h31
-rw-r--r--vp8/common/arm/dequantize_arm.c23
-rw-r--r--vp8/common/arm/filter_arm.c176
-rw-r--r--vp8/common/arm/loopfilter_arm.c107
-rw-r--r--vp8/common/rtcd_defs.pl91
20 files changed, 33 insertions, 4393 deletions
diff --git a/vp8/common/arm/armv6/bilinearfilter_v6.asm b/vp8/common/arm/armv6/bilinearfilter_v6.asm
deleted file mode 100644
index 9704b4210..000000000
--- a/vp8/common/arm/armv6/bilinearfilter_v6.asm
+++ /dev/null
@@ -1,237 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_filter_block2d_bil_first_pass_armv6|
- EXPORT |vp8_filter_block2d_bil_second_pass_armv6|
-
- AREA |.text|, CODE, READONLY ; name this block of code
-
-;-------------------------------------
-; r0 unsigned char *src_ptr,
-; r1 unsigned short *dst_ptr,
-; r2 unsigned int src_pitch,
-; r3 unsigned int height,
-; stack unsigned int width,
-; stack const short *vp8_filter
-;-------------------------------------
-; The output is stored transposed in the output array to make second-pass filtering easy.
-|vp8_filter_block2d_bil_first_pass_armv6| PROC
- stmdb sp!, {r4 - r11, lr}
-
- ldr r11, [sp, #40] ; vp8_filter address
- ldr r4, [sp, #36] ; width
-
- mov r12, r3 ; outer-loop counter
-
- add r7, r2, r4 ; preload next row
- pld [r0, r7]
-
- sub r2, r2, r4 ; src increment for height loop
-
- ldr r5, [r11] ; load up filter coefficients
-
- mov r3, r3, lsl #1 ; height*2
- add r3, r3, #2 ; plus 2 to make output buffer 4-byte aligned since height is actually (height+1)
-
- mov r11, r1 ; save dst_ptr for each row
-
- cmp r5, #128 ; if filter coef = 128, then skip the filter
- beq bil_null_1st_filter
-
-|bil_height_loop_1st_v6|
- ldrb r6, [r0] ; load source data
- ldrb r7, [r0, #1]
- ldrb r8, [r0, #2]
- mov lr, r4, lsr #2 ; 4-in-parallel loop counter
-
-|bil_width_loop_1st_v6|
- ldrb r9, [r0, #3]
- ldrb r10, [r0, #4]
-
- pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
- pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
-
- smuad r6, r6, r5 ; apply the filter
- pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
- smuad r7, r7, r5
- pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
-
- smuad r8, r8, r5
- smuad r9, r9, r5
-
- add r0, r0, #4
- subs lr, lr, #1
-
- add r6, r6, #0x40 ; round_shift_and_clamp
- add r7, r7, #0x40
- usat r6, #16, r6, asr #7
- usat r7, #16, r7, asr #7
-
- strh r6, [r1], r3 ; result is transposed and stored
-
- add r8, r8, #0x40 ; round_shift_and_clamp
- strh r7, [r1], r3
- add r9, r9, #0x40
- usat r8, #16, r8, asr #7
- usat r9, #16, r9, asr #7
-
- strh r8, [r1], r3 ; result is transposed and stored
-
- ldrneb r6, [r0] ; load source data
- strh r9, [r1], r3
-
- ldrneb r7, [r0, #1]
- ldrneb r8, [r0, #2]
-
- bne bil_width_loop_1st_v6
-
- add r0, r0, r2 ; move to next input row
- subs r12, r12, #1
-
- add r9, r2, r4, lsl #1 ; adding back block width
- pld [r0, r9] ; preload next row
-
- add r11, r11, #2 ; move over to next column
- mov r1, r11
-
- bne bil_height_loop_1st_v6
-
- ldmia sp!, {r4 - r11, pc}
-
-|bil_null_1st_filter|
-|bil_height_loop_null_1st|
- mov lr, r4, lsr #2 ; loop counter
-
-|bil_width_loop_null_1st|
- ldrb r6, [r0] ; load data
- ldrb r7, [r0, #1]
- ldrb r8, [r0, #2]
- ldrb r9, [r0, #3]
-
- strh r6, [r1], r3 ; store it to intermediate buffer
- add r0, r0, #4
- strh r7, [r1], r3
- subs lr, lr, #1
- strh r8, [r1], r3
- strh r9, [r1], r3
-
- bne bil_width_loop_null_1st
-
- subs r12, r12, #1
- add r0, r0, r2 ; move to next input line
- add r11, r11, #2 ; move over to next column
- mov r1, r11
-
- bne bil_height_loop_null_1st
-
- ldmia sp!, {r4 - r11, pc}
-
- ENDP ; |vp8_filter_block2d_bil_first_pass_armv6|
-
-
-;---------------------------------
-; r0 unsigned short *src_ptr,
-; r1 unsigned char *dst_ptr,
-; r2 int dst_pitch,
-; r3 unsigned int height,
-; stack unsigned int width,
-; stack const short *vp8_filter
-;---------------------------------
-|vp8_filter_block2d_bil_second_pass_armv6| PROC
- stmdb sp!, {r4 - r11, lr}
-
- ldr r11, [sp, #40] ; vp8_filter address
- ldr r4, [sp, #36] ; width
-
- ldr r5, [r11] ; load up filter coefficients
- mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
- mov r11, r1
-
- cmp r5, #128 ; if filter coef = 128, then skip the filter
- beq bil_null_2nd_filter
-
-|bil_height_loop_2nd|
- ldr r6, [r0] ; load the data
- ldr r8, [r0, #4]
- ldrh r10, [r0, #8]
- mov lr, r3, lsr #2 ; loop counter
-
-|bil_width_loop_2nd|
- pkhtb r7, r6, r8 ; src[1] | src[2]
- pkhtb r9, r8, r10 ; src[3] | src[4]
-
- smuad r6, r6, r5 ; apply filter
- smuad r8, r8, r5 ; apply filter
-
- subs lr, lr, #1
-
- smuadx r7, r7, r5 ; apply filter
- smuadx r9, r9, r5 ; apply filter
-
- add r0, r0, #8
-
- add r6, r6, #0x40 ; round_shift_and_clamp
- add r7, r7, #0x40
- usat r6, #8, r6, asr #7
- usat r7, #8, r7, asr #7
- strb r6, [r1], r2 ; the result is transposed back and stored
-
- add r8, r8, #0x40 ; round_shift_and_clamp
- strb r7, [r1], r2
- add r9, r9, #0x40
- usat r8, #8, r8, asr #7
- usat r9, #8, r9, asr #7
- strb r8, [r1], r2 ; the result is transposed back and stored
-
- ldrne r6, [r0] ; load data
- strb r9, [r1], r2
- ldrne r8, [r0, #4]
- ldrneh r10, [r0, #8]
-
- bne bil_width_loop_2nd
-
- subs r12, r12, #1
- add r0, r0, #4 ; update src for next row
- add r11, r11, #1
- mov r1, r11
-
- bne bil_height_loop_2nd
- ldmia sp!, {r4 - r11, pc}
-
-|bil_null_2nd_filter|
-|bil_height_loop_null_2nd|
- mov lr, r3, lsr #2
-
-|bil_width_loop_null_2nd|
- ldr r6, [r0], #4 ; load data
- subs lr, lr, #1
- ldr r8, [r0], #4
-
- strb r6, [r1], r2 ; store data
- mov r7, r6, lsr #16
- strb r7, [r1], r2
- mov r9, r8, lsr #16
- strb r8, [r1], r2
- strb r9, [r1], r2
-
- bne bil_width_loop_null_2nd
-
- subs r12, r12, #1
- add r0, r0, #4
- add r11, r11, #1
- mov r1, r11
-
- bne bil_height_loop_null_2nd
-
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp8_filter_block2d_bil_second_pass_armv6|
-
- END
diff --git a/vp8/common/arm/armv6/copymem16x16_v6.asm b/vp8/common/arm/armv6/copymem16x16_v6.asm
deleted file mode 100644
index abf048c2f..000000000
--- a/vp8/common/arm/armv6/copymem16x16_v6.asm
+++ /dev/null
@@ -1,186 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_copy_mem16x16_v6|
- ; ARM
- ; REQUIRE8
- ; PRESERVE8
-
- AREA Block, CODE, READONLY ; name this block of code
-;void vp8_copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_copy_mem16x16_v6| PROC
- stmdb sp!, {r4 - r7}
- ;push {r4-r7}
-
- ;preload
- pld [r0, #31] ; preload for next 16x16 block
-
- ands r4, r0, #15
- beq copy_mem16x16_fast
-
- ands r4, r0, #7
- beq copy_mem16x16_8
-
- ands r4, r0, #3
- beq copy_mem16x16_4
-
- ;copy one byte each time
- ldrb r4, [r0]
- ldrb r5, [r0, #1]
- ldrb r6, [r0, #2]
- ldrb r7, [r0, #3]
-
- mov r12, #16
-
-copy_mem16x16_1_loop
- strb r4, [r2]
- strb r5, [r2, #1]
- strb r6, [r2, #2]
- strb r7, [r2, #3]
-
- ldrb r4, [r0, #4]
- ldrb r5, [r0, #5]
- ldrb r6, [r0, #6]
- ldrb r7, [r0, #7]
-
- subs r12, r12, #1
-
- strb r4, [r2, #4]
- strb r5, [r2, #5]
- strb r6, [r2, #6]
- strb r7, [r2, #7]
-
- ldrb r4, [r0, #8]
- ldrb r5, [r0, #9]
- ldrb r6, [r0, #10]
- ldrb r7, [r0, #11]
-
- strb r4, [r2, #8]
- strb r5, [r2, #9]
- strb r6, [r2, #10]
- strb r7, [r2, #11]
-
- ldrb r4, [r0, #12]
- ldrb r5, [r0, #13]
- ldrb r6, [r0, #14]
- ldrb r7, [r0, #15]
-
- add r0, r0, r1
-
- strb r4, [r2, #12]
- strb r5, [r2, #13]
- strb r6, [r2, #14]
- strb r7, [r2, #15]
-
- add r2, r2, r3
-
- ldrneb r4, [r0]
- ldrneb r5, [r0, #1]
- ldrneb r6, [r0, #2]
- ldrneb r7, [r0, #3]
-
- pld [r0, #31] ; preload for next 16x16 block
-
- bne copy_mem16x16_1_loop
-
- ldmia sp!, {r4 - r7}
- ;pop {r4-r7}
- mov pc, lr
-
-;copy 4 bytes each time
-copy_mem16x16_4
- ldr r4, [r0]
- ldr r5, [r0, #4]
- ldr r6, [r0, #8]
- ldr r7, [r0, #12]
-
- mov r12, #16
-
-copy_mem16x16_4_loop
- subs r12, r12, #1
- add r0, r0, r1
-
- str r4, [r2]
- str r5, [r2, #4]
- str r6, [r2, #8]
- str r7, [r2, #12]
-
- add r2, r2, r3
-
- ldrne r4, [r0]
- ldrne r5, [r0, #4]
- ldrne r6, [r0, #8]
- ldrne r7, [r0, #12]
-
- pld [r0, #31] ; preload for next 16x16 block
-
- bne copy_mem16x16_4_loop
-
- ldmia sp!, {r4 - r7}
- ;pop {r4-r7}
- mov pc, lr
-
-;copy 8 bytes each time
-copy_mem16x16_8
- sub r1, r1, #16
- sub r3, r3, #16
-
- mov r12, #16
-
-copy_mem16x16_8_loop
- ldmia r0!, {r4-r5}
- ;ldm r0, {r4-r5}
- ldmia r0!, {r6-r7}
-
- add r0, r0, r1
-
- stmia r2!, {r4-r5}
- subs r12, r12, #1
- ;stm r2, {r4-r5}
- stmia r2!, {r6-r7}
-
- add r2, r2, r3
-
- pld [r0, #31] ; preload for next 16x16 block
- bne copy_mem16x16_8_loop
-
- ldmia sp!, {r4 - r7}
- ;pop {r4-r7}
- mov pc, lr
-
-;copy 16 bytes each time
-copy_mem16x16_fast
- ;sub r1, r1, #16
- ;sub r3, r3, #16
-
- mov r12, #16
-
-copy_mem16x16_fast_loop
- ldmia r0, {r4-r7}
- ;ldm r0, {r4-r7}
- add r0, r0, r1
-
- subs r12, r12, #1
- stmia r2, {r4-r7}
- ;stm r2, {r4-r7}
- add r2, r2, r3
-
- pld [r0, #31] ; preload for next 16x16 block
- bne copy_mem16x16_fast_loop
-
- ldmia sp!, {r4 - r7}
- ;pop {r4-r7}
- mov pc, lr
-
- ENDP ; |vp8_copy_mem16x16_v6|
-
- END
diff --git a/vp8/common/arm/armv6/copymem8x4_v6.asm b/vp8/common/arm/armv6/copymem8x4_v6.asm
deleted file mode 100644
index d8362ef05..000000000
--- a/vp8/common/arm/armv6/copymem8x4_v6.asm
+++ /dev/null
@@ -1,128 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_copy_mem8x4_v6|
- ; ARM
- ; REQUIRE8
- ; PRESERVE8
-
- AREA Block, CODE, READONLY ; name this block of code
-;void vp8_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_copy_mem8x4_v6| PROC
- ;push {r4-r5}
- stmdb sp!, {r4-r5}
-
- ;preload
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- ands r4, r0, #7
- beq copy_mem8x4_fast
-
- ands r4, r0, #3
- beq copy_mem8x4_4
-
- ;copy 1 byte each time
- ldrb r4, [r0]
- ldrb r5, [r0, #1]
-
- mov r12, #4
-
-copy_mem8x4_1_loop
- strb r4, [r2]
- strb r5, [r2, #1]
-
- ldrb r4, [r0, #2]
- ldrb r5, [r0, #3]
-
- subs r12, r12, #1
-
- strb r4, [r2, #2]
- strb r5, [r2, #3]
-
- ldrb r4, [r0, #4]
- ldrb r5, [r0, #5]
-
- strb r4, [r2, #4]
- strb r5, [r2, #5]
-
- ldrb r4, [r0, #6]
- ldrb r5, [r0, #7]
-
- add r0, r0, r1
-
- strb r4, [r2, #6]
- strb r5, [r2, #7]
-
- add r2, r2, r3
-
- ldrneb r4, [r0]
- ldrneb r5, [r0, #1]
-
- bne copy_mem8x4_1_loop
-
- ldmia sp!, {r4 - r5}
- ;pop {r4-r5}
- mov pc, lr
-
-;copy 4 bytes each time
-copy_mem8x4_4
- ldr r4, [r0]
- ldr r5, [r0, #4]
-
- mov r12, #4
-
-copy_mem8x4_4_loop
- subs r12, r12, #1
- add r0, r0, r1
-
- str r4, [r2]
- str r5, [r2, #4]
-
- add r2, r2, r3
-
- ldrne r4, [r0]
- ldrne r5, [r0, #4]
-
- bne copy_mem8x4_4_loop
-
- ldmia sp!, {r4-r5}
- ;pop {r4-r5}
- mov pc, lr
-
-;copy 8 bytes each time
-copy_mem8x4_fast
- ;sub r1, r1, #8
- ;sub r3, r3, #8
-
- mov r12, #4
-
-copy_mem8x4_fast_loop
- ldmia r0, {r4-r5}
- ;ldm r0, {r4-r5}
- add r0, r0, r1
-
- subs r12, r12, #1
- stmia r2, {r4-r5}
- ;stm r2, {r4-r5}
- add r2, r2, r3
-
- bne copy_mem8x4_fast_loop
-
- ldmia sp!, {r4-r5}
- ;pop {r4-r5}
- mov pc, lr
-
- ENDP ; |vp8_copy_mem8x4_v6|
-
- END
diff --git a/vp8/common/arm/armv6/copymem8x8_v6.asm b/vp8/common/arm/armv6/copymem8x8_v6.asm
deleted file mode 100644
index c6a60c610..000000000
--- a/vp8/common/arm/armv6/copymem8x8_v6.asm
+++ /dev/null
@@ -1,128 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_copy_mem8x8_v6|
- ; ARM
- ; REQUIRE8
- ; PRESERVE8
-
- AREA Block, CODE, READONLY ; name this block of code
-;void vp8_copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_copy_mem8x8_v6| PROC
- ;push {r4-r5}
- stmdb sp!, {r4-r5}
-
- ;preload
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- ands r4, r0, #7
- beq copy_mem8x8_fast
-
- ands r4, r0, #3
- beq copy_mem8x8_4
-
- ;copy 1 byte each time
- ldrb r4, [r0]
- ldrb r5, [r0, #1]
-
- mov r12, #8
-
-copy_mem8x8_1_loop
- strb r4, [r2]
- strb r5, [r2, #1]
-
- ldrb r4, [r0, #2]
- ldrb r5, [r0, #3]
-
- subs r12, r12, #1
-
- strb r4, [r2, #2]
- strb r5, [r2, #3]
-
- ldrb r4, [r0, #4]
- ldrb r5, [r0, #5]
-
- strb r4, [r2, #4]
- strb r5, [r2, #5]
-
- ldrb r4, [r0, #6]
- ldrb r5, [r0, #7]
-
- add r0, r0, r1
-
- strb r4, [r2, #6]
- strb r5, [r2, #7]
-
- add r2, r2, r3
-
- ldrneb r4, [r0]
- ldrneb r5, [r0, #1]
-
- bne copy_mem8x8_1_loop
-
- ldmia sp!, {r4 - r5}
- ;pop {r4-r5}
- mov pc, lr
-
-;copy 4 bytes each time
-copy_mem8x8_4
- ldr r4, [r0]
- ldr r5, [r0, #4]
-
- mov r12, #8
-
-copy_mem8x8_4_loop
- subs r12, r12, #1
- add r0, r0, r1
-
- str r4, [r2]
- str r5, [r2, #4]
-
- add r2, r2, r3
-
- ldrne r4, [r0]
- ldrne r5, [r0, #4]
-
- bne copy_mem8x8_4_loop
-
- ldmia sp!, {r4 - r5}
- ;pop {r4-r5}
- mov pc, lr
-
-;copy 8 bytes each time
-copy_mem8x8_fast
- ;sub r1, r1, #8
- ;sub r3, r3, #8
-
- mov r12, #8
-
-copy_mem8x8_fast_loop
- ldmia r0, {r4-r5}
- ;ldm r0, {r4-r5}
- add r0, r0, r1
-
- subs r12, r12, #1
- stmia r2, {r4-r5}
- ;stm r2, {r4-r5}
- add r2, r2, r3
-
- bne copy_mem8x8_fast_loop
-
- ldmia sp!, {r4-r5}
- ;pop {r4-r5}
- mov pc, lr
-
- ENDP ; |vp8_copy_mem8x8_v6|
-
- END
diff --git a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm b/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
deleted file mode 100644
index 9aa659fa7..000000000
--- a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
+++ /dev/null
@@ -1,70 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
- EXPORT |vp8_dc_only_idct_add_v6|
-
- AREA |.text|, CODE, READONLY
-
-;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
-; int pred_stride, unsigned char *dst_ptr,
-; int dst_stride)
-; r0 input_dc
-; r1 pred_ptr
-; r2 pred_stride
-; r3 dst_ptr
-; sp dst_stride
-
-|vp8_dc_only_idct_add_v6| PROC
- stmdb sp!, {r4 - r7}
-
- add r0, r0, #4 ; input_dc += 4
- ldr r12, c0x0000FFFF
- ldr r4, [r1], r2
- and r0, r12, r0, asr #3 ; input_dc >> 3 + mask
- ldr r6, [r1], r2
- orr r0, r0, r0, lsl #16 ; a1 | a1
-
- ldr r12, [sp, #16] ; dst stride
-
- uxtab16 r5, r0, r4 ; a1+2 | a1+0
- uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1
- uxtab16 r7, r0, r6
- uxtab16 r6, r0, r6, ror #8
- usat16 r5, #8, r5
- usat16 r4, #8, r4
- usat16 r7, #8, r7
- usat16 r6, #8, r6
- orr r5, r5, r4, lsl #8
- orr r7, r7, r6, lsl #8
- ldr r4, [r1], r2
- str r5, [r3], r12
- ldr r6, [r1]
- str r7, [r3], r12
-
- uxtab16 r5, r0, r4
- uxtab16 r4, r0, r4, ror #8
- uxtab16 r7, r0, r6
- uxtab16 r6, r0, r6, ror #8
- usat16 r5, #8, r5
- usat16 r4, #8, r4
- usat16 r7, #8, r7
- usat16 r6, #8, r6
- orr r5, r5, r4, lsl #8
- orr r7, r7, r6, lsl #8
- str r5, [r3], r12
- str r7, [r3]
-
- ldmia sp!, {r4 - r7}
- bx lr
-
- ENDP ; |vp8_dc_only_idct_add_v6|
-
-; Constant Pool
-c0x0000FFFF DCD 0x0000FFFF
- END
diff --git a/vp8/common/arm/armv6/dequant_idct_v6.asm b/vp8/common/arm/armv6/dequant_idct_v6.asm
deleted file mode 100644
index db48ded58..000000000
--- a/vp8/common/arm/armv6/dequant_idct_v6.asm
+++ /dev/null
@@ -1,190 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
- EXPORT |vp8_dequant_idct_add_v6|
-
- AREA |.text|, CODE, READONLY
-;void vp8_dequant_idct_add_v6(short *input, short *dq,
-; unsigned char *dest, int stride)
-; r0 = q
-; r1 = dq
-; r2 = dst
-; r3 = stride
-
-|vp8_dequant_idct_add_v6| PROC
- stmdb sp!, {r4-r11, lr}
-
- ldr r4, [r0] ;input
- ldr r5, [r1], #4 ;dq
-
- sub sp, sp, #4
- str r3, [sp]
-
- mov r12, #4
-
-vp8_dequant_add_loop
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- ldr r4, [r0, #4] ;input
- ldr r5, [r1], #4 ;dq
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- subs r12, r12, #1
-
- ldrne r4, [r0, #4]
- ldrne r5, [r1], #4
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- bne vp8_dequant_add_loop
-
- sub r0, r0, #32
- mov r1, r0
-
-; short_idct4x4llm_v6_dual
- ldr r3, cospi8sqrt2minus1
- ldr r4, sinpi8sqrt2
- ldr r6, [r0, #8]
- mov r5, #2
-vp8_dequant_idct_loop1_v6
- ldr r12, [r0, #24]
- ldr r14, [r0, #16]
- smulwt r9, r3, r6
- smulwb r7, r3, r6
- smulwt r10, r4, r6
- smulwb r8, r4, r6
- pkhbt r7, r7, r9, lsl #16
- smulwt r11, r3, r12
- pkhbt r8, r8, r10, lsl #16
- uadd16 r6, r6, r7
- smulwt r7, r4, r12
- smulwb r9, r3, r12
- smulwb r10, r4, r12
- subs r5, r5, #1
- pkhbt r9, r9, r11, lsl #16
- ldr r11, [r0], #4
- pkhbt r10, r10, r7, lsl #16
- uadd16 r7, r12, r9
- usub16 r7, r8, r7
- uadd16 r6, r6, r10
- uadd16 r10, r11, r14
- usub16 r8, r11, r14
- uadd16 r9, r10, r6
- usub16 r10, r10, r6
- uadd16 r6, r8, r7
- usub16 r7, r8, r7
- str r6, [r1, #8]
- ldrne r6, [r0, #8]
- str r7, [r1, #16]
- str r10, [r1, #24]
- str r9, [r1], #4
- bne vp8_dequant_idct_loop1_v6
-
- mov r5, #2
- sub r0, r1, #8
-vp8_dequant_idct_loop2_v6
- ldr r6, [r0], #4
- ldr r7, [r0], #4
- ldr r8, [r0], #4
- ldr r9, [r0], #4
- smulwt r1, r3, r6
- smulwt r12, r4, r6
- smulwt lr, r3, r8
- smulwt r10, r4, r8
- pkhbt r11, r8, r6, lsl #16
- pkhbt r1, lr, r1, lsl #16
- pkhbt r12, r10, r12, lsl #16
- pkhtb r6, r6, r8, asr #16
- uadd16 r6, r1, r6
- pkhbt lr, r9, r7, lsl #16
- uadd16 r10, r11, lr
- usub16 lr, r11, lr
- pkhtb r8, r7, r9, asr #16
- subs r5, r5, #1
- smulwt r1, r3, r8
- smulwb r7, r3, r8
- smulwt r11, r4, r8
- smulwb r9, r4, r8
- pkhbt r1, r7, r1, lsl #16
- uadd16 r8, r1, r8
- pkhbt r11, r9, r11, lsl #16
- usub16 r1, r12, r8
- uadd16 r8, r11, r6
- ldr r9, c0x00040004
- ldr r12, [sp] ; get stride from stack
- uadd16 r6, r10, r8
- usub16 r7, r10, r8
- uadd16 r7, r7, r9
- uadd16 r6, r6, r9
- uadd16 r10, r14, r1
- usub16 r1, r14, r1
- uadd16 r10, r10, r9
- uadd16 r1, r1, r9
- ldr r11, [r2] ; load input from dst
- mov r8, r7, asr #3
- pkhtb r9, r8, r10, asr #19
- mov r8, r1, asr #3
- pkhtb r8, r8, r6, asr #19
- uxtb16 lr, r11, ror #8
- qadd16 r9, r9, lr
- uxtb16 lr, r11
- qadd16 r8, r8, lr
- usat16 r9, #8, r9
- usat16 r8, #8, r8
- orr r9, r8, r9, lsl #8
- ldr r11, [r2, r12] ; load input from dst
- mov r7, r7, lsl #16
- mov r1, r1, lsl #16
- mov r10, r10, lsl #16
- mov r6, r6, lsl #16
- mov r7, r7, asr #3
- pkhtb r7, r7, r10, asr #19
- mov r1, r1, asr #3
- pkhtb r1, r1, r6, asr #19
- uxtb16 r8, r11, ror #8
- qadd16 r7, r7, r8
- uxtb16 r8, r11
- qadd16 r1, r1, r8
- usat16 r7, #8, r7
- usat16 r1, #8, r1
- orr r1, r1, r7, lsl #8
- str r9, [r2], r12 ; store output to dst
- str r1, [r2], r12 ; store output to dst
- bne vp8_dequant_idct_loop2_v6
-
-; memset
- sub r0, r0, #32
- add sp, sp, #4
-
- mov r12, #0
- str r12, [r0]
- str r12, [r0, #4]
- str r12, [r0, #8]
- str r12, [r0, #12]
- str r12, [r0, #16]
- str r12, [r0, #20]
- str r12, [r0, #24]
- str r12, [r0, #28]
-
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp8_dequant_idct_add_v6|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x00004E7B
-sinpi8sqrt2 DCD 0x00008A8C
-c0x00040004 DCD 0x00040004
-
- END
diff --git a/vp8/common/arm/armv6/dequantize_v6.asm b/vp8/common/arm/armv6/dequantize_v6.asm
deleted file mode 100644
index 72f7e0ee5..000000000
--- a/vp8/common/arm/armv6/dequantize_v6.asm
+++ /dev/null
@@ -1,69 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_dequantize_b_loop_v6|
-
- AREA |.text|, CODE, READONLY ; name this block of code
-;-------------------------------
-;void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
-; r0 short *Q,
-; r1 short *DQC
-; r2 short *DQ
-|vp8_dequantize_b_loop_v6| PROC
- stmdb sp!, {r4-r9, lr}
-
- ldr r3, [r0] ;load Q
- ldr r4, [r1] ;load DQC
- ldr r5, [r0, #4]
- ldr r6, [r1, #4]
-
- mov r12, #2 ;loop counter
-
-dequant_loop
- smulbb r7, r3, r4 ;multiply
- smultt r8, r3, r4
- smulbb r9, r5, r6
- smultt lr, r5, r6
-
- ldr r3, [r0, #8]
- ldr r4, [r1, #8]
- ldr r5, [r0, #12]
- ldr r6, [r1, #12]
-
- strh r7, [r2], #2 ;store result
- smulbb r7, r3, r4 ;multiply
- strh r8, [r2], #2
- smultt r8, r3, r4
- strh r9, [r2], #2
- smulbb r9, r5, r6
- strh lr, [r2], #2
- smultt lr, r5, r6
-
- subs r12, r12, #1
-
- add r0, r0, #16
- add r1, r1, #16
-
- ldrne r3, [r0]
- strh r7, [r2], #2 ;store result
- ldrne r4, [r1]
- strh r8, [r2], #2
- ldrne r5, [r0, #4]
- strh r9, [r2], #2
- ldrne r6, [r1, #4]
- strh lr, [r2], #2
-
- bne dequant_loop
-
- ldmia sp!, {r4-r9, pc}
- ENDP ;|vp8_dequantize_b_loop_v6|
-
- END
diff --git a/vp8/common/arm/armv6/filter_v6.asm b/vp8/common/arm/armv6/filter_v6.asm
deleted file mode 100644
index eb4b75bd8..000000000
--- a/vp8/common/arm/armv6/filter_v6.asm
+++ /dev/null
@@ -1,624 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_filter_block2d_first_pass_armv6|
- EXPORT |vp8_filter_block2d_first_pass_16x16_armv6|
- EXPORT |vp8_filter_block2d_first_pass_8x8_armv6|
- EXPORT |vp8_filter_block2d_second_pass_armv6|
- EXPORT |vp8_filter4_block2d_second_pass_armv6|
- EXPORT |vp8_filter_block2d_first_pass_only_armv6|
- EXPORT |vp8_filter_block2d_second_pass_only_armv6|
-
- AREA |.text|, CODE, READONLY ; name this block of code
-;-------------------------------------
-; r0 unsigned char *src_ptr
-; r1 short *output_ptr
-; r2 unsigned int src_pixels_per_line
-; r3 unsigned int output_width
-; stack unsigned int output_height
-; stack const short *vp8_filter
-;-------------------------------------
-; Filter the input and put the result in the output array. Apply the 6-tap FIR filter with
-; the output being a 2-byte value and the input being a 1-byte value.
-|vp8_filter_block2d_first_pass_armv6| PROC
- stmdb sp!, {r4 - r11, lr}
-
- ldr r11, [sp, #40] ; vp8_filter address
- ldr r7, [sp, #36] ; output height
-
- sub r2, r2, r3 ; inside loop increments input array,
- ; so the height loop only needs to add
- ; r2 - width to the input pointer
-
- mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
- add r12, r3, #16 ; square off the output
- sub sp, sp, #4
-
- ldr r4, [r11] ; load up packed filter coefficients
- ldr r5, [r11, #4]
- ldr r6, [r11, #8]
-
- str r1, [sp] ; push destination to stack
- mov r7, r7, lsl #16 ; height is top part of counter
-
-; six tap filter
-|height_loop_1st_6|
- ldrb r8, [r0, #-2] ; load source data
- ldrb r9, [r0, #-1]
- ldrb r10, [r0], #2
- orr r7, r7, r3, lsr #2 ; construct loop counter
-
-|width_loop_1st_6|
- ldrb r11, [r0, #-1]
-
- pkhbt lr, r8, r9, lsl #16 ; r9 | r8
- pkhbt r8, r9, r10, lsl #16 ; r10 | r9
-
- ldrb r9, [r0]
-
- smuad lr, lr, r4 ; apply the filter
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
- smuad r8, r8, r4
- pkhbt r11, r11, r9, lsl #16 ; r9 | r11
-
- smlad lr, r10, r5, lr
- ldrb r10, [r0, #1]
- smlad r8, r11, r5, r8
- ldrb r11, [r0, #2]
-
- sub r7, r7, #1
-
- pkhbt r9, r9, r10, lsl #16 ; r10 | r9
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
-
- smlad lr, r9, r6, lr
- smlad r11, r10, r6, r8
-
- ands r10, r7, #0xff ; test loop counter
-
- add lr, lr, #0x40 ; round_shift_and_clamp
- ldrneb r8, [r0, #-2] ; load data for next loop
- usat lr, #8, lr, asr #7
- add r11, r11, #0x40
- ldrneb r9, [r0, #-1]
- usat r11, #8, r11, asr #7
-
- strh lr, [r1], r12 ; result is transposed and stored, which
- ; will make second pass filtering easier.
- ldrneb r10, [r0], #2
- strh r11, [r1], r12
-
- bne width_loop_1st_6
-
- ldr r1, [sp] ; load and update dst address
- subs r7, r7, #0x10000
- add r0, r0, r2 ; move to next input line
-
- add r1, r1, #2 ; move over to next column
- str r1, [sp]
-
- bne height_loop_1st_6
-
- add sp, sp, #4
- ldmia sp!, {r4 - r11, pc}
-
- ENDP
-
-; --------------------------
-; 16x16 version
-; -----------------------------
-|vp8_filter_block2d_first_pass_16x16_armv6| PROC
- stmdb sp!, {r4 - r11, lr}
-
- ldr r11, [sp, #40] ; vp8_filter address
- ldr r7, [sp, #36] ; output height
-
- add r4, r2, #18 ; preload next row
- pld [r0, r4]
-
- sub r2, r2, r3 ; inside loop increments input array,
- ; so the height loop only needs to add
- ; r2 - width to the input pointer
-
- mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
- add r12, r3, #16 ; square off the output
- sub sp, sp, #4
-
- ldr r4, [r11] ; load up packed filter coefficients
- ldr r5, [r11, #4]
- ldr r6, [r11, #8]
-
- str r1, [sp] ; push destination to stack
- mov r7, r7, lsl #16 ; height is top part of counter
-
-; six tap filter
-|height_loop_1st_16_6|
- ldrb r8, [r0, #-2] ; load source data
- ldrb r9, [r0, #-1]
- ldrb r10, [r0], #2
- orr r7, r7, r3, lsr #2 ; construct loop counter
-
-|width_loop_1st_16_6|
- ldrb r11, [r0, #-1]
-
- pkhbt lr, r8, r9, lsl #16 ; r9 | r8
- pkhbt r8, r9, r10, lsl #16 ; r10 | r9
-
- ldrb r9, [r0]
-
- smuad lr, lr, r4 ; apply the filter
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
- smuad r8, r8, r4
- pkhbt r11, r11, r9, lsl #16 ; r9 | r11
-
- smlad lr, r10, r5, lr
- ldrb r10, [r0, #1]
- smlad r8, r11, r5, r8
- ldrb r11, [r0, #2]
-
- sub r7, r7, #1
-
- pkhbt r9, r9, r10, lsl #16 ; r10 | r9
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
-
- smlad lr, r9, r6, lr
- smlad r11, r10, r6, r8
-
- ands r10, r7, #0xff ; test loop counter
-
- add lr, lr, #0x40 ; round_shift_and_clamp
- ldrneb r8, [r0, #-2] ; load data for next loop
- usat lr, #8, lr, asr #7
- add r11, r11, #0x40
- ldrneb r9, [r0, #-1]
- usat r11, #8, r11, asr #7
-
- strh lr, [r1], r12 ; result is transposed and stored, which
- ; will make second pass filtering easier.
- ldrneb r10, [r0], #2
- strh r11, [r1], r12
-
- bne width_loop_1st_16_6
-
- ldr r1, [sp] ; load and update dst address
- subs r7, r7, #0x10000
- add r0, r0, r2 ; move to next input line
-
- add r11, r2, #34 ; adding back block width(=16)
- pld [r0, r11] ; preload next row
-
- add r1, r1, #2 ; move over to next column
- str r1, [sp]
-
- bne height_loop_1st_16_6
-
- add sp, sp, #4
- ldmia sp!, {r4 - r11, pc}
-
- ENDP
-
-; --------------------------
-; 8x8 version
-; -----------------------------
-|vp8_filter_block2d_first_pass_8x8_armv6| PROC
- stmdb sp!, {r4 - r11, lr}
-
- ldr r11, [sp, #40] ; vp8_filter address
- ldr r7, [sp, #36] ; output height
-
- add r4, r2, #10 ; preload next row
- pld [r0, r4]
-
- sub r2, r2, r3 ; inside loop increments input array,
- ; so the height loop only needs to add
- ; r2 - width to the input pointer
-
- mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
- add r12, r3, #16 ; square off the output
- sub sp, sp, #4
-
- ldr r4, [r11] ; load up packed filter coefficients
- ldr r5, [r11, #4]
- ldr r6, [r11, #8]
-
- str r1, [sp] ; push destination to stack
- mov r7, r7, lsl #16 ; height is top part of counter
-
-; six tap filter
-|height_loop_1st_8_6|
- ldrb r8, [r0, #-2] ; load source data
- ldrb r9, [r0, #-1]
- ldrb r10, [r0], #2
- orr r7, r7, r3, lsr #2 ; construct loop counter
-
-|width_loop_1st_8_6|
- ldrb r11, [r0, #-1]
-
- pkhbt lr, r8, r9, lsl #16 ; r9 | r8
- pkhbt r8, r9, r10, lsl #16 ; r10 | r9
-
- ldrb r9, [r0]
-
- smuad lr, lr, r4 ; apply the filter
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
- smuad r8, r8, r4
- pkhbt r11, r11, r9, lsl #16 ; r9 | r11
-
- smlad lr, r10, r5, lr
- ldrb r10, [r0, #1]
- smlad r8, r11, r5, r8
- ldrb r11, [r0, #2]
-
- sub r7, r7, #1
-
- pkhbt r9, r9, r10, lsl #16 ; r10 | r9
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
-
- smlad lr, r9, r6, lr
- smlad r11, r10, r6, r8
-
- ands r10, r7, #0xff ; test loop counter
-
- add lr, lr, #0x40 ; round_shift_and_clamp
- ldrneb r8, [r0, #-2] ; load data for next loop
- usat lr, #8, lr, asr #7
- add r11, r11, #0x40
- ldrneb r9, [r0, #-1]
- usat r11, #8, r11, asr #7
-
- strh lr, [r1], r12 ; result is transposed and stored, which
- ; will make second pass filtering easier.
- ldrneb r10, [r0], #2
- strh r11, [r1], r12
-
- bne width_loop_1st_8_6
-
- ldr r1, [sp] ; load and update dst address
- subs r7, r7, #0x10000
- add r0, r0, r2 ; move to next input line
-
- add r11, r2, #18 ; adding back block width(=8)
- pld [r0, r11] ; preload next row
-
- add r1, r1, #2 ; move over to next column
- str r1, [sp]
-
- bne height_loop_1st_8_6
-
- add sp, sp, #4
- ldmia sp!, {r4 - r11, pc}
-
- ENDP
-
-;---------------------------------
-; r0 short *src_ptr,
-; r1 unsigned char *output_ptr,
-; r2 unsigned int output_pitch,
-; r3 unsigned int cnt,
-; stack const short *vp8_filter
-;---------------------------------
-|vp8_filter_block2d_second_pass_armv6| PROC
- stmdb sp!, {r4 - r11, lr} ; 9 registers = 36 bytes pushed
-
- ldr r11, [sp, #36] ; vp8_filter address (the only stack argument)
- sub sp, sp, #4 ; one scratch word for the destination pointer
- mov r7, r3, lsl #16 ; height is top part of counter
- str r1, [sp] ; push destination to stack
-
- ldr r4, [r11] ; load up packed filter coefficients
- ldr r5, [r11, #4]
- ldr r6, [r11, #8]
-
- pkhbt r12, r5, r4 ; pack the filter differently: top of r4 | bottom of r5
- pkhbt r11, r6, r5 ; top of r5 | bottom of r6
-
- sub r0, r0, #4 ; offset input buffer
-
-|height_loop_2nd|
- ldr r8, [r0] ; load the data
- ldr r9, [r0, #4]
- orr r7, r7, r3, lsr #1 ; loop counter: cnt / 2 goes in the low bits
-
-|width_loop_2nd|
- smuad lr, r4, r8 ; apply filter
- sub r7, r7, #1
- smulbt r8, r4, r8
-
- ldr r10, [r0, #8]
-
- smlad lr, r5, r9, lr
- smladx r8, r12, r9, r8
-
- ldrh r9, [r0, #12]
-
- smlad lr, r6, r10, lr
- smladx r8, r11, r10, r8
-
- add r0, r0, #4
- smlatb r10, r6, r9, r8
-
- add lr, lr, #0x40 ; round_shift_and_clamp
- ands r8, r7, #0xff ; test the low byte of the loop counter
- usat lr, #8, lr, asr #7
- add r10, r10, #0x40
- strb lr, [r1], r2 ; the result is transposed back and stored
- usat r10, #8, r10, asr #7
-
- ldrne r8, [r0] ; load data for next loop
- ldrne r9, [r0, #4]
- strb r10, [r1], r2
-
- bne width_loop_2nd
-
- ldr r1, [sp] ; update dst for next loop
- subs r7, r7, #0x10000 ; decrement the counter held in the top half
- add r0, r0, #16 ; update src for next loop
- add r1, r1, #1
- str r1, [sp]
-
- bne height_loop_2nd
-
- add sp, sp, #4 ; release the scratch word
- ldmia sp!, {r4 - r11, pc}
-
- ENDP
-
-;---------------------------------
-; r0 short *src_ptr,
-; r1 unsigned char *output_ptr,
-; r2 unsigned int output_pitch,
-; r3 unsigned int cnt,
-; stack const short *vp8_filter
-;---------------------------------
-|vp8_filter4_block2d_second_pass_armv6| PROC
- stmdb sp!, {r4 - r11, lr} ; 9 registers = 36 bytes pushed
-
- ldr r11, [sp, #36] ; vp8_filter address
- mov r7, r3, lsl #16 ; height is top part of counter
-
- ldr r4, [r11] ; load up packed filter coefficients
- add lr, r1, r3 ; save final destination pointer (output_ptr + cnt)
- ldr r5, [r11, #4]
- ldr r6, [r11, #8]
-
- pkhbt r12, r5, r4 ; pack the filter differently
- pkhbt r11, r6, r5 ; only r12/r11 are used as coefficients below
- mov r4, #0x40 ; rounding factor (for smlad{x})
-
-|height_loop_2nd_4|
- ldrd r8, r9, [r0, #-4] ; load the data
- orr r7, r7, r3, lsr #1 ; loop counter
-
-|width_loop_2nd_4|
- ldr r10, [r0, #4]! ; pre-indexed: src advances by 4 bytes
- smladx r6, r9, r12, r4 ; apply filter
- pkhbt r8, r9, r8
- smlad r5, r8, r12, r4
- pkhbt r8, r10, r9
- smladx r6, r10, r11, r6
- sub r7, r7, #1
- smlad r5, r8, r11, r5
-
- mov r8, r9 ; shift the data for the next loop
- mov r9, r10
-
- usat r6, #8, r6, asr #7 ; shift and clamp
- usat r5, #8, r5, asr #7
-
- strb r5, [r1], r2 ; the result is transposed back and stored
- tst r7, #0xff ; test the low byte of the loop counter
- strb r6, [r1], r2
-
- bne width_loop_2nd_4
-
- subs r7, r7, #0x10000 ; decrement the counter held in the top half
- add r0, r0, #16 ; update src for next loop
- sub r1, lr, r7, lsr #16 ; update dst for next loop (final pointer - remaining count)
-
- bne height_loop_2nd_4
-
- ldmia sp!, {r4 - r11, pc}
-
- ENDP
-
-;------------------------------------
-; r0 unsigned char *src_ptr
-; r1 unsigned char *output_ptr,
-; r2 unsigned int src_pixels_per_line
-; r3 unsigned int cnt,
-; stack unsigned int output_pitch,
-; stack const short *vp8_filter
-;------------------------------------
-|vp8_filter_block2d_first_pass_only_armv6| PROC
- stmdb sp!, {r4 - r11, lr} ; 9 registers = 36 bytes pushed
-
- add r7, r2, r3 ; preload next row
- add r7, r7, #2
- pld [r0, r7]
-
- ldr r4, [sp, #36] ; output pitch
- ldr r11, [sp, #40] ; HFilter address
- sub sp, sp, #8 ; two scratch words for the modified pitches
-
- mov r7, r3 ; outer (height) loop counter = cnt
- sub r2, r2, r3 ; inside loop increments input array,
- ; so the height loop only needs to add
- ; r2 - width to the input pointer
-
- sub r4, r4, r3 ; same adjustment for the output pitch
- str r4, [sp] ; save modified output pitch
- str r2, [sp, #4] ; save modified source pitch
-
- mov r2, #0x40 ; rounding constant, used as the smlad accumulator
-
- ldr r4, [r11] ; load up packed filter coefficients
- ldr r5, [r11, #4]
- ldr r6, [r11, #8]
-
-; six tap filter
-|height_loop_1st_only_6|
- ldrb r8, [r0, #-2] ; load data
- ldrb r9, [r0, #-1]
- ldrb r10, [r0], #2
-
- mov r12, r3, lsr #1 ; loop counter
-
-|width_loop_1st_only_6|
- ldrb r11, [r0, #-1]
-
- pkhbt lr, r8, r9, lsl #16 ; r9 | r8
- pkhbt r8, r9, r10, lsl #16 ; r10 | r9
-
- ldrb r9, [r0]
-
-;; smuad lr, lr, r4
- smlad lr, lr, r4, r2
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
-;; smuad r8, r8, r4
- smlad r8, r8, r4, r2
- pkhbt r11, r11, r9, lsl #16 ; r9 | r11
-
- smlad lr, r10, r5, lr
- ldrb r10, [r0, #1]
- smlad r8, r11, r5, r8
- ldrb r11, [r0, #2]
-
- subs r12, r12, #1
-
- pkhbt r9, r9, r10, lsl #16 ; r10 | r9
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
-
- smlad lr, r9, r6, lr
- smlad r10, r10, r6, r8
-
-;; add lr, lr, #0x40 ; round_shift_and_clamp
- ldrneb r8, [r0, #-2] ; load data for next loop
- usat lr, #8, lr, asr #7
-;; add r10, r10, #0x40
- strb lr, [r1], #1 ; store the result
- usat r10, #8, r10, asr #7
-
- ldrneb r9, [r0, #-1]
- strb r10, [r1], #1
- ldrneb r10, [r0], #2
-
- bne width_loop_1st_only_6
-
- ldr lr, [sp] ; load back modified output pitch
- ldr r12, [sp, #4] ; load back modified source pitch (src_pixels_per_line - cnt)
- subs r7, r7, #1
- add r0, r0, r12 ; update src for next loop
-
- add r11, r12, r3 ; preload next row
- add r11, r11, #2
- pld [r0, r11]
-
- add r1, r1, lr ; update dst for next loop
-
- bne height_loop_1st_only_6
-
- add sp, sp, #8
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp8_filter_block2d_first_pass_only_armv6|
-
-
-;------------------------------------
-; r0 unsigned char *src_ptr,
-; r1 unsigned char *output_ptr,
-; r2 unsigned int src_pixels_per_line
-; r3 unsigned int cnt,
-; stack unsigned int output_pitch,
-; stack const short *vp8_filter
-;------------------------------------
-|vp8_filter_block2d_second_pass_only_armv6| PROC
- stmdb sp!, {r4 - r11, lr} ; 9 registers = 36 bytes pushed
-
- ldr r11, [sp, #40] ; VFilter address
- ldr r12, [sp, #36] ; output pitch
-
- mov r7, r3, lsl #16 ; height is top part of counter
- sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after
-
- sub sp, sp, #8 ; two scratch words: src and dst column pointers
-
- ldr r4, [r11] ; load up packed filter coefficients
- ldr r5, [r11, #4]
- ldr r6, [r11, #8]
-
- str r0, [sp] ; save r0 to stack
- str r1, [sp, #4] ; save dst to stack
-
-; six tap filter
-|width_loop_2nd_only_6|
- ldrb r8, [r0], r2 ; load data
- orr r7, r7, r3 ; loop counter
- ldrb r9, [r0], r2
- ldrb r10, [r0], r2
-
-|height_loop_2nd_only_6|
- ; filter one column in this inner loop, then move to the next column.
- ldrb r11, [r0], r2
-
- pkhbt lr, r8, r9, lsl #16 ; r9 | r8
- pkhbt r8, r9, r10, lsl #16 ; r10 | r9
-
- ldrb r9, [r0], r2
-
- smuad lr, lr, r4
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
- smuad r8, r8, r4
- pkhbt r11, r11, r9, lsl #16 ; r9 | r11
-
- smlad lr, r10, r5, lr
- ldrb r10, [r0], r2
- smlad r8, r11, r5, r8
- ldrb r11, [r0]
-
- sub r7, r7, #2 ; two results are produced per iteration
- sub r0, r0, r2, lsl #2 ; step src back by 4 rows
-
- pkhbt r9, r9, r10, lsl #16 ; r10 | r9
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
-
- smlad lr, r9, r6, lr
- smlad r10, r10, r6, r8
-
- ands r9, r7, #0xff ; test the low byte of the loop counter
-
- add lr, lr, #0x40 ; round_shift_and_clamp
- ldrneb r8, [r0], r2 ; load data for next loop
- usat lr, #8, lr, asr #7
- add r10, r10, #0x40
- strb lr, [r1], r12 ; store the result for the column
- usat r10, #8, r10, asr #7
-
- ldrneb r9, [r0], r2
- strb r10, [r1], r12
- ldrneb r10, [r0], r2
-
- bne height_loop_2nd_only_6
-
- ldr r0, [sp] ; reload the saved src column pointer
- ldr r1, [sp, #4] ; reload the saved dst column pointer
- subs r7, r7, #0x10000 ; decrement the counter held in the top half
- add r0, r0, #1 ; move to filter next column
- str r0, [sp]
- add r1, r1, #1
- str r1, [sp, #4]
-
- bne width_loop_2nd_only_6
-
- add sp, sp, #8
-
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp8_filter_block2d_second_pass_only_armv6|
-
- END
diff --git a/vp8/common/arm/armv6/idct_blk_v6.c b/vp8/common/arm/armv6/idct_blk_v6.c
deleted file mode 100644
index 14a1273e2..000000000
--- a/vp8/common/arm/armv6/idct_blk_v6.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-
-void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, unsigned char *dst,
- int stride, char *eobs) {
- int i; /* row counter: 4 rows, each handling 4 blocks of 16 coefficients */
-
- for (i = 0; i < 4; ++i) {
- if (eobs[0] > 1)
- vp8_dequant_idct_add_v6(q, dq, dst, stride);
- else if (eobs[0] == 1) {
- vp8_dc_only_idct_add_v6(q[0] * dq[0], dst, stride, dst, stride);
- q[0] = q[1] = 0; /* clear first two coefficients; no int-typed store into short[] */
- }
-
- if (eobs[1] > 1)
- vp8_dequant_idct_add_v6(q + 16, dq, dst + 4, stride);
- else if (eobs[1] == 1) {
- vp8_dc_only_idct_add_v6(q[16] * dq[0], dst + 4, stride, dst + 4, stride);
- q[16] = q[17] = 0; /* same clear for the second block */
- }
-
- if (eobs[2] > 1)
- vp8_dequant_idct_add_v6(q + 32, dq, dst + 8, stride);
- else if (eobs[2] == 1) {
- vp8_dc_only_idct_add_v6(q[32] * dq[0], dst + 8, stride, dst + 8, stride);
- q[32] = q[33] = 0; /* same clear for the third block */
- }
-
- if (eobs[3] > 1)
- vp8_dequant_idct_add_v6(q + 48, dq, dst + 12, stride);
- else if (eobs[3] == 1) {
- vp8_dc_only_idct_add_v6(q[48] * dq[0], dst + 12, stride, dst + 12,
- stride);
- q[48] = q[49] = 0; /* same clear for the fourth block */
- }
-
- q += 64; /* 4 blocks * 16 coefficients consumed per row */
- dst += 4 * stride;
- eobs += 4;
- }
-}
-
-void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq, unsigned char *dstu,
- unsigned char *dstv, int stride,
- char *eobs) {
- int i; /* row counter: 2 rows of 2 blocks for dstu, then the same for dstv */
-
- for (i = 0; i < 2; ++i) {
- if (eobs[0] > 1)
- vp8_dequant_idct_add_v6(q, dq, dstu, stride);
- else if (eobs[0] == 1) {
- vp8_dc_only_idct_add_v6(q[0] * dq[0], dstu, stride, dstu, stride);
- q[0] = q[1] = 0; /* clear first two coefficients; no int-typed store into short[] */
- }
-
- if (eobs[1] > 1)
- vp8_dequant_idct_add_v6(q + 16, dq, dstu + 4, stride);
- else if (eobs[1] == 1) {
- vp8_dc_only_idct_add_v6(q[16] * dq[0], dstu + 4, stride, dstu + 4,
- stride);
- q[16] = q[17] = 0; /* same clear for the second block */
- }
-
- q += 32; /* 2 blocks * 16 coefficients consumed per row */
- dstu += 4 * stride;
- eobs += 2;
- }
-
- for (i = 0; i < 2; ++i) {
- if (eobs[0] > 1)
- vp8_dequant_idct_add_v6(q, dq, dstv, stride);
- else if (eobs[0] == 1) {
- vp8_dc_only_idct_add_v6(q[0] * dq[0], dstv, stride, dstv, stride);
- q[0] = q[1] = 0; /* clear first two coefficients; no int-typed store into short[] */
- }
-
- if (eobs[1] > 1)
- vp8_dequant_idct_add_v6(q + 16, dq, dstv + 4, stride);
- else if (eobs[1] == 1) {
- vp8_dc_only_idct_add_v6(q[16] * dq[0], dstv + 4, stride, dstv + 4,
- stride);
- q[16] = q[17] = 0; /* same clear for the second block */
- }
-
- q += 32; /* 2 blocks * 16 coefficients consumed per row */
- dstv += 4 * stride;
- eobs += 2;
- }
-}
diff --git a/vp8/common/arm/armv6/idct_v6.asm b/vp8/common/arm/armv6/idct_v6.asm
deleted file mode 100644
index b4d44cbeb..000000000
--- a/vp8/common/arm/armv6/idct_v6.asm
+++ /dev/null
@@ -1,202 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_short_idct4x4llm_v6_dual|
-
- AREA |.text|, CODE, READONLY
-
-
-; void vp8_short_idct4x4llm_v6_dual(short *input, unsigned char *pred, int pitch,
-; unsigned char *dst, int stride)
-; r0 short* input
-; r1 unsigned char* pred
-; r2 int pitch
-; r3 unsigned char* dst
-; sp int stride
-
-|vp8_short_idct4x4llm_v6_dual| PROC
- stmdb sp!, {r4-r11, lr} ; 9 registers = 36 bytes pushed
-
- sub sp, sp, #4 ; scratch word, later holds the input pointer
-
- mov r4, #0x00008A00 ; sin
- orr r4, r4, #0x0000008C ; sinpi8sqrt2
-
- mov r5, #0x00004E00 ; cos
- orr r5, r5, #0x0000007B ; cospi8sqrt2minus1
- orr r5, r5, #1<<31 ; loop counter on top bit
-
-loop1_dual
- ldr r6, [r0, #(4*2)] ; i5 | i4
- ldr r12, [r0, #(12*2)] ; i13|i12
- ldr r14, [r0, #(8*2)] ; i9 | i8
-
- smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
- smulbb r7, r5, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16
- smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
- smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16
-
- smulbt r11, r5, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16
- pkhtb r7, r9, r7, asr #16 ; 5c | 4c
- pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
- uadd16 r6, r6, r7 ; 5c+5 | 4c+4
-
- smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16
- smulbb r9, r5, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16
- smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16
-
- subs r5, r5, #1<<31 ; i--
-
- pkhtb r9, r11, r9, asr #16 ; 13c | 12c
- ldr r11, [r0] ; i1 | i0
- pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
- uadd16 r7, r12, r9 ; 13c+13 | 12c+12
-
- usub16 r7, r8, r7 ; c
- uadd16 r6, r6, r10 ; d
- uadd16 r10, r11, r14 ; a
- usub16 r8, r11, r14 ; b
-
- uadd16 r9, r10, r6 ; a+d
- usub16 r10, r10, r6 ; a-d
- uadd16 r6, r8, r7 ; b+c
- usub16 r7, r8, r7 ; b-c
-
- ; use input buffer to store intermediate results
- str r6, [r0, #(4*2)] ; o5 | o4
- str r7, [r0, #(8*2)] ; o9 | o8
- str r10,[r0, #(12*2)] ; o13|o12
- str r9, [r0], #4 ; o1 | o0
-
- bcs loop1_dual
-
- sub r0, r0, #8 ; reset input/output
- str r0, [sp] ; keep the input pointer; r0 is reused as data below
-
-loop2_dual
-
- ldr r6, [r0, #(4*2)] ; i5 | i4
- ldr r12,[r0, #(2*2)] ; i3 | i2
- ldr r14,[r0, #(6*2)] ; i7 | i6
- ldr r0, [r0, #(0*2)] ; i1 | i0
-
- smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
- smulbt r7, r5, r0 ; (ip[1] * cospi8sqrt2minus1) >> 16
- smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
- smulwt r8, r4, r0 ; (ip[1] * sinpi8sqrt2) >> 16
-
- pkhbt r11, r6, r0, lsl #16 ; i0 | i4
- pkhtb r7, r7, r9, asr #16 ; 1c | 5c
- pkhtb r0, r0, r6, asr #16 ; i1 | i5
- pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1
-
- uadd16 r0, r7, r0 ; 1c+1 | 5c+5 = temp2
- pkhbt r9, r14, r12, lsl #16 ; i2 | i6
- uadd16 r10, r11, r9 ; a
- usub16 r9, r11, r9 ; b
- pkhtb r6, r12, r14, asr #16 ; i3 | i7
-
- subs r5, r5, #1<<31 ; i--
-
- smulbt r7, r5, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16
- smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16
- smulbb r12, r5, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16
- smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16
-
- pkhtb r7, r7, r12, asr #16 ; 3c | 7c
- pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1
-
- uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2
- usub16 r12, r8, r6 ; c (o1 | o5)
- uadd16 r6, r11, r0 ; d (o3 | o7)
- uadd16 r7, r10, r6 ; a+d
-
- mov r8, #4 ; set up 4's
- orr r8, r8, #0x40000 ; 4|4
-
- usub16 r6, r10, r6 ; a-d
- uadd16 r6, r6, r8 ; a-d+4, 3|7
- uadd16 r7, r7, r8 ; a+d+4, 0|4
- uadd16 r10, r9, r12 ; b+c
- usub16 r0, r9, r12 ; b-c
- uadd16 r10, r10, r8 ; b+c+4, 1|5
- uadd16 r8, r0, r8 ; b-c+4, 2|6
-
- ldr lr, [sp, #40] ; dst stride (36 bytes of saved registers + 4 scratch)
-
- ldrb r0, [r1] ; pred p0
- ldrb r11, [r1, #1] ; pred p1
- ldrb r12, [r1, #2] ; pred p2
-
- add r0, r0, r7, asr #19 ; p0 + o0
- add r11, r11, r10, asr #19 ; p1 + o1
- add r12, r12, r8, asr #19 ; p2 + o2
-
- usat r0, #8, r0 ; d0 = clip8(p0 + o0)
- usat r11, #8, r11 ; d1 = clip8(p1 + o1)
- usat r12, #8, r12 ; d2 = clip8(p2 + o2)
-
- add r0, r0, r11, lsl #8 ; |--|--|d1|d0|
-
- ldrb r11, [r1, #3] ; pred p3
-
- add r0, r0, r12, lsl #16 ; |--|d2|d1|d0|
-
- add r11, r11, r6, asr #19 ; p3 + o3
-
- sxth r7, r7 ; sign-extend the low halfword (o4 before the shift)
- sxth r10, r10 ; sign-extend the low halfword (o5 before the shift)
-
- usat r11, #8, r11 ; d3 = clip8(p3 + o3)
-
- sxth r8, r8 ; sign-extend the low halfword (o6 before the shift)
- sxth r6, r6 ; sign-extend the low halfword (o7 before the shift)
-
- add r0, r0, r11, lsl #24 ; |d3|d2|d1|d0|
-
- ldrb r12, [r1, r2]! ; pred p4
- str r0, [r3], lr
- ldrb r11, [r1, #1] ; pred p5
-
- add r12, r12, r7, asr #3 ; p4 + o4
- add r11, r11, r10, asr #3 ; p5 + o5
-
- usat r12, #8, r12 ; d4 = clip8(p4 + o4)
- usat r11, #8, r11 ; d5 = clip8(p5 + o5)
-
- ldrb r7, [r1, #2] ; pred p6
- ldrb r10, [r1, #3] ; pred p7
-
- add r12, r12, r11, lsl #8 ; |--|--|d5|d4|
-
- add r7, r7, r8, asr #3 ; p6 + o6
- add r10, r10, r6, asr #3 ; p7 + o7
-
- ldr r0, [sp] ; load input pointer
-
- usat r7, #8, r7 ; d6 = clip8(p6 + o6)
- usat r10, #8, r10 ; d7 = clip8(p7 + o7)
-
- add r12, r12, r7, lsl #16 ; |--|d6|d5|d4|
- add r12, r12, r10, lsl #24 ; |d7|d6|d5|d4|
-
- str r12, [r3], lr
- add r0, r0, #16 ; input pointer for the second pass of loop2_dual
- add r1, r1, r2 ; pred + pitch
-
- bcs loop2_dual
-
- add sp, sp, #4 ; release the saved input pointer slot
- ldmia sp!, {r4 - r11, pc}
-
- ENDP
-
- END
diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm
deleted file mode 100644
index 31ef09cad..000000000
--- a/vp8/common/arm/armv6/iwalsh_v6.asm
+++ /dev/null
@@ -1,136 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp8_short_inv_walsh4x4_v6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY ; name this block of code
-
-;void vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff)
-|vp8_short_inv_walsh4x4_v6| PROC
-
- stmdb sp!, {r4 - r12, lr}
-
- ldr r2, [r0, #0] ; [1 | 0]
- ldr r3, [r0, #4] ; [3 | 2]
- ldr r4, [r0, #8] ; [5 | 4]
- ldr r5, [r0, #12] ; [7 | 6]
- ldr r6, [r0, #16] ; [9 | 8]
- ldr r7, [r0, #20] ; [11 | 10]
- ldr r8, [r0, #24] ; [13 | 12]
- ldr r9, [r0, #28] ; [15 | 14]
-
- qadd16 r10, r2, r8 ; a1 [1+13 | 0+12]
- qadd16 r11, r4, r6 ; b1 [5+9 | 4+8]
- qsub16 r12, r4, r6 ; c1 [5-9 | 4-8]
- qsub16 lr, r2, r8 ; d1 [1-13 | 0-12]
-
- qadd16 r2, r10, r11 ; a1 + b1 [1 | 0]
- qadd16 r4, r12, lr ; c1 + d1 [5 | 4]
- qsub16 r6, r10, r11 ; a1 - b1 [9 | 8]
- qsub16 r8, lr, r12 ; d1 - c1 [13 | 12]
-
- qadd16 r10, r3, r9 ; a1 [3+15 | 2+14]
- qadd16 r11, r5, r7 ; b1 [7+11 | 6+10]
- qsub16 r12, r5, r7 ; c1 [7-11 | 6-10]
- qsub16 lr, r3, r9 ; d1 [3-15 | 2-14]
-
- qadd16 r3, r10, r11 ; a1 + b1 [3 | 2]
- qadd16 r5, r12, lr ; c1 + d1 [7 | 6]
- qsub16 r7, r10, r11 ; a1 - b1 [11 | 10]
- qsub16 r9, lr, r12 ; d1 - c1 [15 | 14]
-
- ; first transform complete
-
- qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3]
- qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3]
- qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7]
- qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7]
-
- qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1]
- qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1]
- ldr r10, c0x00030003 ; rounding constant: 3 in each halfword
- qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1]
- qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1]
-
- qadd16 r2, r2, r10 ; [b2+3|c2+3]
- qadd16 r3, r3, r10 ; [a2+3|d2+3]
- qadd16 r4, r4, r10 ; [b2+3|c2+3]
- qadd16 r5, r5, r10 ; [a2+3|d2+3]
-
- asr r12, r3, #19 ; [0]
- strh r12, [r1], #32 ; each result is stored 32 bytes (16 shorts) after the previous one
- asr lr, r2, #19 ; [1]
- strh lr, [r1], #32
- sxth r2, r2
- sxth r3, r3
- asr r2, r2, #3 ; [2]
- strh r2, [r1], #32
- asr r3, r3, #3 ; [3]
- strh r3, [r1], #32
-
- asr r12, r5, #19 ; [4]
- strh r12, [r1], #32
- asr lr, r4, #19 ; [5]
- strh lr, [r1], #32
- sxth r4, r4
- sxth r5, r5
- asr r4, r4, #3 ; [6]
- strh r4, [r1], #32
- asr r5, r5, #3 ; [7]
- strh r5, [r1], #32
-
- qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11]
- qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11]
- qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15]
- qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15]
-
- qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1]
- qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1]
- qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1]
- qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1]
-
- qadd16 r6, r6, r10 ; [b2+3|c2+3]
- qadd16 r7, r7, r10 ; [a2+3|d2+3]
- qadd16 r8, r8, r10 ; [b2+3|c2+3]
- qadd16 r9, r9, r10 ; [a2+3|d2+3]
-
- asr r12, r7, #19 ; [8]
- strh r12, [r1], #32
- asr lr, r6, #19 ; [9]
- strh lr, [r1], #32
- sxth r6, r6
- sxth r7, r7
- asr r6, r6, #3 ; [10]
- strh r6, [r1], #32
- asr r7, r7, #3 ; [11]
- strh r7, [r1], #32
-
- asr r12, r9, #19 ; [12]
- strh r12, [r1], #32
- asr lr, r8, #19 ; [13]
- strh lr, [r1], #32
- sxth r8, r8
- sxth r9, r9
- asr r8, r8, #3 ; [14]
- strh r8, [r1], #32
- asr r9, r9, #3 ; [15]
- strh r9, [r1], #32
-
- ldmia sp!, {r4 - r12, pc}
- ENDP ; |vp8_short_inv_walsh4x4_v6|
-
-
-; Constant Pool
-c0x00030003 DCD 0x00030003
- END
diff --git a/vp8/common/arm/armv6/loopfilter_v6.asm b/vp8/common/arm/armv6/loopfilter_v6.asm
deleted file mode 100644
index 1cbbbcdef..000000000
--- a/vp8/common/arm/armv6/loopfilter_v6.asm
+++ /dev/null
@@ -1,1282 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_loop_filter_horizontal_edge_armv6|
- EXPORT |vp8_mbloop_filter_horizontal_edge_armv6|
- EXPORT |vp8_loop_filter_vertical_edge_armv6|
- EXPORT |vp8_mbloop_filter_vertical_edge_armv6|
-
- AREA |.text|, CODE, READONLY ; name this block of code
-
- MACRO
- TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
- ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 (the inputs are overwritten)
- ; a0: 03 02 01 00
- ; a1: 13 12 11 10
- ; a2: 23 22 21 20
- ; a3: 33 32 31 30
- ; b3 b2 b1 b0
-
- uxtb16 $b1, $a1 ; xx 12 xx 10
- uxtb16 $b0, $a0 ; xx 02 xx 00
- uxtb16 $b3, $a3 ; xx 32 xx 30
- uxtb16 $b2, $a2 ; xx 22 xx 20
- orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
- orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
-
- uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
- uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
- uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
- uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
- orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
- orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
-
- pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
- pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
-
- pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
- pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
- MEND
-
-
-src RN r0
-pstep RN r1
-count RN r5
-
-;r0 unsigned char *src_ptr,
-;r1 int src_pixel_step,
-;r2 const char *blimit,
-;r3 const char *limit,
-;stack const char *thresh,
-;stack int count
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp8_loop_filter_horizontal_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- stmdb sp!, {r4 - r11, lr} ; 9 registers = 36 bytes pushed
-
- sub src, src, pstep, lsl #2 ; move src pointer back by 4 lines (to p3)
- ldr count, [sp, #40] ; count for 8-in-parallel
- ldr r6, [sp, #36] ; load thresh address
- sub sp, sp, #16 ; create temp buffer
-
- ldr r9, [src], pstep ; p3
- ldrb r4, [r2] ; blimit
- ldr r10, [src], pstep ; p2
- ldrb r2, [r3] ; limit
- ldr r11, [src], pstep ; p1
- orr r4, r4, r4, lsl #8 ; replicate the blimit byte into all four bytes of r4
- ldrb r3, [r6] ; thresh
- orr r2, r2, r2, lsl #8 ; replicate the limit byte into all four bytes of r2
- mov count, count, lsl #1 ; 4-in-parallel
- orr r4, r4, r4, lsl #16
- orr r3, r3, r3, lsl #8 ; replicate the thresh byte into all four bytes of r3
- orr r2, r2, r2, lsl #16
- orr r3, r3, r3, lsl #16
-
-|Hnext8|
- ; vp8_filter_mask() function
- ; calculate breakout conditions
- ldr r12, [src], pstep ; p0
-
- uqsub8 r6, r9, r10 ; p3 - p2
- uqsub8 r7, r10, r9 ; p2 - p3
- uqsub8 r8, r10, r11 ; p2 - p1
- uqsub8 r10, r11, r10 ; p1 - p2
-
- orr r6, r6, r7 ; abs (p3-p2)
- orr r8, r8, r10 ; abs (p2-p1)
- uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask
- uqsub8 r8, r8, r2 ; compare to limit
- uqsub8 r6, r11, r12 ; p1 - p0
- orr lr, lr, r8
- uqsub8 r7, r12, r11 ; p0 - p1
- ldr r9, [src], pstep ; q0
- ldr r10, [src], pstep ; q1
- orr r6, r6, r7 ; abs (p1-p0)
- uqsub8 r7, r6, r2 ; compare to limit
- uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later
- orr lr, lr, r7
-
- uqsub8 r6, r11, r10 ; p1 - q1
- uqsub8 r7, r10, r11 ; q1 - p1
- uqsub8 r11, r12, r9 ; p0 - q0
- uqsub8 r12, r9, r12 ; q0 - p0
- orr r6, r6, r7 ; abs (p1-q1)
- ldr r7, c0x7F7F7F7F
- orr r12, r11, r12 ; abs (p0-q0)
- ldr r11, [src], pstep ; q2
- uqadd8 r12, r12, r12 ; abs (p0-q0) * 2
- and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2
- uqsub8 r7, r9, r10 ; q0 - q1
- uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
- uqsub8 r6, r10, r9 ; q1 - q0
- uqsub8 r12, r12, r4 ; compare to blimit
- uqsub8 r9, r11, r10 ; q2 - q1
-
- orr lr, lr, r12
-
- ldr r12, [src], pstep ; q3
- uqsub8 r10, r10, r11 ; q1 - q2
- orr r6, r7, r6 ; abs (q1-q0)
- orr r10, r9, r10 ; abs (q2-q1)
- uqsub8 r7, r6, r2 ; compare to limit
- uqsub8 r10, r10, r2 ; compare to limit
- uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later
- orr lr, lr, r7
- orr lr, lr, r10
-
- uqsub8 r10, r12, r11 ; q3 - q2
- uqsub8 r9, r11, r12 ; q2 - q3
-
- mvn r11, #0 ; r11 == -1
-
- orr r10, r10, r9 ; abs (q3-q2)
- uqsub8 r10, r10, r2 ; compare to limit
-
- mov r12, #0
- orr lr, lr, r10
- sub src, src, pstep, lsl #2 ; step src back by 4 lines
-
- usub8 lr, r12, lr ; use usub8 instead of ssub8
- sel lr, r11, r12 ; filter mask: lr
-
- cmp lr, #0
- beq hskip_filter ; skip filtering
-
- sub src, src, pstep, lsl #1 ; with the 4 lines above, src is now back by 6 lines
-
- ;vp8_hevmask() function
- ;calculate high edge variance
- orr r10, r6, r8 ; calculate vp8_hevmask
-
- ldr r7, [src], pstep ; p1
-
- usub8 r10, r12, r10 ; use usub8 instead of ssub8
- sel r6, r12, r11 ; obtain vp8_hevmask: r6
-
- ;vp8_filter() function
- ldr r8, [src], pstep ; p0
- ldr r12, c0x80808080
- ldr r9, [src], pstep ; q0
- ldr r10, [src], pstep ; q1
-
- eor r7, r7, r12 ; p1 offset to convert to a signed value
- eor r8, r8, r12 ; p0 offset to convert to a signed value
- eor r9, r9, r12 ; q0 offset to convert to a signed value
- eor r10, r10, r12 ; q1 offset to convert to a signed value
-
- str r9, [sp] ; store qs0 temporarily
- str r8, [sp, #4] ; store ps0 temporarily
- str r10, [sp, #8] ; store qs1 temporarily
- str r7, [sp, #12] ; store ps1 temporarily
-
- qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
- qsub8 r8, r9, r8 ; qs0 - ps0, added three times below for 3 * (qs0 - ps0)
-
- and r7, r7, r6 ; vp8_filter (r7) &= hev
-
- qadd8 r7, r7, r8
- ldr r9, c0x03030303 ; r9 = 3 --modified for vp8
-
- qadd8 r7, r7, r8
- ldr r10, c0x04040404
-
- qadd8 r7, r7, r8
- and r7, r7, lr ; vp8_filter &= mask;
-
- ;modify code for vp8 -- Filter1 = vp8_filter (r7)
- qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
- qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
-
- mov r9, #0
- shadd8 r8 , r8 , r9 ; Filter2 >>= 3
- shadd8 r7 , r7 , r9 ; vp8_filter >>= 3
- shadd8 r8 , r8 , r9
- shadd8 r7 , r7 , r9
- shadd8 lr , r8 , r9 ; lr: Filter2
- shadd8 r7 , r7 , r9 ; r7: filter
-
- ;usub8 lr, r8, r10 ; s = (s==4)*-1
- ;sel lr, r11, r9
- ;usub8 r8, r10, r8
- ;sel r8, r11, r9
- ;and r8, r8, lr ; -1 for each element that equals 4
-
- ;calculate output
- ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter)
-
- ldr r8, [sp] ; load qs0
- ldr r9, [sp, #4] ; load ps0
-
- ldr r10, c0x01010101
-
- qsub8 r8 ,r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
- qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2)
-
- ;end of modification for vp8
-
- mov lr, #0
- sadd8 r7, r7 , r10 ; vp8_filter += 1
- shadd8 r7, r7, lr ; vp8_filter >>= 1
-
- ldr r11, [sp, #12] ; load ps1
- ldr r10, [sp, #8] ; load qs1
-
- bic r7, r7, r6 ; vp8_filter &= ~hev
- sub src, src, pstep, lsl #2 ; step src back by 4 lines to store op1..oq1
-
- qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
- qsub8 r10, r10,r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
-
- eor r11, r11, r12 ; *op1 = u^0x80
- str r11, [src], pstep ; store op1
- eor r9, r9, r12 ; *op0 = u^0x80
- str r9, [src], pstep ; store op0 result
- eor r8, r8, r12 ; *oq0 = u^0x80
- str r8, [src], pstep ; store oq0 result
- eor r10, r10, r12 ; *oq1 = u^0x80
- str r10, [src], pstep ; store oq1
-
- sub src, src, pstep, lsl #1
-
-|hskip_filter|
- add src, src, #4 ; advance 4 bytes to the next group of columns
- sub src, src, pstep, lsl #2 ; and step back 4 lines to reload from p3
-
- subs count, count, #1
-
- ldrne r9, [src], pstep ; p3
- ldrne r10, [src], pstep ; p2
- ldrne r11, [src], pstep ; p1
-
- bne Hnext8
-
- add sp, sp, #16 ; release the temp buffer
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp8_loop_filter_horizontal_edge_armv6|
-
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp8_mbloop_filter_horizontal_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- stmdb sp!, {r4 - r11, lr}
-
- sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
- ldr count, [sp, #40] ; count for 8-in-parallel
- ldr r6, [sp, #36] ; load thresh address
- sub sp, sp, #16 ; create temp buffer
-
- ldr r9, [src], pstep ; p3
- ldrb r4, [r2] ; blimit
- ldr r10, [src], pstep ; p2
- ldrb r2, [r3] ; limit
- ldr r11, [src], pstep ; p1
- orr r4, r4, r4, lsl #8
- ldrb r3, [r6] ; thresh
- orr r2, r2, r2, lsl #8
- mov count, count, lsl #1 ; 4-in-parallel
- orr r4, r4, r4, lsl #16
- orr r3, r3, r3, lsl #8
- orr r2, r2, r2, lsl #16
- orr r3, r3, r3, lsl #16
-
-|MBHnext8|
-
- ; vp8_filter_mask() function
- ; calculate breakout conditions
- ldr r12, [src], pstep ; p0
-
- uqsub8 r6, r9, r10 ; p3 - p2
- uqsub8 r7, r10, r9 ; p2 - p3
- uqsub8 r8, r10, r11 ; p2 - p1
- uqsub8 r10, r11, r10 ; p1 - p2
-
- orr r6, r6, r7 ; abs (p3-p2)
- orr r8, r8, r10 ; abs (p2-p1)
- uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask
- uqsub8 r8, r8, r2 ; compare to limit
-
- uqsub8 r6, r11, r12 ; p1 - p0
- orr lr, lr, r8
- uqsub8 r7, r12, r11 ; p0 - p1
- ldr r9, [src], pstep ; q0
- ldr r10, [src], pstep ; q1
- orr r6, r6, r7 ; abs (p1-p0)
- uqsub8 r7, r6, r2 ; compare to limit
- uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later
- orr lr, lr, r7
-
- uqsub8 r6, r11, r10 ; p1 - q1
- uqsub8 r7, r10, r11 ; q1 - p1
- uqsub8 r11, r12, r9 ; p0 - q0
- uqsub8 r12, r9, r12 ; q0 - p0
- orr r6, r6, r7 ; abs (p1-q1)
- ldr r7, c0x7F7F7F7F
- orr r12, r11, r12 ; abs (p0-q0)
- ldr r11, [src], pstep ; q2
- uqadd8 r12, r12, r12 ; abs (p0-q0) * 2
- and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2
- uqsub8 r7, r9, r10 ; q0 - q1
- uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
- uqsub8 r6, r10, r9 ; q1 - q0
- uqsub8 r12, r12, r4 ; compare to flimit
- uqsub8 r9, r11, r10 ; q2 - q1
-
- orr lr, lr, r12
-
- ldr r12, [src], pstep ; q3
-
- uqsub8 r10, r10, r11 ; q1 - q2
- orr r6, r7, r6 ; abs (q1-q0)
- orr r10, r9, r10 ; abs (q2-q1)
- uqsub8 r7, r6, r2 ; compare to limit
- uqsub8 r10, r10, r2 ; compare to limit
- uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later
- orr lr, lr, r7
- orr lr, lr, r10
-
- uqsub8 r10, r12, r11 ; q3 - q2
- uqsub8 r9, r11, r12 ; q2 - q3
-
- mvn r11, #0 ; r11 == -1
-
- orr r10, r10, r9 ; abs (q3-q2)
- uqsub8 r10, r10, r2 ; compare to limit
-
- mov r12, #0
-
- orr lr, lr, r10
-
- usub8 lr, r12, lr ; use usub8 instead of ssub8
- sel lr, r11, r12 ; filter mask: lr
-
- cmp lr, #0
- beq mbhskip_filter ; skip filtering
-
- ;vp8_hevmask() function
- ;calculate high edge variance
- sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines
- sub src, src, pstep, lsl #1
-
- orr r10, r6, r8
- ldr r7, [src], pstep ; p1
-
- usub8 r10, r12, r10
- sel r6, r12, r11 ; hev mask: r6
-
- ;vp8_mbfilter() function
- ;p2, q2 are only needed at the end. Don't need to load them in now.
- ldr r8, [src], pstep ; p0
- ldr r12, c0x80808080
- ldr r9, [src], pstep ; q0
- ldr r10, [src] ; q1
-
- eor r7, r7, r12 ; ps1
- eor r8, r8, r12 ; ps0
- eor r9, r9, r12 ; qs0
- eor r10, r10, r12 ; qs1
-
- qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
- str r7, [sp, #12] ; store ps1 temporarily
- qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
- str r10, [sp, #8] ; store qs1 temporarily
- qadd8 r7, r7, r12
- str r9, [sp] ; store qs0 temporarily
- qadd8 r7, r7, r12
- str r8, [sp, #4] ; store ps0 temporarily
- qadd8 r7, r7, r12 ; vp8_filter: r7
-
- ldr r10, c0x03030303 ; r10 = 3 --modified for vp8
- ldr r9, c0x04040404
-
- and r7, r7, lr ; vp8_filter &= mask (lr is free)
-
- mov r12, r7 ; Filter2: r12
- and r12, r12, r6 ; Filter2 &= hev
-
- ;modify code for vp8
- ;save bottom 3 bits so that we round one side +4 and the other +3
- qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
- qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
-
- mov r10, #0
- shadd8 r8 , r8 , r10 ; Filter1 >>= 3
- shadd8 r12 , r12 , r10 ; Filter2 >>= 3
- shadd8 r8 , r8 , r10
- shadd8 r12 , r12 , r10
- shadd8 r8 , r8 , r10 ; r8: Filter1
- shadd8 r12 , r12 , r10 ; r12: Filter2
-
- ldr r9, [sp] ; load qs0
- ldr r11, [sp, #4] ; load ps0
-
- qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
- qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
-
- ;save bottom 3 bits so that we round one side +4 and the other +3
- ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8)
- ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
- ;mov r10, #0
- ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3
- ;usub8 lr, r8, r9 ; s = (s==4)*-1
- ;sel lr, r11, r10
- ;shadd8 r12 , r12 , r10
- ;usub8 r8, r9, r8
- ;sel r8, r11, r10
- ;ldr r9, [sp] ; load qs0
- ;ldr r11, [sp, #4] ; load ps0
- ;shadd8 r12 , r12 , r10
- ;and r8, r8, lr ; -1 for each element that equals 4
- ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2)
- ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
- ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u)
-
- ;end of modification for vp8
-
- bic r12, r7, r6 ; vp8_filter &= ~hev ( r6 is free)
- ;mov r12, r7
-
- ;roughly 3/7th difference across boundary
- mov lr, #0x1b ; 27
- mov r7, #0x3f ; 63
-
- sxtb16 r6, r12
- sxtb16 r10, r12, ror #8
- smlabb r8, r6, lr, r7
- smlatb r6, r6, lr, r7
- smlabb r7, r10, lr, r7
- smultb r10, r10, lr
- ssat r8, #8, r8, asr #7
- ssat r6, #8, r6, asr #7
- add r10, r10, #63
- ssat r7, #8, r7, asr #7
- ssat r10, #8, r10, asr #7
-
- ldr lr, c0x80808080
-
- pkhbt r6, r8, r6, lsl #16
- pkhbt r10, r7, r10, lsl #16
- uxtb16 r6, r6
- uxtb16 r10, r10
-
- sub src, src, pstep
-
- orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
-
- qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u)
- qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u)
- eor r8, r8, lr ; *oq0 = s^0x80
- str r8, [src] ; store *oq0
- sub src, src, pstep
- eor r10, r10, lr ; *op0 = s^0x80
- str r10, [src] ; store *op0
-
- ;roughly 2/7th difference across boundary
- mov lr, #0x12 ; 18
- mov r7, #0x3f ; 63
-
- sxtb16 r6, r12
- sxtb16 r10, r12, ror #8
- smlabb r8, r6, lr, r7
- smlatb r6, r6, lr, r7
- smlabb r9, r10, lr, r7
- smlatb r10, r10, lr, r7
- ssat r8, #8, r8, asr #7
- ssat r6, #8, r6, asr #7
- ssat r9, #8, r9, asr #7
- ssat r10, #8, r10, asr #7
-
- ldr lr, c0x80808080
-
- pkhbt r6, r8, r6, lsl #16
- pkhbt r10, r9, r10, lsl #16
-
- ldr r9, [sp, #8] ; load qs1
- ldr r11, [sp, #12] ; load ps1
-
- uxtb16 r6, r6
- uxtb16 r10, r10
-
- sub src, src, pstep
-
- orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
-
- qadd8 r11, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u)
- qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u)
- eor r11, r11, lr ; *op1 = s^0x80
- str r11, [src], pstep ; store *op1
- eor r8, r8, lr ; *oq1 = s^0x80
- add src, src, pstep, lsl #1
-
- mov r7, #0x3f ; 63
-
- str r8, [src], pstep ; store *oq1
-
- ;roughly 1/7th difference across boundary
- mov lr, #0x9 ; 9
- ldr r9, [src] ; load q2
-
- sxtb16 r6, r12
- sxtb16 r10, r12, ror #8
- smlabb r8, r6, lr, r7
- smlatb r6, r6, lr, r7
- smlabb r12, r10, lr, r7
- smlatb r10, r10, lr, r7
- ssat r8, #8, r8, asr #7
- ssat r6, #8, r6, asr #7
- ssat r12, #8, r12, asr #7
- ssat r10, #8, r10, asr #7
-
- sub src, src, pstep, lsl #2
-
- pkhbt r6, r8, r6, lsl #16
- pkhbt r10, r12, r10, lsl #16
-
- sub src, src, pstep
- ldr lr, c0x80808080
-
- ldr r11, [src] ; load p2
-
- uxtb16 r6, r6
- uxtb16 r10, r10
-
- eor r9, r9, lr
- eor r11, r11, lr
-
- orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
-
- qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u)
- qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u)
- eor r8, r8, lr ; *op2 = s^0x80
- str r8, [src], pstep, lsl #2 ; store *op2
- add src, src, pstep
- eor r10, r10, lr ; *oq2 = s^0x80
- str r10, [src], pstep, lsl #1 ; store *oq2
-
-|mbhskip_filter|
- add src, src, #4
- sub src, src, pstep, lsl #3
- subs count, count, #1
-
- ldrne r9, [src], pstep ; p3
- ldrne r10, [src], pstep ; p2
- ldrne r11, [src], pstep ; p1
-
- bne MBHnext8
-
- add sp, sp, #16
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp8_mbloop_filter_horizontal_edge_armv6|
-
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp8_loop_filter_vertical_edge_armv6| PROC ; VP8 loop filter across a vertical edge; rows are transposed and filtered 4 at a time
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- stmdb sp!, {r4 - r11, lr} ; save callee-saved registers and return address (9 words)
-
- sub src, src, #4 ; move src pointer back by 4 bytes (start of the p3..p0 columns)
- ldr count, [sp, #40] ; count for 8-in-parallel (stack argument)
- ldr r12, [sp, #36] ; load thresh address (stack argument)
- sub sp, sp, #16 ; create 16-byte temp buffer on the stack
-
- ldr r6, [src], pstep ; load source data (4 bytes of one row), advance one row
- ldrb r4, [r2] ; blimit
- ldr r7, [src], pstep
- ldrb r2, [r3] ; limit
- ldr r8, [src], pstep
- orr r4, r4, r4, lsl #8 ; replicate blimit into all 4 bytes (finished below)
- ldrb r3, [r12] ; thresh
- orr r2, r2, r2, lsl #8 ; replicate limit into all 4 bytes (finished below)
- ldr lr, [src], pstep
- mov count, count, lsl #1 ; 4-in-parallel: twice as many iterations
- orr r4, r4, r4, lsl #16
- orr r3, r3, r3, lsl #8 ; replicate thresh into all 4 bytes
- orr r2, r2, r2, lsl #16
- orr r3, r3, r3, lsl #16
-
-|Vnext8|
-
- ; vp8_filter_mask() function
- ; calculate breakout conditions
- ; transpose the source data for 4-in-parallel operation
- TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 ; r9 = p3, r10 = p2, r11 = p1, r12 = p0
-
- uqsub8 r7, r9, r10 ; p3 - p2
- uqsub8 r8, r10, r9 ; p2 - p3
- uqsub8 r9, r10, r11 ; p2 - p1
- uqsub8 r10, r11, r10 ; p1 - p2
- orr r7, r7, r8 ; abs (p3-p2)
- orr r10, r9, r10 ; abs (p2-p1)
- uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask (non-zero byte = over limit)
- uqsub8 r10, r10, r2 ; compare to limit
-
- sub src, src, pstep, lsl #2 ; move src pointer back up by 4 lines
-
- orr lr, lr, r10
-
- uqsub8 r6, r11, r12 ; p1 - p0
- uqsub8 r7, r12, r11 ; p0 - p1
- add src, src, #4 ; move src pointer forward by 4 bytes (q0..q3 columns)
- orr r6, r6, r7 ; abs (p1-p0)
- str r11, [sp, #12] ; save p1
- uqsub8 r10, r6, r2 ; compare to limit
- uqsub8 r11, r6, r3 ; compare to thresh
- orr lr, lr, r10
-
- ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
- ; transpose the source data for 4-in-parallel operation
- ldr r6, [src], pstep ; load source data
- str r11, [sp] ; save the abs(p1-p0) vs thresh result for the hev test
- ldr r7, [src], pstep
- str r12, [sp, #4] ; save p0 before loading q0 - q3 data
- ldr r8, [src], pstep
- str lr, [sp, #8] ; save the filter mask accumulator
- ldr lr, [src], pstep
-
- TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 ; r9 = q0, r10 = q1, r11 = q2, r12 = q3
-
- ldr lr, [sp, #8] ; load back (f)limit accumulator
-
- uqsub8 r6, r12, r11 ; q3 - q2
- uqsub8 r7, r11, r12 ; q2 - q3
- uqsub8 r12, r11, r10 ; q2 - q1
- uqsub8 r11, r10, r11 ; q1 - q2
- orr r6, r6, r7 ; abs (q3-q2)
- orr r7, r12, r11 ; abs (q2-q1)
- uqsub8 r6, r6, r2 ; compare to limit
- uqsub8 r7, r7, r2 ; compare to limit
- ldr r11, [sp, #4] ; load back p0
- ldr r12, [sp, #12] ; load back p1
- orr lr, lr, r6
- orr lr, lr, r7
-
- uqsub8 r6, r11, r9 ; p0 - q0
- uqsub8 r7, r9, r11 ; q0 - p0
- uqsub8 r8, r12, r10 ; p1 - q1
- uqsub8 r11, r10, r12 ; q1 - p1
- orr r6, r6, r7 ; abs (p0-q0)
- ldr r7, c0x7F7F7F7F ; mask that clears bits shifted in from the neighbouring byte
- orr r8, r8, r11 ; abs (p1-q1)
- uqadd8 r6, r6, r6 ; abs (p0-q0) * 2
- and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2
- uqsub8 r11, r10, r9 ; q1 - q0
- uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
- uqsub8 r12, r9, r10 ; q0 - q1
- uqsub8 r6, r6, r4 ; compare to blimit (r4)
-
- orr r9, r11, r12 ; abs (q1-q0)
- uqsub8 r8, r9, r2 ; compare to limit
- uqsub8 r10, r9, r3 ; compare to thresh
- orr lr, lr, r6
- orr lr, lr, r8
-
- mvn r11, #0 ; r11 == -1
- mov r12, #0
-
- usub8 lr, r12, lr ; 0 - lr per byte: GE flag is set only where the lr byte is 0
- ldr r9, [sp] ; load the compared result
- sel lr, r11, r12 ; filter mask: lr (0xFF where every test passed, else 0x00)
-
- cmp lr, #0
- beq vskip_filter ; skip filtering
-
- ;vp8_hevmask() function
- ;calculate high edge variance
-
- sub src, src, pstep, lsl #2 ; move src pointer back up by 4 lines
-
- orr r9, r9, r10 ; non-zero where abs(p1-p0) or abs(q1-q0) exceeds thresh
-
- ldrh r7, [src, #-2] ; p1, p0 of this row
- ldrh r8, [src], pstep ; q0, q1 of this row; advance to the next row
-
- usub8 r9, r12, r9 ; GE flag is set only where the r9 byte is 0
- sel r6, r12, r11 ; hev mask: r6
-
- ;vp8_filter() function
- ; load source data to r6, r11, r12, lr
- ldrh r9, [src, #-2]
- ldrh r10, [src], pstep
-
- pkhbt r12, r7, r8, lsl #16 ; pack one row as p1 p0 q0 q1 (lowest byte first)
-
- ldrh r7, [src, #-2]
- ldrh r8, [src], pstep
-
- pkhbt r11, r9, r10, lsl #16
-
- ldrh r9, [src, #-2]
- ldrh r10, [src], pstep
-
- ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
- str r6, [sp] ; save hev mask
- str lr, [sp, #4] ; save filter mask
-
- pkhbt r6, r7, r8, lsl #16
- pkhbt lr, r9, r10, lsl #16
-
- ;transpose r12, r11, r6, lr to r7, r8, r9, r10
- TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
-
- ;load back hev_mask r6 and filter_mask lr
- ldr r12, c0x80808080
- ldr r6, [sp]
- ldr lr, [sp, #4]
-
- eor r7, r7, r12 ; p1 offset to convert to a signed value
- eor r8, r8, r12 ; p0 offset to convert to a signed value
- eor r9, r9, r12 ; q0 offset to convert to a signed value
- eor r10, r10, r12 ; q1 offset to convert to a signed value
-
- str r9, [sp] ; store qs0 temporarily
- str r8, [sp, #4] ; store ps0 temporarily
- str r10, [sp, #8] ; store qs1 temporarily
- str r7, [sp, #12] ; store ps1 temporarily
-
- qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
- qsub8 r8, r9, r8 ; vp8_signed_char_clamp(qs0 - ps0), added to vp8_filter three times below
-
- and r7, r7, r6 ; vp8_filter (r7) &= hev (r7 : filter)
-
- qadd8 r7, r7, r8 ; vp8_filter += qs0 - ps0 (saturating)
- ldr r9, c0x03030303 ; r9 = 3 --modified for vp8
-
- qadd8 r7, r7, r8 ; vp8_filter += qs0 - ps0 (saturating)
- ldr r10, c0x04040404
-
- qadd8 r7, r7, r8 ; vp8_filter += qs0 - ps0 (saturating, third time)
- ;mvn r11, #0 ; r11 == -1
-
- and r7, r7, lr ; vp8_filter &= mask
-
- ;modify code for vp8 -- Filter1 = vp8_filter (r7)
- qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
- qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
-
- mov r9, #0
- shadd8 r8 , r8 , r9 ; Filter2 >>= 3 (three signed halving adds with 0)
- shadd8 r7 , r7 , r9 ; vp8_filter >>= 3
- shadd8 r8 , r8 , r9
- shadd8 r7 , r7 , r9
- shadd8 lr , r8 , r9 ; lr: filter2
- shadd8 r7 , r7 , r9 ; r7: filter
-
- ;usub8 lr, r8, r10 ; s = (s==4)*-1
- ;sel lr, r11, r9
- ;usub8 r8, r10, r8
- ;sel r8, r11, r9
- ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s
-
- ;calculate output
- ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter)
-
- ldr r8, [sp] ; load qs0
- ldr r9, [sp, #4] ; load ps0
-
- ldr r10, c0x01010101
-
- qsub8 r8, r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
- qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2)
- ;end of modification for vp8
-
- eor r8, r8, r12 ; *oq0 = u^0x80
- eor r9, r9, r12 ; *op0 = u^0x80
-
- mov lr, #0
-
- sadd8 r7, r7, r10 ; vp8_filter += 1 (per byte)
- shadd8 r7, r7, lr ; vp8_filter >>= 1
-
- ldr r10, [sp, #8] ; load qs1
- ldr r11, [sp, #12] ; load ps1
-
- bic r7, r7, r6 ; r7: vp8_filter &= ~hev
-
- qsub8 r10 , r10, r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
- qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
- eor r10, r10, r12 ; *oq1 = u^0x80
- eor r11, r11, r12 ; *op1 = u^0x80
-
- sub src, src, pstep, lsl #2 ; move src pointer back up by 4 lines for the stores
-
- ;we can use TRANSPOSE_MATRIX macro to transpose output - input: p1, p0, q0, q1 (r11, r9, r8, r10)
- ;output is b0, b1, b2, b3
- ;b0: 03 02 01 00
- ;b1: 13 12 11 10
- ;b2: 23 22 21 20
- ;b3: 33 32 31 30
- ; p1 p0 q0 q1
- ; (a3 a2 a1 a0)
- TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr
-
- strh r6, [src, #-2] ; store the result (low halfword left of src)
- mov r6, r6, lsr #16
- strh r6, [src], pstep ; high halfword at src; advance to the next row
-
- strh r7, [src, #-2]
- mov r7, r7, lsr #16
- strh r7, [src], pstep
-
- strh r12, [src, #-2]
- mov r12, r12, lsr #16
- strh r12, [src], pstep
-
- strh lr, [src, #-2]
- mov lr, lr, lsr #16
- strh lr, [src], pstep
-
-|vskip_filter|
- sub src, src, #4 ; back to the p3 column, now 4 rows further on
- subs count, count, #1
-
- ldrne r6, [src], pstep ; load source data
- ldrne r7, [src], pstep
- ldrne r8, [src], pstep
- ldrne lr, [src], pstep
-
- bne Vnext8
-
- add sp, sp, #16 ; release the temp buffer
-
- ldmia sp!, {r4 - r11, pc} ; restore registers and return
- ENDP ; |vp8_loop_filter_vertical_edge_armv6|
-
-
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp8_mbloop_filter_vertical_edge_armv6| PROC ; VP8 macroblock loop filter across a vertical edge; modifies p2..q2, 4 rows per iteration
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- stmdb sp!, {r4 - r11, lr} ; save callee-saved registers and return address (9 words)
-
- sub src, src, #4 ; move src pointer back by 4 bytes (start of the p3..p0 columns)
- ldr count, [sp, #40] ; count for 8-in-parallel (stack argument)
- ldr r12, [sp, #36] ; load thresh address (stack argument)
- pld [src, #23] ; preload for next block
- sub sp, sp, #16 ; create 16-byte temp buffer on the stack
-
- ldr r6, [src], pstep ; load source data (4 bytes of one row), advance one row
- ldrb r4, [r2] ; blimit
- pld [src, #23]
- ldr r7, [src], pstep
- ldrb r2, [r3] ; limit
- pld [src, #23]
- ldr r8, [src], pstep
- orr r4, r4, r4, lsl #8 ; replicate blimit into all 4 bytes (finished below)
- ldrb r3, [r12] ; thresh
- orr r2, r2, r2, lsl #8 ; replicate limit into all 4 bytes (finished below)
- pld [src, #23]
- ldr lr, [src], pstep
- mov count, count, lsl #1 ; 4-in-parallel: twice as many iterations
- orr r4, r4, r4, lsl #16
- orr r3, r3, r3, lsl #8 ; replicate thresh into all 4 bytes
- orr r2, r2, r2, lsl #16
- orr r3, r3, r3, lsl #16
-
-|MBVnext8|
- ; vp8_filter_mask() function
- ; calculate breakout conditions
- ; transpose the source data for 4-in-parallel operation
- TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 ; r9 = p3, r10 = p2, r11 = p1, r12 = p0
-
- uqsub8 r7, r9, r10 ; p3 - p2
- uqsub8 r8, r10, r9 ; p2 - p3
- uqsub8 r9, r10, r11 ; p2 - p1
- uqsub8 r10, r11, r10 ; p1 - p2
- orr r7, r7, r8 ; abs (p3-p2)
- orr r10, r9, r10 ; abs (p2-p1)
- uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask (non-zero byte = over limit)
- uqsub8 r10, r10, r2 ; compare to limit
-
- sub src, src, pstep, lsl #2 ; move src pointer back up by 4 lines
-
- orr lr, lr, r10
-
- uqsub8 r6, r11, r12 ; p1 - p0
- uqsub8 r7, r12, r11 ; p0 - p1
- add src, src, #4 ; move src pointer forward by 4 bytes (q0..q3 columns)
- orr r6, r6, r7 ; abs (p1-p0)
- str r11, [sp, #12] ; save p1
- uqsub8 r10, r6, r2 ; compare to limit
- uqsub8 r11, r6, r3 ; compare to thresh
- orr lr, lr, r10
-
- ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
- ; transpose the source data for 4-in-parallel operation
- ldr r6, [src], pstep ; load source data
- str r11, [sp] ; save the abs(p1-p0) vs thresh result for the hev test
- ldr r7, [src], pstep
- str r12, [sp, #4] ; save p0 before loading q0 - q3 data
- ldr r8, [src], pstep
- str lr, [sp, #8] ; save the filter mask accumulator
- ldr lr, [src], pstep
-
-
- TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 ; r9 = q0, r10 = q1, r11 = q2, r12 = q3
-
- ldr lr, [sp, #8] ; load back (f)limit accumulator
-
- uqsub8 r6, r12, r11 ; q3 - q2
- uqsub8 r7, r11, r12 ; q2 - q3
- uqsub8 r12, r11, r10 ; q2 - q1
- uqsub8 r11, r10, r11 ; q1 - q2
- orr r6, r6, r7 ; abs (q3-q2)
- orr r7, r12, r11 ; abs (q2-q1)
- uqsub8 r6, r6, r2 ; compare to limit
- uqsub8 r7, r7, r2 ; compare to limit
- ldr r11, [sp, #4] ; load back p0
- ldr r12, [sp, #12] ; load back p1
- orr lr, lr, r6
- orr lr, lr, r7
-
- uqsub8 r6, r11, r9 ; p0 - q0
- uqsub8 r7, r9, r11 ; q0 - p0
- uqsub8 r8, r12, r10 ; p1 - q1
- uqsub8 r11, r10, r12 ; q1 - p1
- orr r6, r6, r7 ; abs (p0-q0)
- ldr r7, c0x7F7F7F7F ; mask that clears bits shifted in from the neighbouring byte
- orr r8, r8, r11 ; abs (p1-q1)
- uqadd8 r6, r6, r6 ; abs (p0-q0) * 2
- and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2
- uqsub8 r11, r10, r9 ; q1 - q0
- uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
- uqsub8 r12, r9, r10 ; q0 - q1
- uqsub8 r6, r6, r4 ; compare to blimit (r4)
-
- orr r9, r11, r12 ; abs (q1-q0)
- uqsub8 r8, r9, r2 ; compare to limit
- uqsub8 r10, r9, r3 ; compare to thresh
- orr lr, lr, r6
- orr lr, lr, r8
-
- mvn r11, #0 ; r11 == -1
- mov r12, #0
-
- usub8 lr, r12, lr ; 0 - lr per byte: GE flag is set only where the lr byte is 0
- ldr r9, [sp] ; load the compared result
- sel lr, r11, r12 ; filter mask: lr (0xFF where every test passed, else 0x00)
-
- cmp lr, #0
- beq mbvskip_filter ; skip filtering
-
-
-
- ;vp8_hevmask() function
- ;calculate high edge variance
-
- sub src, src, pstep, lsl #2 ; move src pointer back up by 4 lines
-
- orr r9, r9, r10 ; non-zero where abs(p1-p0) or abs(q1-q0) exceeds thresh
-
- ldrh r7, [src, #-2] ; p1, p0 of this row
- ldrh r8, [src], pstep ; q0, q1 of this row; advance to the next row
-
- usub8 r9, r12, r9 ; GE flag is set only where the r9 byte is 0
- sel r6, r12, r11 ; hev mask: r6
-
-
- ; vp8_mbfilter() function
- ; p2, q2 are only needed at the end. Don't need to load them in now.
- ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
- ; load source data to r6, r11, r12, lr
- ldrh r9, [src, #-2]
- ldrh r10, [src], pstep
-
- pkhbt r12, r7, r8, lsl #16 ; pack one row as p1 p0 q0 q1 (lowest byte first)
-
- ldrh r7, [src, #-2]
- ldrh r8, [src], pstep
-
- pkhbt r11, r9, r10, lsl #16
-
- ldrh r9, [src, #-2]
- ldrh r10, [src], pstep
-
- str r6, [sp] ; save r6 (hev mask)
- str lr, [sp, #4] ; save lr (filter mask)
-
- pkhbt r6, r7, r8, lsl #16
- pkhbt lr, r9, r10, lsl #16
-
- ;transpose r12, r11, r6, lr to p1, p0, q0, q1
- TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
-
- ;load back hev_mask r6 and filter_mask lr
- ldr r12, c0x80808080
- ldr r6, [sp]
- ldr lr, [sp, #4]
-
- eor r7, r7, r12 ; ps1
- eor r8, r8, r12 ; ps0
- eor r9, r9, r12 ; qs0
- eor r10, r10, r12 ; qs1
-
- qsub8 r12, r9, r8 ; vp8_signed_char_clamp(qs0 - ps0), added to vp8_filter three times below
- str r7, [sp, #12] ; store ps1 temporarily
- qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
- str r10, [sp, #8] ; store qs1 temporarily
- qadd8 r7, r7, r12 ; vp8_filter += qs0 - ps0 (saturating)
- str r9, [sp] ; store qs0 temporarily
- qadd8 r7, r7, r12 ; vp8_filter += qs0 - ps0 (saturating)
- str r8, [sp, #4] ; store ps0 temporarily
- qadd8 r7, r7, r12 ; vp8_filter: r7
-
- ldr r10, c0x03030303 ; r10 = 3 --modified for vp8
- ldr r9, c0x04040404
- ;mvn r11, #0 ; r11 == -1
-
- and r7, r7, lr ; vp8_filter &= mask (lr is free)
-
- mov r12, r7 ; Filter2: r12
- and r12, r12, r6 ; Filter2 &= hev
-
- ;modify code for vp8
- ;save bottom 3 bits so that we round one side +4 and the other +3
- qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
- qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
-
- mov r10, #0
- shadd8 r8 , r8 , r10 ; Filter1 >>= 3 (three signed halving adds with 0)
- shadd8 r12 , r12 , r10 ; Filter2 >>= 3
- shadd8 r8 , r8 , r10
- shadd8 r12 , r12 , r10
- shadd8 r8 , r8 , r10 ; r8: Filter1
- shadd8 r12 , r12 , r10 ; r12: Filter2
-
- ldr r9, [sp] ; load qs0
- ldr r11, [sp, #4] ; load ps0
-
- qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
- qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
-
- ;save bottom 3 bits so that we round one side +4 and the other +3
- ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8)
- ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
- ;mov r10, #0
- ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3
- ;usub8 lr, r8, r9 ; s = (s==4)*-1
- ;sel lr, r11, r10
- ;shadd8 r12 , r12 , r10
- ;usub8 r8, r9, r8
- ;sel r8, r11, r10
- ;ldr r9, [sp] ; load qs0
- ;ldr r11, [sp, #4] ; load ps0
- ;shadd8 r12 , r12 , r10
- ;and r8, r8, lr ; -1 for each element that equals 4
- ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2)
- ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
- ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u)
-
- ;end of modification for vp8
-
- bic r12, r7, r6 ;vp8_filter &= ~hev ( r6 is free)
- ;mov r12, r7
-
- ;roughly 3/7th difference across boundary
- mov lr, #0x1b ; 27
- mov r7, #0x3f ; 63
-
- sxtb16 r6, r12 ; sign-extend filter bytes 0 and 2 to halfwords
- sxtb16 r10, r12, ror #8 ; sign-extend filter bytes 1 and 3 to halfwords
- smlabb r8, r6, lr, r7 ; byte 0: filter * 27 + 63
- smlatb r6, r6, lr, r7 ; byte 2: filter * 27 + 63
- smlabb r7, r10, lr, r7 ; byte 1: filter * 27 + 63 (overwrites the 63 in r7)
- smultb r10, r10, lr ; byte 3: filter * 27 (63 is added below)
- ssat r8, #8, r8, asr #7 ; >> 7, then saturate to a signed 8-bit value
- ssat r6, #8, r6, asr #7
- add r10, r10, #63
- ssat r7, #8, r7, asr #7
- ssat r10, #8, r10, asr #7
-
- ldr lr, c0x80808080
-
- pkhbt r6, r8, r6, lsl #16 ; halfwords: byte 0 result (low), byte 2 result (high)
- pkhbt r10, r7, r10, lsl #16 ; halfwords: byte 1 result (low), byte 3 result (high)
- uxtb16 r6, r6 ; keep only the low 8 bits of each halfword
- uxtb16 r10, r10
-
- sub src, src, pstep, lsl #2 ; move src pointer back up by 4 lines
-
- orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
-
- qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u)
- qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u)
- eor r8, r8, lr ; *oq0 = s^0x80
- eor r10, r10, lr ; *op0 = s^0x80
-
- strb r10, [src, #-1] ; store op0 result
- strb r8, [src], pstep ; store oq0 result
- mov r10, r10, lsr #8 ; shift the next row's op0 into the low byte
- mov r8, r8, lsr #8 ; shift the next row's oq0 into the low byte
- strb r10, [src, #-1]
- strb r8, [src], pstep
- mov r10, r10, lsr #8
- mov r8, r8, lsr #8
- strb r10, [src, #-1]
- strb r8, [src], pstep
- mov r10, r10, lsr #8
- mov r8, r8, lsr #8
- strb r10, [src, #-1]
- strb r8, [src], pstep
-
- ;roughly 2/7th difference across boundary
- mov lr, #0x12 ; 18
- mov r7, #0x3f ; 63
-
- sxtb16 r6, r12 ; sign-extend filter bytes 0 and 2 to halfwords
- sxtb16 r10, r12, ror #8 ; sign-extend filter bytes 1 and 3 to halfwords
- smlabb r8, r6, lr, r7 ; byte 0: filter * 18 + 63
- smlatb r6, r6, lr, r7 ; byte 2: filter * 18 + 63
- smlabb r9, r10, lr, r7 ; byte 1: filter * 18 + 63
-
- smlatb r10, r10, lr, r7 ; byte 3: filter * 18 + 63
- ssat r8, #8, r8, asr #7 ; >> 7, then saturate to a signed 8-bit value
- ssat r6, #8, r6, asr #7
- ssat r9, #8, r9, asr #7
- ssat r10, #8, r10, asr #7
-
- sub src, src, pstep, lsl #2 ; move src pointer back up by 4 lines
-
- pkhbt r6, r8, r6, lsl #16
- pkhbt r10, r9, r10, lsl #16
-
- ldr r9, [sp, #8] ; load qs1
- ldr r11, [sp, #12] ; load ps1
- ldr lr, c0x80808080
-
- uxtb16 r6, r6
- uxtb16 r10, r10
-
- add src, src, #2 ; src += 2 so that #-5 = p2, #-4 = p1, #-1 = q1, #0 = q2
-
- orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
-
- qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u)
- qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u)
- eor r8, r8, lr ; *oq1 = s^0x80
- eor r10, r10, lr ; *op1 = s^0x80
-
- ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary
- strb r10, [src, #-4] ; store op1
- strb r8, [src, #-1] ; store oq1
- ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary
-
- mov r10, r10, lsr #8
- mov r8, r8, lsr #8
-
- ldrb r6, [src, #-5] ; p2 of the next row
- strb r10, [src, #-4]
- strb r8, [src, #-1]
- ldrb r7, [src], pstep ; q2 of the next row
-
- mov r10, r10, lsr #8
- mov r8, r8, lsr #8
- orr r11, r11, r6, lsl #8 ; gather the four p2 bytes into r11
- orr r9, r9, r7, lsl #8 ; gather the four q2 bytes into r9
-
- ldrb r6, [src, #-5]
- strb r10, [src, #-4]
- strb r8, [src, #-1]
- ldrb r7, [src], pstep
-
- mov r10, r10, lsr #8
- mov r8, r8, lsr #8
- orr r11, r11, r6, lsl #16
- orr r9, r9, r7, lsl #16
-
- ldrb r6, [src, #-5]
- strb r10, [src, #-4]
- strb r8, [src, #-1]
- ldrb r7, [src], pstep
- orr r11, r11, r6, lsl #24
- orr r9, r9, r7, lsl #24
-
- ;roughly 1/7th difference across boundary
- eor r9, r9, lr ; qs2
- eor r11, r11, lr ; ps2
-
- mov lr, #0x9 ; 9
- mov r7, #0x3f ; 63
-
- sxtb16 r6, r12 ; sign-extend filter bytes 0 and 2 to halfwords
- sxtb16 r10, r12, ror #8 ; sign-extend filter bytes 1 and 3 to halfwords
- smlabb r8, r6, lr, r7 ; byte 0: filter * 9 + 63
- smlatb r6, r6, lr, r7 ; byte 2: filter * 9 + 63
- smlabb r12, r10, lr, r7 ; byte 1: filter * 9 + 63 (r12 is no longer needed as the filter)
- smlatb r10, r10, lr, r7 ; byte 3: filter * 9 + 63
- ssat r8, #8, r8, asr #7 ; >> 7, then saturate to a signed 8-bit value
- ssat r6, #8, r6, asr #7
- ssat r12, #8, r12, asr #7
- ssat r10, #8, r10, asr #7
-
- sub src, src, pstep, lsl #2 ; move src pointer back up by 4 lines for the p2/q2 stores
-
- pkhbt r6, r8, r6, lsl #16
- pkhbt r10, r12, r10, lsl #16
-
- uxtb16 r6, r6
- uxtb16 r10, r10
-
- ldr lr, c0x80808080
-
- orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
-
- qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u)
- qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u)
- eor r8, r8, lr ; *op2 = s^0x80
- eor r10, r10, lr ; *oq2 = s^0x80
-
- strb r8, [src, #-5] ; store *op2
- strb r10, [src], pstep ; store *oq2
- mov r8, r8, lsr #8
- mov r10, r10, lsr #8
- strb r8, [src, #-5]
- strb r10, [src], pstep
- mov r8, r8, lsr #8
- mov r10, r10, lsr #8
- strb r8, [src, #-5]
- strb r10, [src], pstep
- mov r8, r8, lsr #8
- mov r10, r10, lsr #8
- strb r8, [src, #-5]
- strb r10, [src], pstep
-
- ;adjust src pointer for next loop (undo the +2 column offset)
- sub src, src, #2
-
-|mbvskip_filter|
- sub src, src, #4 ; back to the p3 column, now 4 rows further on
- subs count, count, #1
-
- pld [src, #23] ; preload for next block
- ldrne r6, [src], pstep ; load source data
- pld [src, #23]
- ldrne r7, [src], pstep
- pld [src, #23]
- ldrne r8, [src], pstep
- pld [src, #23]
- ldrne lr, [src], pstep
-
- bne MBVnext8
-
- add sp, sp, #16 ; release the temp buffer
-
- ldmia sp!, {r4 - r11, pc} ; restore registers and return
- ENDP ; |vp8_mbloop_filter_vertical_edge_armv6|
-
-; Constant Pool: byte-replicated words loaded by label (ldr rX, c0x...) in the filters above
-c0x80808080 DCD 0x80808080 ; XORed with pixel words to convert to/from signed values
-c0x03030303 DCD 0x03030303 ; +3 rounding term for each byte
-c0x04040404 DCD 0x04040404 ; +4 rounding term for each byte
-c0x01010101 DCD 0x01010101 ; +1 for each byte
-c0x7F7F7F7F DCD 0x7F7F7F7F ; mask applied after a word-wide lsr #1 to get a per-byte halving
-
- END
diff --git a/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
deleted file mode 100644
index 5e00cf01b..000000000
--- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm
+++ /dev/null
@@ -1,286 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6|
- EXPORT |vp8_loop_filter_simple_vertical_edge_armv6|
-
- AREA |.text|, CODE, READONLY ; name this block of code
-
- MACRO
- TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
- ; Transpose a 4x4 byte matrix held in four registers.
- ; input: $a0, $a1, $a2, $a3 (overwritten); output: $b0, $b1, $b2, $b3
- ; a0: 03 02 01 00
- ; a1: 13 12 11 10
- ; a2: 23 22 21 20
- ; a3: 33 32 31 30
- ; b3 b2 b1 b0
-
- uxtb16 $b1, $a1 ; xx 12 xx 10
- uxtb16 $b0, $a0 ; xx 02 xx 00
- uxtb16 $b3, $a3 ; xx 32 xx 30
- uxtb16 $b2, $a2 ; xx 22 xx 20
- orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
- orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
-
- uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
- uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
- uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
- uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
- orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
- orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
-
- pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- byte 2 of each input
- pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- byte 0 of each input
-
- pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- byte 3 of each input
- pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- byte 1 of each input
- MEND
-
-
-
-src RN r0 ; alias for r0, the src_ptr argument
-pstep RN r1 ; alias for r1, the src_pixel_step argument
-
-;r0 unsigned char *src_ptr,
-;r1 int src_pixel_step,
-;r2 const char *blimit
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp8_loop_filter_simple_horizontal_edge_armv6| PROC ; simple VP8 loop filter across a horizontal edge; adjusts p0 and q0, 4 pixels per iteration
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- stmdb sp!, {r4 - r11, lr} ; save callee-saved registers and return address
-
- ldrb r12, [r2] ; blimit
- ldr r3, [src, -pstep, lsl #1] ; p1
- ldr r4, [src, -pstep] ; p0
- ldr r5, [src] ; q0
- ldr r6, [src, pstep] ; q1
- orr r12, r12, r12, lsl #8 ; blimit
- ldr r2, c0x80808080 ; r2 is reused: XOR mask for the signed conversion
- orr r12, r12, r12, lsl #16 ; blimit replicated into all 4 bytes
- mov r9, #4 ; loop counter: 4 iterations, 4 pixels (one word) each
- mov lr, #0 ; need 0 in a couple places
-
-|simple_hnext8|
- ; vp8_simple_filter_mask()
-
- uqsub8 r7, r3, r6 ; p1 - q1
- uqsub8 r8, r6, r3 ; q1 - p1
- uqsub8 r10, r4, r5 ; p0 - q0
- uqsub8 r11, r5, r4 ; q0 - p0
- orr r8, r8, r7 ; abs(p1 - q1)
- orr r10, r10, r11 ; abs(p0 - q0)
- uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2
- uhadd8 r8, r8, lr ; abs(p1 - q1) >> 1
- uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
- mvn r8, #0
- usub8 r10, r12, r10 ; compare to blimit. usub8 sets GE flags where blimit >= sum
- sel r10, r8, lr ; filter mask: 0xFF or 0x00 per byte
- cmp r10, #0
- beq simple_hskip_filter ; skip filtering if all masks are 0x00
-
- ;vp8_simple_filter()
-
- eor r3, r3, r2 ; p1 offset to convert to a signed value
- eor r6, r6, r2 ; q1 offset to convert to a signed value
- eor r4, r4, r2 ; p0 offset to convert to a signed value
- eor r5, r5, r2 ; q0 offset to convert to a signed value
-
- qsub8 r3, r3, r6 ; vp8_filter = p1 - q1
- qsub8 r6, r5, r4 ; q0 - p0
- qadd8 r3, r3, r6 ; += q0 - p0
- ldr r7, c0x04040404
- qadd8 r3, r3, r6 ; += q0 - p0
- ldr r8, c0x03030303
- qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0))
- ;STALL
- and r3, r3, r10 ; vp8_filter &= mask
-
- qadd8 r7 , r3 , r7 ; Filter1 = vp8_filter + 4
- qadd8 r8 , r3 , r8 ; Filter2 = vp8_filter + 3
-
- shadd8 r7 , r7 , lr ; signed halving add with 0: one arithmetic shift right
- shadd8 r8 , r8 , lr
- shadd8 r7 , r7 , lr
- shadd8 r8 , r8 , lr
- shadd8 r7 , r7 , lr ; Filter1 >>= 3
- shadd8 r8 , r8 , lr ; Filter2 >>= 3
-
- qsub8 r5 ,r5, r7 ; u = q0 - Filter1
- qadd8 r4, r4, r8 ; u = p0 + Filter2
- eor r5, r5, r2 ; *oq0 = u^0x80
- str r5, [src] ; store oq0 result
- eor r4, r4, r2 ; *op0 = u^0x80
- str r4, [src, -pstep] ; store op0 result
-
-|simple_hskip_filter|
- subs r9, r9, #1
- addne src, src, #4 ; advance 4 pixels along the edge
-
- ldrne r3, [src, -pstep, lsl #1] ; p1
- ldrne r4, [src, -pstep] ; p0
- ldrne r5, [src] ; q0
- ldrne r6, [src, pstep] ; q1
-
- bne simple_hnext8
-
- ldmia sp!, {r4 - r11, pc} ; restore registers and return
- ENDP ; |vp8_loop_filter_simple_horizontal_edge_armv6|
-
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp8_loop_filter_simple_vertical_edge_armv6| PROC ; simple VP8 loop filter across a vertical edge; adjusts p0 and q0, 4 rows per iteration
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- stmdb sp!, {r4 - r11, lr} ; save callee-saved registers and return address
-
- ldrb r12, [r2] ; r12: blimit
- ldr r2, c0x80808080 ; r2 is reused: XOR mask for the signed conversion
- orr r12, r12, r12, lsl #8
-
- ; load source data into r7, r8 here; r9, r10 are packed at the top of the loop
- ldrh r3, [src, #-2] ; p1, p0 of this row
- pld [src, #23] ; preload for next block
- ldrh r4, [src], pstep ; q0, q1 of this row; advance to the next row
- orr r12, r12, r12, lsl #16 ; blimit replicated into all 4 bytes
-
- ldrh r5, [src, #-2]
- pld [src, #23]
- ldrh r6, [src], pstep
-
- pkhbt r7, r3, r4, lsl #16 ; pack one row as p1 p0 q0 q1 (lowest byte first)
-
- ldrh r3, [src, #-2]
- pld [src, #23]
- ldrh r4, [src], pstep
-
- pkhbt r8, r5, r6, lsl #16
-
- ldrh r5, [src, #-2]
- pld [src, #23]
- ldrh r6, [src], pstep
- mov r11, #4 ; loop counter: 4 iterations, 4 rows each
-
-|simple_vnext8|
- ; vp8_simple_filter_mask() function
- pkhbt r9, r3, r4, lsl #16
- pkhbt r10, r5, r6, lsl #16
-
- ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
- TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6 ; r3 = p1, r4 = p0, r5 = q0, r6 = q1
-
- uqsub8 r7, r3, r6 ; p1 - q1
- uqsub8 r8, r6, r3 ; q1 - p1
- uqsub8 r9, r4, r5 ; p0 - q0
- uqsub8 r10, r5, r4 ; q0 - p0
- orr r7, r7, r8 ; abs(p1 - q1)
- orr r9, r9, r10 ; abs(p0 - q0)
- mov r8, #0
- uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2
- uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2
- uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
- mvn r10, #0 ; r10 == -1
-
- usub8 r7, r12, r7 ; compare to blimit. usub8 sets GE flags where blimit >= sum
- sel lr, r10, r8 ; filter mask: 0xFF or 0x00 per byte
-
- cmp lr, #0
- beq simple_vskip_filter ; skip filtering
-
- ;vp8_simple_filter() function
- eor r3, r3, r2 ; p1 offset to convert to a signed value
- eor r6, r6, r2 ; q1 offset to convert to a signed value
- eor r4, r4, r2 ; p0 offset to convert to a signed value
- eor r5, r5, r2 ; q0 offset to convert to a signed value
-
- qsub8 r3, r3, r6 ; vp8_filter = p1 - q1
- qsub8 r6, r5, r4 ; q0 - p0
-
- qadd8 r3, r3, r6 ; vp8_filter += q0 - p0
- ldr r9, c0x03030303 ; r9 = 3
-
- qadd8 r3, r3, r6 ; vp8_filter += q0 - p0
- ldr r7, c0x04040404
-
- qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0))
- ;STALL
- and r3, r3, lr ; vp8_filter &= mask
-
- qadd8 r9 , r3 , r9 ; Filter2 = vp8_filter + 3
- qadd8 r3 , r3 , r7 ; Filter1 = vp8_filter + 4
-
- shadd8 r9 , r9 , r8 ; signed halving add with 0: one arithmetic shift right
- shadd8 r3 , r3 , r8
- shadd8 r9 , r9 , r8
- shadd8 r3 , r3 , r8
- shadd8 r9 , r9 , r8 ; Filter2 >>= 3
- shadd8 r3 , r3 , r8 ; Filter1 >>= 3
-
- ;calculate output
- sub src, src, pstep, lsl #2 ; move src pointer back up by 4 rows for the stores
-
- qadd8 r4, r4, r9 ; u = p0 + Filter2
- qsub8 r5, r5, r3 ; u = q0 - Filter1
- eor r4, r4, r2 ; *op0 = u^0x80
- eor r5, r5, r2 ; *oq0 = u^0x80
-
- strb r4, [src, #-1] ; store the result (op0 of this row)
- mov r4, r4, lsr #8 ; shift the next row's op0 into the low byte
- strb r5, [src], pstep ; oq0 of this row; advance to the next row
- mov r5, r5, lsr #8 ; shift the next row's oq0 into the low byte
-
- strb r4, [src, #-1]
- mov r4, r4, lsr #8
- strb r5, [src], pstep
- mov r5, r5, lsr #8
-
- strb r4, [src, #-1]
- mov r4, r4, lsr #8
- strb r5, [src], pstep
- mov r5, r5, lsr #8
-
- strb r4, [src, #-1]
- strb r5, [src], pstep
-
-|simple_vskip_filter|
- subs r11, r11, #1
-
- ; load source data for the next 4 rows; r7, r8 are packed here, r9, r10 at the loop top
- ldrneh r3, [src, #-2]
- pld [src, #23] ; preload for next block
- ldrneh r4, [src], pstep
-
- ldrneh r5, [src, #-2]
- pld [src, #23]
- ldrneh r6, [src], pstep
-
- pkhbt r7, r3, r4, lsl #16 ; unconditional; the result is unused on the final pass
-
- ldrneh r3, [src, #-2]
- pld [src, #23]
- ldrneh r4, [src], pstep
-
- pkhbt r8, r5, r6, lsl #16 ; unconditional; the result is unused on the final pass
-
- ldrneh r5, [src, #-2]
- pld [src, #23]
- ldrneh r6, [src], pstep
-
- bne simple_vnext8
-
- ldmia sp!, {r4 - r11, pc} ; restore registers and return
- ENDP ; |vp8_loop_filter_simple_vertical_edge_armv6|
-
-; Constant Pool: byte-replicated words loaded by label (ldr rX, c0x...) in the filters above
-c0x80808080 DCD 0x80808080 ; XORed with pixel words to convert to/from signed values
-c0x03030303 DCD 0x03030303 ; +3 rounding term for Filter2
-c0x04040404 DCD 0x04040404 ; +4 rounding term for Filter1
-
- END
diff --git a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
deleted file mode 100644
index e81aef53d..000000000
--- a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
+++ /dev/null
@@ -1,273 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sixtap_predict8x4_armv6|
-
- AREA |.text|, CODE, READONLY ; name this block of code
-;-------------------------------------
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack unsigned char *dst_ptr,
-; stack int dst_pitch
-;-------------------------------------
-;note: In first pass, store the result in transpose(8linesx9columns) on stack. Temporary stack size is 184.
-;Line width is 20 that is 9 short data plus 2 to make it 4bytes aligned. In second pass, load data from stack,
-;and the result is stored in transpose.
-|vp8_sixtap_predict8x4_armv6| PROC
- stmdb sp!, {r4 - r11, lr}
- str r3, [sp, #-184]! ;reserve space on stack for temporary storage, store yoffset
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- add lr, sp, #4 ;point to temporary buffer
- beq skip_firstpass_filter
-
-;first-pass filter
- adr r12, filter8_coeff
- sub r0, r0, r1, lsl #1
-
- add r3, r1, #10 ; preload next low
- pld [r0, r3]
-
- add r2, r12, r2, lsl #4 ;calculate filter location
- add r0, r0, #3 ;adjust src only for loading convinience
-
- ldr r3, [r2] ; load up packed filter coefficients
- ldr r4, [r2, #4]
- ldr r5, [r2, #8]
-
- mov r2, #0x90000 ; height=9 is top part of counter
-
- sub r1, r1, #8
-
-|first_pass_hloop_v6|
- ldrb r6, [r0, #-5] ; load source data
- ldrb r7, [r0, #-4]
- ldrb r8, [r0, #-3]
- ldrb r9, [r0, #-2]
- ldrb r10, [r0, #-1]
-
- orr r2, r2, #0x4 ; construct loop counter. width=8=4x2
-
- pkhbt r6, r6, r7, lsl #16 ; r7 | r6
- pkhbt r7, r7, r8, lsl #16 ; r8 | r7
-
- pkhbt r8, r8, r9, lsl #16 ; r9 | r8
- pkhbt r9, r9, r10, lsl #16 ; r10 | r9
-
-|first_pass_wloop_v6|
- smuad r11, r6, r3 ; vp8_filter[0], vp8_filter[1]
- smuad r12, r7, r3
-
- ldrb r6, [r0], #1
-
- smlad r11, r8, r4, r11 ; vp8_filter[2], vp8_filter[3]
- ldrb r7, [r0], #1
- smlad r12, r9, r4, r12
-
- pkhbt r10, r10, r6, lsl #16 ; r10 | r9
- pkhbt r6, r6, r7, lsl #16 ; r11 | r10
- smlad r11, r10, r5, r11 ; vp8_filter[4], vp8_filter[5]
- smlad r12, r6, r5, r12
-
- sub r2, r2, #1
-
- add r11, r11, #0x40 ; round_shift_and_clamp
- tst r2, #0xff ; test loop counter
- usat r11, #8, r11, asr #7
- add r12, r12, #0x40
- strh r11, [lr], #20 ; result is transposed and stored, which
- usat r12, #8, r12, asr #7
-
- strh r12, [lr], #20
-
- movne r11, r6
- movne r12, r7
-
- movne r6, r8
- movne r7, r9
- movne r8, r10
- movne r9, r11
- movne r10, r12
-
- bne first_pass_wloop_v6
-
- ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines
- ;;IF ARCHITECTURE=6
- ;pld [src, ppl]
- ;;pld [src, r9]
- ;;ENDIF
-
- subs r2, r2, #0x10000
-
- sub lr, lr, #158
-
- add r0, r0, r1 ; move to next input line
-
- add r11, r1, #18 ; preload next low. adding back block width(=8), which is subtracted earlier
- pld [r0, r11]
-
- bne first_pass_hloop_v6
-
-;second pass filter
-secondpass_filter
- ldr r3, [sp], #4 ; load back yoffset
- ldr r0, [sp, #216] ; load dst address from stack 180+36
- ldr r1, [sp, #220] ; load dst stride from stack 180+40
-
- cmp r3, #0
- beq skip_secondpass_filter
-
- adr r12, filter8_coeff
- add lr, r12, r3, lsl #4 ;calculate filter location
-
- mov r2, #0x00080000
-
- ldr r3, [lr] ; load up packed filter coefficients
- ldr r4, [lr, #4]
- ldr r5, [lr, #8]
-
- pkhbt r12, r4, r3 ; pack the filter differently
- pkhbt r11, r5, r4
-
-second_pass_hloop_v6
- ldr r6, [sp] ; load the data
- ldr r7, [sp, #4]
-
- orr r2, r2, #2 ; loop counter
-
-second_pass_wloop_v6
- smuad lr, r3, r6 ; apply filter
- smulbt r10, r3, r6
-
- ldr r8, [sp, #8]
-
- smlad lr, r4, r7, lr
- smladx r10, r12, r7, r10
-
- ldrh r9, [sp, #12]
-
- smlad lr, r5, r8, lr
- smladx r10, r11, r8, r10
-
- add sp, sp, #4
- smlatb r10, r5, r9, r10
-
- sub r2, r2, #1
-
- add lr, lr, #0x40 ; round_shift_and_clamp
- tst r2, #0xff
- usat lr, #8, lr, asr #7
- add r10, r10, #0x40
- strb lr, [r0], r1 ; the result is transposed back and stored
- usat r10, #8, r10, asr #7
-
- strb r10, [r0],r1
-
- movne r6, r7
- movne r7, r8
-
- bne second_pass_wloop_v6
-
- subs r2, r2, #0x10000
- add sp, sp, #12 ; updata src for next loop (20-8)
- sub r0, r0, r1, lsl #2
- add r0, r0, #1
-
- bne second_pass_hloop_v6
-
- add sp, sp, #20
- ldmia sp!, {r4 - r11, pc}
-
-;--------------------
-skip_firstpass_filter
- sub r0, r0, r1, lsl #1
- sub r1, r1, #8
- mov r2, #9
-
-skip_firstpass_hloop
- ldrb r4, [r0], #1 ; load data
- subs r2, r2, #1
- ldrb r5, [r0], #1
- strh r4, [lr], #20 ; store it to immediate buffer
- ldrb r6, [r0], #1 ; load data
- strh r5, [lr], #20
- ldrb r7, [r0], #1
- strh r6, [lr], #20
- ldrb r8, [r0], #1
- strh r7, [lr], #20
- ldrb r9, [r0], #1
- strh r8, [lr], #20
- ldrb r10, [r0], #1
- strh r9, [lr], #20
- ldrb r11, [r0], #1
- strh r10, [lr], #20
- add r0, r0, r1 ; move to next input line
- strh r11, [lr], #20
-
- sub lr, lr, #158 ; move over to next column
- bne skip_firstpass_hloop
-
- b secondpass_filter
-
-;--------------------
-skip_secondpass_filter
- mov r2, #8
- add sp, sp, #4 ;start from src[0] instead of src[-2]
-
-skip_secondpass_hloop
- ldr r6, [sp], #4
- subs r2, r2, #1
- ldr r8, [sp], #4
-
- mov r7, r6, lsr #16 ; unpack
- strb r6, [r0], r1
- mov r9, r8, lsr #16
- strb r7, [r0], r1
- add sp, sp, #12 ; 20-8
- strb r8, [r0], r1
- strb r9, [r0], r1
-
- sub r0, r0, r1, lsl #2
- add r0, r0, #1
-
- bne skip_secondpass_hloop
-
- add sp, sp, #16 ; 180 - (160 +4)
-
- ldmia sp!, {r4 - r11, pc}
-
- ENDP
-
-;-----------------
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-filter8_coeff
- DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000
- DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000
- DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000
- DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000
- DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000
- DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000
- DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000
- DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000
-
- ;DCD 0, 0, 128, 0, 0, 0
- ;DCD 0, -6, 123, 12, -1, 0
- ;DCD 2, -11, 108, 36, -8, 1
- ;DCD 0, -9, 93, 50, -6, 0
- ;DCD 3, -16, 77, 77, -16, 3
- ;DCD 0, -6, 50, 93, -9, 0
- ;DCD 1, -8, 36, 108, -11, 2
- ;DCD 0, -1, 12, 123, -6, 0
-
- END
diff --git a/vp8/common/arm/bilinearfilter_arm.c b/vp8/common/arm/bilinearfilter_arm.c
deleted file mode 100644
index d02a8749b..000000000
--- a/vp8/common/arm/bilinearfilter_arm.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include <math.h>
-#include "vp8/common/filter.h"
-#include "bilinearfilter_arm.h"
-
-void vp8_filter_block2d_bil_armv6(unsigned char *src_ptr,
- unsigned char *dst_ptr,
- unsigned int src_pitch,
- unsigned int dst_pitch, const short *HFilter,
- const short *VFilter, int Width, int Height) {
- unsigned short FData[36 * 16]; /* Temp data buffer used in filtering */
-
- /* First filter 1-D horizontally... */
- vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1,
- Width, HFilter);
-
- /* then 1-D vertically... */
- vp8_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height,
- Width, VFilter);
-}
-
-void vp8_bilinear_predict4x4_armv6(unsigned char *src_ptr,
- int src_pixels_per_line, int xoffset,
- int yoffset, unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
- HFilter, VFilter, 4, 4);
-}
-
-void vp8_bilinear_predict8x8_armv6(unsigned char *src_ptr,
- int src_pixels_per_line, int xoffset,
- int yoffset, unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
- HFilter, VFilter, 8, 8);
-}
-
-void vp8_bilinear_predict8x4_armv6(unsigned char *src_ptr,
- int src_pixels_per_line, int xoffset,
- int yoffset, unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
- HFilter, VFilter, 8, 4);
-}
-
-void vp8_bilinear_predict16x16_armv6(unsigned char *src_ptr,
- int src_pixels_per_line, int xoffset,
- int yoffset, unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
- HFilter, VFilter, 16, 16);
-}
diff --git a/vp8/common/arm/bilinearfilter_arm.h b/vp8/common/arm/bilinearfilter_arm.h
deleted file mode 100644
index c1c70a362..000000000
--- a/vp8/common/arm/bilinearfilter_arm.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
-#define VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern void vp8_filter_block2d_bil_first_pass_armv6(
- const unsigned char *src_ptr, unsigned short *dst_ptr,
- unsigned int src_pitch, unsigned int height, unsigned int width,
- const short *vp8_filter);
-
-extern void vp8_filter_block2d_bil_second_pass_armv6(
- const unsigned short *src_ptr, unsigned char *dst_ptr, int dst_pitch,
- unsigned int height, unsigned int width, const short *vp8_filter);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
diff --git a/vp8/common/arm/dequantize_arm.c b/vp8/common/arm/dequantize_arm.c
deleted file mode 100644
index 3b6b1820e..000000000
--- a/vp8/common/arm/dequantize_arm.c
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8/common/blockd.h"
-
-#if HAVE_MEDIA
-extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
-
-void vp8_dequantize_b_v6(BLOCKD *d, short *DQC) {
- short *DQ = d->dqcoeff;
- short *Q = d->qcoeff;
-
- vp8_dequantize_b_loop_v6(Q, DQC, DQ);
-}
-#endif
diff --git a/vp8/common/arm/filter_arm.c b/vp8/common/arm/filter_arm.c
deleted file mode 100644
index 6d547d686..000000000
--- a/vp8/common/arm/filter_arm.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include <math.h>
-#include "vp8/common/filter.h"
-#include "vpx_ports/mem.h"
-
-extern void vp8_filter_block2d_first_pass_armv6(
- unsigned char *src_ptr, short *output_ptr, unsigned int src_pixels_per_line,
- unsigned int output_width, unsigned int output_height,
- const short *vp8_filter);
-
-// 8x8
-extern void vp8_filter_block2d_first_pass_8x8_armv6(
- unsigned char *src_ptr, short *output_ptr, unsigned int src_pixels_per_line,
- unsigned int output_width, unsigned int output_height,
- const short *vp8_filter);
-
-// 16x16
-extern void vp8_filter_block2d_first_pass_16x16_armv6(
- unsigned char *src_ptr, short *output_ptr, unsigned int src_pixels_per_line,
- unsigned int output_width, unsigned int output_height,
- const short *vp8_filter);
-
-extern void vp8_filter_block2d_second_pass_armv6(short *src_ptr,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int cnt,
- const short *vp8_filter);
-
-extern void vp8_filter4_block2d_second_pass_armv6(short *src_ptr,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int cnt,
- const short *vp8_filter);
-
-extern void vp8_filter_block2d_first_pass_only_armv6(
- unsigned char *src_ptr, unsigned char *output_ptr,
- unsigned int src_pixels_per_line, unsigned int cnt,
- unsigned int output_pitch, const short *vp8_filter);
-
-extern void vp8_filter_block2d_second_pass_only_armv6(
- unsigned char *src_ptr, unsigned char *output_ptr,
- unsigned int src_pixels_per_line, unsigned int cnt,
- unsigned int output_pitch, const short *vp8_filter);
-
-#if HAVE_MEDIA
-void vp8_sixtap_predict4x4_armv6(unsigned char *src_ptr,
- int src_pixels_per_line, int xoffset,
- int yoffset, unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter;
- const short *VFilter;
- DECLARE_ALIGNED(4, short,
- FData[12 * 4]); /* Temp data buffer used in filtering */
-
- HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
- VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
-
- /* Vfilter is null. First pass only */
- if (xoffset && !yoffset) {
- /*vp8_filter_block2d_first_pass_armv6 ( src_ptr, FData+2,
- src_pixels_per_line, 4, 4, HFilter );
- vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4,
- VFilter );*/
-
- vp8_filter_block2d_first_pass_only_armv6(
- src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
- }
- /* Hfilter is null. Second pass only */
- else if (!xoffset && yoffset) {
- vp8_filter_block2d_second_pass_only_armv6(
- src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
- } else {
- /* Vfilter is a 4 tap filter */
- if (yoffset & 0x1) {
- vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line,
- FData + 1, src_pixels_per_line, 4, 7,
- HFilter);
- vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4,
- VFilter);
- }
- /* Vfilter is 6 tap filter */
- else {
- vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line),
- FData, src_pixels_per_line, 4, 9,
- HFilter);
- vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4,
- VFilter);
- }
- }
-}
-
-void vp8_sixtap_predict8x8_armv6(unsigned char *src_ptr,
- int src_pixels_per_line, int xoffset,
- int yoffset, unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter;
- const short *VFilter;
- DECLARE_ALIGNED(4, short,
- FData[16 * 8]); /* Temp data buffer used in filtering */
-
- HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
- VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
-
- if (xoffset && !yoffset) {
- vp8_filter_block2d_first_pass_only_armv6(
- src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
- }
- /* Hfilter is null. Second pass only */
- else if (!xoffset && yoffset) {
- vp8_filter_block2d_second_pass_only_armv6(
- src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
- } else {
- if (yoffset & 0x1) {
- vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line,
- FData + 1, src_pixels_per_line, 8,
- 11, HFilter);
- vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8,
- VFilter);
- } else {
- vp8_filter_block2d_first_pass_8x8_armv6(
- src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8,
- 13, HFilter);
- vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8,
- VFilter);
- }
- }
-}
-
-void vp8_sixtap_predict16x16_armv6(unsigned char *src_ptr,
- int src_pixels_per_line, int xoffset,
- int yoffset, unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter;
- const short *VFilter;
- DECLARE_ALIGNED(4, short,
- FData[24 * 16]); /* Temp data buffer used in filtering */
-
- HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
- VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
-
- if (xoffset && !yoffset) {
- vp8_filter_block2d_first_pass_only_armv6(
- src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
- }
- /* Hfilter is null. Second pass only */
- else if (!xoffset && yoffset) {
- vp8_filter_block2d_second_pass_only_armv6(
- src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
- } else {
- if (yoffset & 0x1) {
- vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line,
- FData + 1, src_pixels_per_line,
- 16, 19, HFilter);
- vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16,
- VFilter);
- } else {
- vp8_filter_block2d_first_pass_16x16_armv6(
- src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16,
- 21, HFilter);
- vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16,
- VFilter);
- }
- }
-}
-#endif
diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c
index 36fdc8a14..e12f65a04 100644
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -13,18 +13,6 @@
#include "vp8/common/loopfilter.h"
#include "vp8/common/onyxc_int.h"
-#define prototype_loopfilter(sym) \
- void sym(unsigned char *src, int pitch, const unsigned char *blimit, \
- const unsigned char *limit, const unsigned char *thresh, int count)
-
-#if HAVE_MEDIA
-extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
-extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
-extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
-extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
-#endif
-
-#if HAVE_NEON
typedef void loopfilter_y_neon(unsigned char *src, int pitch,
unsigned char blimit, unsigned char limit,
unsigned char thresh);
@@ -41,101 +29,7 @@ extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
-#endif
-
-#if HAVE_MEDIA
-/* ARMV6/MEDIA loopfilter functions*/
-/* Horizontal MB filtering */
-void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr,
- unsigned char *v_ptr, int y_stride,
- int uv_stride, loop_filter_info *lfi) {
- vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim,
- lfi->hev_thr, 2);
-
- if (u_ptr)
- vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim,
- lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim,
- lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Vertical MB Filtering */
-void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr,
- unsigned char *v_ptr, int y_stride,
- int uv_stride, loop_filter_info *lfi) {
- vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim,
- lfi->hev_thr, 2);
-
- if (u_ptr)
- vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim,
- lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim,
- lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Horizontal B Filtering */
-void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr,
- unsigned char *v_ptr, int y_stride, int uv_stride,
- loop_filter_info *lfi) {
- vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
- if (u_ptr)
- vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride,
- blimit);
- vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride,
- blimit);
- vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride,
- blimit);
-}
-
-/* Vertical B Filtering */
-void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr,
- unsigned char *v_ptr, int y_stride, int uv_stride,
- loop_filter_info *lfi) {
- vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim,
- lfi->hev_thr, 2);
- vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim,
- lfi->hev_thr, 2);
- vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim,
- lfi->hev_thr, 2);
-
- if (u_ptr)
- vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim,
- lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim,
- lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
- vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
- vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
-}
-#endif
-#if HAVE_NEON
/* NEON loopfilter functions */
/* Horizontal MB filtering */
void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr,
@@ -205,4 +99,3 @@ void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr,
vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim,
hev_thr, v_ptr + 4);
}
-#endif
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index a440352f4..b58f8e7af 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -29,81 +29,69 @@ $vp8_clear_system_state_mmx=vpx_reset_mmx_state;
# Dequant
#
add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc";
-specialize qw/vp8_dequantize_b mmx media neon msa/;
-$vp8_dequantize_b_media=vp8_dequantize_b_v6;
+specialize qw/vp8_dequantize_b mmx neon msa/;
add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride";
-specialize qw/vp8_dequant_idct_add mmx media neon dspr2 msa/;
-$vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6;
+specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa/;
$vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2;
add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2 msa/;
-$vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6;
+specialize qw/vp8_dequant_idct_add_y_block mmx sse2 neon dspr2 msa/;
$vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2 msa/;
-$vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6;
+specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 neon dspr2 msa/;
$vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
#
# Loopfilter
#
add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbv mmx sse2 media neon dspr2 msa/;
-$vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6;
+specialize qw/vp8_loop_filter_mbv mmx sse2 neon dspr2 msa/;
$vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2;
add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bv mmx sse2 media neon dspr2 msa/;
-$vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6;
+specialize qw/vp8_loop_filter_bv mmx sse2 neon dspr2 msa/;
$vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2;
add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbh mmx sse2 media neon dspr2 msa/;
-$vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6;
+specialize qw/vp8_loop_filter_mbh mmx sse2 neon dspr2 msa/;
$vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2;
add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bh mmx sse2 media neon dspr2 msa/;
-$vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6;
+specialize qw/vp8_loop_filter_bh mmx sse2 neon dspr2 msa/;
$vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2;
add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon msa/;
+specialize qw/vp8_loop_filter_simple_mbv mmx sse2 neon msa/;
$vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c;
$vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx;
$vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2;
-$vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6;
$vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon;
$vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa;
add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon msa/;
+specialize qw/vp8_loop_filter_simple_mbh mmx sse2 neon msa/;
$vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c;
$vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx;
$vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2;
-$vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6;
$vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
$vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa;
add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon msa/;
+specialize qw/vp8_loop_filter_simple_bv mmx sse2 neon msa/;
$vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c;
$vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx;
$vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2;
-$vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6;
$vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon;
$vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa;
add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon msa/;
+specialize qw/vp8_loop_filter_simple_bh mmx sse2 neon msa/;
$vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c;
$vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx;
$vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2;
-$vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6;
$vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon;
$vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa;
@@ -112,8 +100,7 @@ $vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa;
#
#idct16
add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride";
-specialize qw/vp8_short_idct4x4llm mmx media neon dspr2 msa/;
-$vp8_short_idct4x4llm_media=vp8_short_idct4x4llm_v6_dual;
+specialize qw/vp8_short_idct4x4llm mmx neon dspr2 msa/;
$vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2;
#iwalsh1
@@ -124,32 +111,27 @@ $vp8_short_inv_walsh4x4_1_dspr2=vp8_short_inv_walsh4x4_1_dspr2;
#iwalsh16
add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output";
-specialize qw/vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2 msa/;
-$vp8_short_inv_walsh4x4_media=vp8_short_inv_walsh4x4_v6;
+specialize qw/vp8_short_inv_walsh4x4 mmx sse2 neon dspr2 msa/;
$vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2;
#idct1_scalar_add
add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride";
-specialize qw/vp8_dc_only_idct_add mmx media neon dspr2 msa/;
-$vp8_dc_only_idct_add_media=vp8_dc_only_idct_add_v6;
+specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa/;
$vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2;
#
# RECON
#
add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_copy_mem16x16 mmx sse2 media neon dspr2 msa/;
-$vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6;
+specialize qw/vp8_copy_mem16x16 mmx sse2 neon dspr2 msa/;
$vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2;
add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_copy_mem8x8 mmx media neon dspr2 msa/;
-$vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6;
+specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa/;
$vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2;
add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_copy_mem8x4 mmx media neon dspr2 msa/;
-$vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6;
+specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa/;
$vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
#
@@ -180,40 +162,36 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") {
# Subpixel
#
add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon dspr2 msa/;
-$vp8_sixtap_predict16x16_media=vp8_sixtap_predict16x16_armv6;
+specialize qw/vp8_sixtap_predict16x16 mmx sse2 ssse3 neon dspr2 msa/;
$vp8_sixtap_predict16x16_dspr2=vp8_sixtap_predict16x16_dspr2;
add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon dspr2 msa/;
-$vp8_sixtap_predict8x8_media=vp8_sixtap_predict8x8_armv6;
+specialize qw/vp8_sixtap_predict8x8 mmx sse2 ssse3 neon dspr2 msa/;
$vp8_sixtap_predict8x8_dspr2=vp8_sixtap_predict8x8_dspr2;
add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon dspr2 msa/;
-$vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6;
+specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 neon dspr2 msa/;
$vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2;
+# TODO(johannkoenig): Add neon implementation
+# https://bugs.chromium.org/p/webm/issues/detail?id=1273
add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media dspr2 msa/;
-$vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6;
+specialize qw/vp8_sixtap_predict4x4 mmx ssse3 dspr2 msa/;
$vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2;
add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon msa/;
-$vp8_bilinear_predict16x16_media=vp8_bilinear_predict16x16_armv6;
+specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 neon msa/;
add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon msa/;
-$vp8_bilinear_predict8x8_media=vp8_bilinear_predict8x8_armv6;
+specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 neon msa/;
add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_bilinear_predict8x4 mmx media neon msa/;
-$vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6;
+specialize qw/vp8_bilinear_predict8x4 mmx neon msa/;
+# TODO(johannkoenig): Add neon implementation
+# https://bugs.chromium.org/p/webm/issues/detail?id=1273
add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_bilinear_predict4x4 mmx media msa/;
-$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
+specialize qw/vp8_bilinear_predict4x4 mmx msa/;
#
# Encoder functions below this point.
@@ -232,16 +210,13 @@ if ($opts{arch} =~ /x86/) {
# Forward DCT
#
add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct4x4 mmx sse2 media neon msa/;
-$vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6;
+specialize qw/vp8_short_fdct4x4 mmx sse2 neon msa/;
add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct8x4 mmx sse2 media neon msa/;
-$vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6;
+specialize qw/vp8_short_fdct8x4 mmx sse2 neon msa/;
add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_walsh4x4 sse2 media neon msa/;
-$vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6;
+specialize qw/vp8_short_walsh4x4 sse2 neon msa/;
#
# Quantizer