-rw-r--r--  libs.mk                                                      |   6
-rw-r--r--  vp8/encoder/arm/arm_csystemdependent.c                       |  12
-rw-r--r--  vp8/encoder/arm/armv5te/boolhuff_armv5te.asm                 |   4
-rw-r--r--  vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm             |   8
-rw-r--r--  vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm       |   8
-rw-r--r--  vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm  |   8
-rw-r--r--  vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm (renamed from vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm) |   4
-rw-r--r--  vp8/encoder/arm/armv6/walsh_v6.asm                           | 301
-rw-r--r--  vp8/encoder/arm/dct_arm.c                                    |   8
-rw-r--r--  vp8/encoder/arm/dct_arm.h                                    |  18
-rw-r--r--  vp8/encoder/arm/neon/fastfdct4x4_neon.asm                    | 124
-rw-r--r--  vp8/encoder/arm/neon/fastfdct8x4_neon.asm                    | 177
-rw-r--r--  vp8/encoder/arm/neon/picklpf_arm.c (renamed from vp8/encoder/arm/picklpf_arm.c) |   0
-rw-r--r--  vp8/encoder/arm/neon/shortfdct_neon.asm                      | 293
-rw-r--r--  vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm              | 111
-rw-r--r--  vp8/vp8cx_arm.mk                                             |   6
-rw-r--r--  vpx_scale/arm/neon/yv12extend_arm.c (renamed from vpx_scale/arm/yv12extend_arm.c) |   0
-rw-r--r--  vpx_scale/vpx_scale.mk                                       |   2
18 files changed, 481 insertions, 609 deletions
diff --git a/libs.mk b/libs.mk
index fecc0dab1..560dec26b 100644
--- a/libs.mk
+++ b/libs.mk
@@ -281,17 +281,17 @@ $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
$(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
- grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
+ grep -w EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
$(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S: $(VP8_PREFIX)common/asm_com_offsets.c
CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
$(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
- grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
+ grep -w EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
$(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S: $(VP8_PREFIX)encoder/asm_enc_offsets.c
CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
$(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
- grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
+ grep -w EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
$(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S: $(VP8_PREFIX)decoder/asm_dec_offsets.c
CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
else
diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c
index 89f8136fe..081775bfd 100644
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -58,10 +58,10 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
/*cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;*/
- /*cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
- cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;*/
- cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_armv6;
- cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_armv6;
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_armv6;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_armv6;
+ cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_armv6;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_armv6;
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_armv6;
/*cpi->rtcd.encodemb.berr = vp8_block_error_c;
@@ -107,8 +107,8 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_neon;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_neon;
- cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_neon;
- cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_neon;
+ cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_neon;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_neon;
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_neon;
/*cpi->rtcd.encodemb.berr = vp8_block_error_c;
diff --git a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
index 3c05f5705..138ed46de 100644
--- a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
+++ b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
@@ -53,10 +53,10 @@
sub r7, r5, #1 ; range-1
cmp r1, #0
- mul r4, r4, r7 ; ((range-1) * probability)
+ mul r6, r4, r7 ; ((range-1) * probability)
mov r7, #1
- add r4, r7, r4, lsr #8 ; 1 + (((range-1) * probability) >> 8)
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * probability) >> 8)
addne r2, r2, r4 ; if (bit) lowvalue += split
subne r4, r5, r4 ; if (bit) range = range-split
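Note: this hunk, and the same change repeated in the vp8_packtokens_*_armv5 files below, moves the MUL destination from r4 to r6. On the pre-ARMv6 cores these armv5te files target, MUL with the destination equal to the first source register is architecturally unpredictable, which is presumably the motivation; the product now lands in a scratch register and r4 receives the split value afterwards. For reference, a scalar C sketch of the split arithmetic the comments describe (names follow the comments, not the encoder's actual C source; range/lowvalue renormalization follows this step in the real coder):

/* Sketch of the bool-encoder split step described by the comments above.
 * Illustrative only; variable names follow the assembly comments. */
static void encode_bool_step(unsigned int *lowvalue, unsigned int *range,
                             int bit, unsigned int probability)
{
    unsigned int split = 1 + (((*range - 1) * probability) >> 8);

    if (bit)
    {
        *lowvalue += split;    /* if (bit) lowvalue += split        */
        *range -= split;       /* if (bit) range    = range - split */
    }
    else
    {
        *range = split;
    }
}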
diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
index d939287ff..933717c63 100644
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
@@ -71,7 +71,7 @@ token_loop
; off of v, so set a flag here based on this.
; This value is refered to as "bb"
lsls r12, r12, #1 ; bb = v >> n
- mul r4, r4, r7 ; ((range-1) * pp[i>>1]))
+ mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
; bb can only be 0 or 1. So only execute this statement
; if bb == 1, otherwise it will act like i + 0
@@ -79,7 +79,7 @@ token_loop
mov r7, #1
ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
- add r4, r7, r4, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
@@ -172,12 +172,12 @@ extra_bits_loop
ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
sub r7, r5, #1 ; range-1
lsls r12, r12, #1 ; v >> n
- mul r4, r4, r7 ; (range-1) * pp[i>>1]
+ mul r6, r4, r7 ; (range-1) * pp[i>>1]
addcs lr, lr, #1 ; i + bb
mov r7, #1
ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
- add r4, r7, r4, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+ add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
index ac2bba681..82bf71f35 100644
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
@@ -93,7 +93,7 @@ token_loop
; off of v, so set a flag here based on this.
; This value is refered to as "bb"
lsls r12, r12, #1 ; bb = v >> n
- mul r4, r4, r7 ; ((range-1) * pp[i>>1]))
+ mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
; bb can only be 0 or 1. So only execute this statement
; if bb == 1, otherwise it will act like i + 0
@@ -101,7 +101,7 @@ token_loop
mov r7, #1
ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
- add r4, r7, r4, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
@@ -194,12 +194,12 @@ extra_bits_loop
ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
sub r7, r5, #1 ; range-1
lsls r12, r12, #1 ; v >> n
- mul r4, r4, r7 ; (range-1) * pp[i>>1]
+ mul r6, r4, r7 ; (range-1) * pp[i>>1]
addcs lr, lr, #1 ; i + bb
mov r7, #1
ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
- add r4, r7, r4, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+ add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
index c2eccdb53..c00375e88 100644
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
@@ -123,7 +123,7 @@ token_loop
; off of v, so set a flag here based on this.
; This value is refered to as "bb"
lsls r12, r12, #1 ; bb = v >> n
- mul r4, r4, r7 ; ((range-1) * pp[i>>1]))
+ mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
; bb can only be 0 or 1. So only execute this statement
; if bb == 1, otherwise it will act like i + 0
@@ -131,7 +131,7 @@ token_loop
mov r7, #1
ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
- add r4, r7, r4, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
@@ -224,12 +224,12 @@ extra_bits_loop
ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
sub r7, r5, #1 ; range-1
lsls r12, r12, #1 ; v >> n
- mul r4, r4, r7 ; (range-1) * pp[i>>1]
+ mul r6, r4, r7 ; (range-1) * pp[i>>1]
addcs lr, lr, #1 ; i + bb
mov r7, #1
ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
- add r4, r7, r4, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+ add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
diff --git a/vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm b/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm
index 65bd2b449..8034c1db9 100644
--- a/vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm
@@ -8,7 +8,7 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp8_fast_fdct4x4_armv6|
+ EXPORT |vp8_short_fdct4x4_armv6|
ARM
REQUIRE8
@@ -16,7 +16,7 @@
AREA |.text|, CODE, READONLY
; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
-|vp8_fast_fdct4x4_armv6| PROC
+|vp8_short_fdct4x4_armv6| PROC
stmfd sp!, {r4 - r12, lr}
diff --git a/vp8/encoder/arm/armv6/walsh_v6.asm b/vp8/encoder/arm/armv6/walsh_v6.asm
index 61ffdb315..5eaf3f25a 100644
--- a/vp8/encoder/arm/armv6/walsh_v6.asm
+++ b/vp8/encoder/arm/armv6/walsh_v6.asm
@@ -17,129 +17,196 @@
AREA |.text|, CODE, READONLY ; name this block of code
;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
+; r0 short *input,
+; r1 short *output,
+; r2 int pitch
|vp8_short_walsh4x4_armv6| PROC
stmdb sp!, {r4 - r11, lr}
- mov r12, r2 ; ugh. not clean
- ldr r2, [r0] ; [1 | 0]
- ldr r3, [r0, #4] ; [3 | 2]
- ldr r4, [r0, r12]! ; [5 | 4]
- ldr r5, [r0, #4] ; [7 | 6]
- ldr r6, [r0, r12]! ; [9 | 8]
- ldr r7, [r0, #4] ; [11 | 10]
- ldr r8, [r0, r12]! ; [13 | 12]
- ldr r9, [r0, #4] ; [15 | 14]
-
- qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3]
- qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3]
- qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7]
- qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7]
-
- qaddsubx r2, r10, r11 ; [1 | 2] [c1+d1 | a1-b1]
- qaddsubx r3, r11, r10 ; [0 | 3] [b1+a1 | d1-c1]
- qaddsubx r4, r12, lr ; [5 | 6] [c1+d1 | a1-b1]
- qaddsubx r5, lr, r12 ; [4 | 7] [b1+a1 | d1-c1]
-
- qsubaddx r10, r6, r7 ; [c1|a1] [9-10 | 8+11]
- qaddsubx r11, r6, r7 ; [b1|d1] [9+10 | 8-11]
- qsubaddx r12, r8, r9 ; [c1|a1] [13-14 | 12+15]
- qaddsubx lr, r8, r9 ; [b1|d1] [13+14 | 12-15]
-
- qaddsubx r6, r10, r11 ; [9 |10] [c1+d1 | a1-b1]
- qaddsubx r7, r11, r10 ; [8 |11] [b1+a1 | d1-c1]
- qaddsubx r8, r12, lr ; [13|14] [c1+d1 | a1-b1]
- qaddsubx r9, lr, r12 ; [12|15] [b1+a1 | d1-c1]
-
- ; first transform complete
-
- qadd16 r10, r3, r9 ; a1 [0+12 | 3+15]
- qadd16 r11, r5, r7 ; b1 [4+8 | 7+11]
- qsub16 r12, r5, r7 ; c1 [4-8 | 7-11]
- qsub16 lr, r3, r9 ; d1 [0-12 | 3-15]
-
- qadd16 r3, r10, r11 ; a2 [a1+b1] [0 | 3]
- qadd16 r5, r12, lr ; b2 [c1+d1] [4 | 7]
- qsub16 r7, r10, r11 ; c2 [a1-b1] [8 |11]
- qsub16 r9, lr, r12 ; d2 [d1-c1] [12|15]
-
- qadd16 r10, r2, r8 ; a1 [1+13 | 2+14]
- qadd16 r11, r4, r6 ; b1 [5+9 | 6+10]
- qsub16 r12, r4, r6 ; c1 [5-9 | 6-10]
- qsub16 lr, r2, r8 ; d1 [1-13 | 2-14]
-
- qadd16 r2, r10, r11 ; a2 [a1+b1] [1 | 2]
- qadd16 r4, r12, lr ; b2 [c1+d1] [5 | 6]
- qsub16 r6, r10, r11 ; c2 [a1-b1] [9 |10]
- qsub16 r8, lr, r12 ; d2 [d1-c1] [13|14]
-
- ; [a-d]2 += ([a-d]2 > 0)
-
- asrs r10, r3, #16
- addpl r10, r10, #1 ; [~0]
- asrs r11, r2, #16
- addpl r11, r11, #1 ; [~1]
- lsl r11, r11, #15 ; [1 | x]
- pkhtb r10, r11, r10, asr #1; [1 | 0]
- str r10, [r1], #4
-
- lsls r11, r2, #16
- addpl r11, r11, #0x10000 ; [~2]
- lsls r12, r3, #16
- addpl r12, r12, #0x10000 ; [~3]
- asr r12, r12, #1 ; [3 | x]
- pkhtb r11, r12, r11, asr #17; [3 | 2]
- str r11, [r1], #4
-
- asrs r2, r5, #16
- addpl r2, r2, #1 ; [~4]
- asrs r3, r4, #16
- addpl r3, r3, #1 ; [~5]
- lsl r3, r3, #15 ; [5 | x]
- pkhtb r2, r3, r2, asr #1 ; [5 | 4]
- str r2, [r1], #4
-
- lsls r2, r4, #16
- addpl r2, r2, #0x10000 ; [~6]
- lsls r3, r5, #16
- addpl r3, r3, #0x10000 ; [~7]
- asr r3, r3, #1 ; [7 | x]
- pkhtb r2, r3, r2, asr #17 ; [7 | 6]
- str r2, [r1], #4
-
- asrs r2, r7, #16
- addpl r2, r2, #1 ; [~8]
- asrs r3, r6, #16
- addpl r3, r3, #1 ; [~9]
- lsl r3, r3, #15 ; [9 | x]
- pkhtb r2, r3, r2, asr #1 ; [9 | 8]
- str r2, [r1], #4
-
- lsls r2, r6, #16
- addpl r2, r2, #0x10000 ; [~10]
- lsls r3, r7, #16
- addpl r3, r3, #0x10000 ; [~11]
- asr r3, r3, #1 ; [11 | x]
- pkhtb r2, r3, r2, asr #17 ; [11 | 10]
- str r2, [r1], #4
-
- asrs r2, r9, #16
- addpl r2, r2, #1 ; [~12]
- asrs r3, r8, #16
- addpl r3, r3, #1 ; [~13]
- lsl r3, r3, #15 ; [13 | x]
- pkhtb r2, r3, r2, asr #1 ; [13 | 12]
- str r2, [r1], #4
-
- lsls r2, r8, #16
- addpl r2, r2, #0x10000 ; [~14]
- lsls r3, r9, #16
- addpl r3, r3, #0x10000 ; [~15]
- asr r3, r3, #1 ; [15 | x]
- pkhtb r2, r3, r2, asr #17 ; [15 | 14]
- str r2, [r1]
+ ldrd r4, r5, [r0], r2
+ ldr lr, c00040004
+ ldrd r6, r7, [r0], r2
+
+ ; 0-3
+ qadd16 r3, r4, r5 ; [d1|a1] [1+3 | 0+2]
+ qsub16 r4, r4, r5 ; [c1|b1] [1-3 | 0-2]
+
+ ldrd r8, r9, [r0], r2
+ ; 4-7
+ qadd16 r5, r6, r7 ; [d1|a1] [5+7 | 4+6]
+ qsub16 r6, r6, r7 ; [c1|b1] [5-7 | 4-6]
+
+ ldrd r10, r11, [r0]
+ ; 8-11
+ qadd16 r7, r8, r9 ; [d1|a1] [9+11 | 8+10]
+ qsub16 r8, r8, r9 ; [c1|b1] [9-11 | 8-10]
+
+ ; 12-15
+ qadd16 r9, r10, r11 ; [d1|a1] [13+15 | 12+14]
+ qsub16 r10, r10, r11 ; [c1|b1] [13-15 | 12-14]
+
+
+ lsls r2, r3, #16
+ smuad r11, r3, lr ; A0 = a1<<2 + d1<<2
+ addne r11, r11, #1 ; A0 += (a1!=0)
+
+ lsls r2, r7, #16
+ smuad r12, r7, lr ; C0 = a1<<2 + d1<<2
+ addne r12, r12, #1 ; C0 += (a1!=0)
+
+ add r0, r11, r12 ; a1_0 = A0 + C0
+ sub r11, r11, r12 ; b1_0 = A0 - C0
+
+ lsls r2, r5, #16
+ smuad r12, r5, lr ; B0 = a1<<2 + d1<<2
+ addne r12, r12, #1 ; B0 += (a1!=0)
+
+ lsls r2, r9, #16
+ smuad r2, r9, lr ; D0 = a1<<2 + d1<<2
+ addne r2, r2, #1 ; D0 += (a1!=0)
+
+ add lr, r12, r2 ; d1_0 = B0 + D0
+ sub r12, r12, r2 ; c1_0 = B0 - D0
+
+ ; op[0,4,8,12]
+ adds r2, r0, lr ; a2 = a1_0 + d1_0
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r0, r0, lr ; d2 = a1_0 - d1_0
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1] ; op[0]
+
+ addmi r0, r0, #1 ; += a2 < 0
+ add r0, r0, #3 ; += 3
+ ldr lr, c00040004
+ mov r0, r0, asr #3 ; >> 3
+ strh r0, [r1, #24] ; op[12]
+
+ adds r2, r11, r12 ; b2 = b1_0 + c1_0
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r0, r11, r12 ; c2 = b1_0 - c1_0
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #8] ; op[4]
+
+ addmi r0, r0, #1 ; += a2 < 0
+ add r0, r0, #3 ; += 3
+ smusd r3, r3, lr ; A3 = a1<<2 - d1<<2
+ smusd r7, r7, lr ; C3 = a1<<2 - d1<<2
+ mov r0, r0, asr #3 ; >> 3
+ strh r0, [r1, #16] ; op[8]
+
+
+ ; op[3,7,11,15]
+ add r0, r3, r7 ; a1_3 = A3 + C3
+ sub r3, r3, r7 ; b1_3 = A3 - C3
+
+ smusd r5, r5, lr ; B3 = a1<<2 - d1<<2
+ smusd r9, r9, lr ; D3 = a1<<2 - d1<<2
+ add r7, r5, r9 ; d1_3 = B3 + D3
+ sub r5, r5, r9 ; c1_3 = B3 - D3
+
+ adds r2, r0, r7 ; a2 = a1_3 + d1_3
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ adds r9, r3, r5 ; b2 = b1_3 + c1_3
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #6] ; op[3]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ subs r2, r3, r5 ; c2 = b1_3 - c1_3
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #14] ; op[7]
+
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r9, r0, r7 ; d2 = a1_3 - d1_3
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #22] ; op[11]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ smuad r3, r4, lr ; A1 = b1<<2 + c1<<2
+ smuad r5, r8, lr ; C1 = b1<<2 + c1<<2
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #30] ; op[15]
+
+ ; op[1,5,9,13]
+ add r0, r3, r5 ; a1_1 = A1 + C1
+ sub r3, r3, r5 ; b1_1 = A1 - C1
+
+ smuad r7, r6, lr ; B1 = b1<<2 + c1<<2
+ smuad r9, r10, lr ; D1 = b1<<2 + c1<<2
+ add r5, r7, r9 ; d1_1 = B1 + D1
+ sub r7, r7, r9 ; c1_1 = B1 - D1
+
+ adds r2, r0, r5 ; a2 = a1_1 + d1_1
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ adds r9, r3, r7 ; b2 = b1_1 + c1_1
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #2] ; op[1]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ subs r2, r3, r7 ; c2 = b1_1 - c1_1
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #10] ; op[5]
+
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r9, r0, r5 ; d2 = a1_1 - d1_1
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #18] ; op[9]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ smusd r4, r4, lr ; A2 = b1<<2 - c1<<2
+ smusd r8, r8, lr ; C2 = b1<<2 - c1<<2
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #26] ; op[13]
+
+
+ ; op[2,6,10,14]
+ add r11, r4, r8 ; a1_2 = A2 + C2
+ sub r12, r4, r8 ; b1_2 = A2 - C2
+
+ smusd r6, r6, lr ; B2 = b1<<2 - c1<<2
+ smusd r10, r10, lr ; D2 = b1<<2 - c1<<2
+ add r4, r6, r10 ; d1_2 = B2 + D2
+ sub r8, r6, r10 ; c1_2 = B2 - D2
+
+ adds r2, r11, r4 ; a2 = a1_2 + d1_2
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ adds r9, r12, r8 ; b2 = b1_2 + c1_2
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #4] ; op[2]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ subs r2, r12, r8 ; c2 = b1_2 - c1_2
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #12] ; op[6]
+
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r9, r11, r4 ; d2 = a1_2 - d1_2
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #20] ; op[10]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #28] ; op[14]
+
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp8_short_walsh4x4_armv6|
+c00040004
+ DCD 0x00040004
+
END
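For reference, the rewritten ARMv6 routine processes the block with packed SMUAD/SMUSD multiplies against the 0x00040004 constant, but the computation its comments describe is the same two-pass 4x4 Walsh-Hadamard transform as the NEON version further down. Read as scalar C it looks roughly like the sketch below (illustrative only, transcribed from the assembly comments rather than the tree's C implementation; pitch is in bytes):

/* C sketch of the 4x4 Walsh-Hadamard transform described by the comments
 * above: rows are combined with a <<2 scale and a +(a1 != 0) bias on the
 * first coefficient, columns with a (x + (x < 0) + 3) >> 3 rounding. */
static void short_walsh4x4_sketch(const short *input, short *output, int pitch)
{
    int i;

    /* Pass 1: rows */
    for (i = 0; i < 4; i++)
    {
        const short *ip = input + i * (pitch >> 1);
        short *op = output + i * 4;
        int a1 = (ip[0] + ip[2]) << 2;
        int d1 = (ip[1] + ip[3]) << 2;
        int c1 = (ip[1] - ip[3]) << 2;
        int b1 = (ip[0] - ip[2]) << 2;

        op[0] = a1 + d1 + (a1 != 0);
        op[1] = b1 + c1;
        op[2] = b1 - c1;
        op[3] = a1 - d1;
    }

    /* Pass 2: columns, with final rounding */
    for (i = 0; i < 4; i++)
    {
        short *op = output + i;
        int a1 = op[0] + op[8];
        int d1 = op[4] + op[12];
        int c1 = op[4] - op[12];
        int b1 = op[0] - op[8];

        int a2 = a1 + d1;
        int b2 = b1 + c1;
        int c2 = b1 - c1;
        int d2 = a1 - d1;

        a2 += a2 < 0;
        b2 += b2 < 0;
        c2 += c2 < 0;
        d2 += d2 < 0;

        op[0]  = (a2 + 3) >> 3;
        op[4]  = (b2 + 3) >> 3;
        op[8]  = (c2 + 3) >> 3;
        op[12] = (d2 + 3) >> 3;
    }
}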
diff --git a/vp8/encoder/arm/dct_arm.c b/vp8/encoder/arm/dct_arm.c
index 60d649d50..2692acb49 100644
--- a/vp8/encoder/arm/dct_arm.c
+++ b/vp8/encoder/arm/dct_arm.c
@@ -13,12 +13,10 @@
#if HAVE_ARMV6
-void vp8_fast_fdct8x4_armv6(short *input, short *output, int pitch)
+void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch)
{
- vp8_fast_fdct4x4_armv6(input, output, pitch);
- vp8_fast_fdct4x4_armv6(input + 4, output + 16, pitch);
+ vp8_short_fdct4x4_armv6(input, output, pitch);
+ vp8_short_fdct4x4_armv6(input + 4, output + 16, pitch);
}
#endif /* HAVE_ARMV6 */
-
-
diff --git a/vp8/encoder/arm/dct_arm.h b/vp8/encoder/arm/dct_arm.h
index 769d5f483..db553c4e0 100644
--- a/vp8/encoder/arm/dct_arm.h
+++ b/vp8/encoder/arm/dct_arm.h
@@ -14,18 +14,24 @@
#if HAVE_ARMV6
extern prototype_fdct(vp8_short_walsh4x4_armv6);
-extern prototype_fdct(vp8_fast_fdct4x4_armv6);
-extern prototype_fdct(vp8_fast_fdct8x4_armv6);
+extern prototype_fdct(vp8_short_fdct4x4_armv6);
+extern prototype_fdct(vp8_short_fdct8x4_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
+#undef vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_armv6
+
+#undef vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_armv6
+
#undef vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_armv6
+#define vp8_fdct_fast4x4 vp8_short_fdct4x4_armv6
#undef vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_armv6
+#define vp8_fdct_fast8x4 vp8_short_fdct8x4_armv6
#endif
#endif /* HAVE_ARMV6 */
@@ -45,10 +51,10 @@ extern prototype_fdct(vp8_short_walsh4x4_neon);
#define vp8_fdct_short8x4 vp8_short_fdct8x4_neon
#undef vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_neon
+#define vp8_fdct_fast4x4 vp8_short_fdct4x4_neon
#undef vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_neon
+#define vp8_fdct_fast8x4 vp8_short_fdct8x4_neon
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_neon
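The vp8_fdct_* overrides above only take effect when CONFIG_RUNTIME_CPU_DETECT is disabled; otherwise the encoder calls through the cpi->rtcd.fdct pointers filled in by vp8_arch_arm_encoder_init() (see arm_csystemdependent.c above), which now alias the fast4x4/fast8x4 entries to the accurate short fdct routines. A rough sketch of the two call paths follows; the vtable and invoke-macro names are placeholders, only the vp8_fdct_* and *_armv6/_neon names come from the diff:

/* Illustrative sketch of the fdct dispatch that dct_arm.h plugs into;
 * fdct_rtcd_sketch_t and FDCT_SKETCH_INVOKE are placeholder names. */
typedef void (*fdct_fn_t)(short *input, short *output, int pitch);

typedef struct
{
    fdct_fn_t short4x4;
    fdct_fn_t short8x4;
    fdct_fn_t fast4x4;
    fdct_fn_t fast8x4;
    fdct_fn_t walsh_short4x4;
} fdct_rtcd_sketch_t;

#if CONFIG_RUNTIME_CPU_DETECT
/* Runtime path: vp8_arch_arm_encoder_init() fills the table, so the
 * fast4x4/fast8x4 slots simply point at the short fdct routines now. */
#define FDCT_SKETCH_INVOKE(ctx, fn) ((ctx)->fn)
#else
/* Static path: the #define overrides above resolve the generic names,
 * e.g. vp8_fdct_fast4x4 -> vp8_short_fdct4x4_armv6 (or _neon). */
#define FDCT_SKETCH_INVOKE(ctx, fn) vp8_fdct_##fn
#endif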
diff --git a/vp8/encoder/arm/neon/fastfdct4x4_neon.asm b/vp8/encoder/arm/neon/fastfdct4x4_neon.asm
deleted file mode 100644
index 1cc0bd781..000000000
--- a/vp8/encoder/arm/neon/fastfdct4x4_neon.asm
+++ /dev/null
@@ -1,124 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_fast_fdct4x4_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
-;NOTE:
-;The input *src_diff. src_diff is calculated as:
-;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in Subtract* function)
-;In which *src_ptr and *pred_ptr both are unsigned char.
-;Therefore, *src_diff should be in the range of [-255, 255].
-;CAUTION:
-;The input values of 25th block are set in vp8_build_dcblock function, which are out of [-255, 255].
-;But, VP8 encoder only uses vp8_short_fdct4x4_c for 25th block, not vp8_fast_fdct4x4_c. That makes
-;it ok for assuming *input in [-255, 255] in vp8_fast_fdct4x4_c, but not ok in vp8_short_fdct4x4_c.
-
-|vp8_fast_fdct4x4_neon| PROC
- vld1.16 {d2}, [r0], r2 ;load input
- ldr r12, _ffdct_coeff_
- vld1.16 {d3}, [r0], r2
- vld1.16 {d4}, [r0], r2
- vld1.16 {d0}, [r12]
- vld1.16 {d5}, [r0], r2
-
- ;First for-loop
- ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
- vtrn.32 d2, d4
- vtrn.32 d3, d5
- vtrn.16 d2, d3
- vtrn.16 d4, d5
-
- vadd.s16 d6, d2, d5 ;ip[0]+ip[3]
- vadd.s16 d7, d3, d4 ;ip[1]+ip[2]
- vsub.s16 d8, d3, d4 ;ip[1]-ip[2]
- vsub.s16 d9, d2, d5 ;ip[0]-ip[3]
- vshl.i16 q3, q3, #1 ; a1, b1
- vshl.i16 q4, q4, #1 ; c1, d1
-
- vadd.s16 d10, d6, d7 ;temp1 = a1 + b1
- vsub.s16 d11, d6, d7 ;temp2 = a1 - b1
-
- vqdmulh.s16 q6, q5, d0[1]
- vqdmulh.s16 q8, q4, d0[0]
- vqdmulh.s16 q7, q4, d0[2]
-
- vshr.s16 q6, q6, #1
- vshr.s16 q8, q8, #1
- vshr.s16 q7, q7, #1 ;d14:temp1 = ( c1 * x_c3)>>16; d15:temp1 = (d1 * x_c3)>>16
- vadd.s16 q8, q4, q8 ;d16:temp2 = ((c1 * x_c1)>>16) + c1; d17:temp2 = ((d1 * x_c1)>>16) + d1
-
- vadd.s16 d2, d10, d12 ;op[0] = ((temp1 * x_c2 )>>16) + temp1
- vadd.s16 d4, d11, d13 ;op[2] = ((temp2 * x_c2 )>>16) + temp2
- vadd.s16 d3, d14, d17 ;op[1] = temp1 + temp2 -- q is not necessary, just for protection
- vsub.s16 d5, d15, d16 ;op[3] = temp1 - temp2
-
- ;Second for-loop
- ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
- vtrn.32 d2, d4
- vtrn.32 d3, d5
- vtrn.16 d2, d3
- vtrn.16 d4, d5
-
- vadd.s16 d6, d2, d5 ;a1 = ip[0]+ip[12]
- vadd.s16 d7, d3, d4 ;b1 = ip[4]+ip[8]
- vsub.s16 d8, d3, d4 ;c1 = ip[4]-ip[8]
- vsub.s16 d9, d2, d5 ;d1 = ip[0]-ip[12]
-
- vadd.s16 d10, d6, d7 ;temp1 = a1 + b1
- vsub.s16 d11, d6, d7 ;temp2 = a1 - b1
-
-
- vqdmulh.s16 q6, q5, d0[1]
- vqdmulh.s16 q8, q4, d0[0]
- vqdmulh.s16 q7, q4, d0[2]
-
- vshr.s16 q6, q6, #1
- vshr.s16 q8, q8, #1
- vshr.s16 q7, q7, #1 ;d14:temp1 = ( c1 * x_c3)>>16; d15:temp1 = (d1 * x_c3)>>16
- vadd.s16 q8, q4, q8 ;d16:temp2 = ((c1 * x_c1)>>16) + c1; d17:temp2 = ((d1 * x_c1)>>16) + d1
-
- vadd.s16 d2, d10, d12 ;a2 = ((temp1 * x_c2 )>>16) + temp1
- vadd.s16 d4, d11, d13 ;c2 = ((temp2 * x_c2 )>>16) + temp2
- vadd.s16 d3, d14, d17 ;b2 = temp1 + temp2 -- q is not necessary, just for protection
- vsub.s16 d5, d15, d16 ;d2 = temp1 - temp2
-
- vclt.s16 q3, q1, #0
- vclt.s16 q4, q2, #0
-
- vsub.s16 q1, q1, q3
- vsub.s16 q2, q2, q4
-
- vshr.s16 q1, q1, #1
- vshr.s16 q2, q2, #1
-
- vst1.16 {q1, q2}, [r1]
-
- bx lr
-
- ENDP
-
-;-----------------
-
-_ffdct_coeff_
- DCD ffdct_coeff
-ffdct_coeff
-; 60547 = 0xEC83
-; 46341 = 0xB505
-; 25080 = 0x61F8
- DCD 0xB505EC83, 0x000061F8
-
- END
diff --git a/vp8/encoder/arm/neon/fastfdct8x4_neon.asm b/vp8/encoder/arm/neon/fastfdct8x4_neon.asm
deleted file mode 100644
index f6e8bbb83..000000000
--- a/vp8/encoder/arm/neon/fastfdct8x4_neon.asm
+++ /dev/null
@@ -1,177 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_fast_fdct8x4_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
-;NOTE:
-;The input *src_diff. src_diff is calculated as:
-;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in Subtract* function)
-;In which *src_ptr and *pred_ptr both are unsigned char.
-;Therefore, *src_diff should be in the range of [-255, 255].
-;CAUTION:
-;The input values of 25th block are set in vp8_build_dcblock function, which are out of [-255, 255].
-;But, VP8 encoder only uses vp8_short_fdct4x4_c for 25th block, not vp8_fast_fdct4x4_c. That makes
-;it ok for assuming *input in [-255, 255] in vp8_fast_fdct4x4_c, but not ok in vp8_short_fdct4x4_c.
-
-|vp8_fast_fdct8x4_neon| PROC
- vld1.16 {q1}, [r0], r2 ;load input
- ldr r12, _ffdct8_coeff_
- vld1.16 {q2}, [r0], r2
- vld1.16 {q3}, [r0], r2
- vld1.16 {d0}, [r12]
- vld1.16 {q4}, [r0], r2
-
- ;First for-loop
- ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[1], d6=ip[2], d8=ip[3]
- ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[1], d7=ip[2], d9=ip[3]
- vtrn.32 d2, d6
- vtrn.32 d3, d7
- vtrn.32 d4, d8
- vtrn.32 d5, d9
- vtrn.16 d2, d4
- vtrn.16 d3, d5
- vtrn.16 d6, d8
- vtrn.16 d7, d9
-
- vadd.s16 d10, d2, d8 ;ip[0]+ip[3]
- vadd.s16 d11, d4, d6 ;ip[1]+ip[2]
- vsub.s16 d12, d4, d6 ;ip[1]-ip[2]
- vsub.s16 d13, d2, d8 ;ip[0]-ip[3]
- vadd.s16 d22, d3, d9
- vadd.s16 d23, d5, d7
- vsub.s16 d24, d5, d7
- vsub.s16 d25, d3, d9
-
- vshl.i16 q5, q5, #1 ; a1, b1
- vshl.i16 q6, q6, #1 ; c1, d1
- vshl.i16 q1, q11, #1
- vshl.i16 q2, q12, #1
-
- vadd.s16 d14, d10, d11 ;temp1 = a1 + b1
- vsub.s16 d15, d10, d11 ;temp2 = a1 - b1
- vadd.s16 d24, d2, d3
- vsub.s16 d25, d2, d3
-
- vqdmulh.s16 q8, q7, d0[1]
- vqdmulh.s16 q13, q12, d0[1]
- vqdmulh.s16 q10, q6, d0[0]
- vqdmulh.s16 q15, q2, d0[0]
- vqdmulh.s16 q9, q6, d0[2]
- vqdmulh.s16 q14, q2, d0[2]
-
- vshr.s16 q8, q8, #1
- vshr.s16 q13, q13, #1
- vshr.s16 q10, q10, #1
- vshr.s16 q15, q15, #1
- vshr.s16 q9, q9, #1 ;d18:temp1 = ( c1 * x_c3)>>16; d19:temp1 = (d1 * x_c3)>>16
- vshr.s16 q14, q14, #1 ;d28:temp1 = ( c1 * x_c3)>>16; d29:temp1 = (d1 * x_c3)>>16
- vadd.s16 q10, q6, q10 ;d20:temp2 = ((c1 * x_c1)>>16) + c1; d21:temp2 = ((d1 * x_c1)>>16) + d1
- vadd.s16 q15, q2, q15 ;d30:temp2 = ((c1 * x_c1)>>16) + c1; d31:temp2 = ((d1 * x_c1)>>16) + d1
-
- vadd.s16 d2, d14, d16 ;op[0] = ((temp1 * x_c2 )>>16) + temp1
- vadd.s16 d3, d24, d26 ;op[0] = ((temp1 * x_c2 )>>16) + temp1
- vadd.s16 d6, d15, d17 ;op[2] = ((temp2 * x_c2 )>>16) + temp2
- vadd.s16 d7, d25, d27 ;op[2] = ((temp2 * x_c2 )>>16) + temp2
- vadd.s16 d4, d18, d21 ;op[1] = temp1 + temp2 -- q is not necessary, just for protection
- vadd.s16 d5, d28, d31 ;op[1] = temp1 + temp2 -- q is not necessary, just for protection
- vsub.s16 d8, d19, d20 ;op[3] = temp1 - temp2
- vsub.s16 d9, d29, d30 ;op[3] = temp1 - temp2
-
- ;Second for-loop
- ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[4], d6=ip[8], d8=ip[12]
- ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[4], d7=ip[8], d9=ip[12]
- vtrn.32 d2, d6
- vtrn.32 d3, d7
- vtrn.32 d4, d8
- vtrn.32 d5, d9
- vtrn.16 d2, d4
- vtrn.16 d3, d5
- vtrn.16 d6, d8
- vtrn.16 d7, d9
-
- vadd.s16 d10, d2, d8 ;a1 = ip[0]+ip[12]
- vadd.s16 d11, d4, d6 ;b1 = ip[4]+ip[8]
- vsub.s16 d12, d4, d6 ;c1 = ip[4]-ip[8]
- vsub.s16 d13, d2, d8 ;d1 = ip[0]-ip[12]
- vadd.s16 d2, d3, d9
- vadd.s16 d4, d5, d7
- vsub.s16 d24, d5, d7
- vsub.s16 d25, d3, d9
-
- vadd.s16 d14, d10, d11 ;temp1 = a1 + b1
- vsub.s16 d15, d10, d11 ;temp2 = a1 - b1
- vadd.s16 d22, d2, d4
- vsub.s16 d23, d2, d4
-
- vqdmulh.s16 q8, q7, d0[1]
- vqdmulh.s16 q13, q11, d0[1]
- vqdmulh.s16 q10, q6, d0[0]
- vqdmulh.s16 q15, q12, d0[0]
- vqdmulh.s16 q9, q6, d0[2]
- vqdmulh.s16 q14, q12, d0[2]
-
- vshr.s16 q8, q8, #1
- vshr.s16 q13, q13, #1
- vshr.s16 q10, q10, #1
- vshr.s16 q15, q15, #1
- vshr.s16 q9, q9, #1 ;d18:temp1 = ( c1 * x_c3)>>16; d19:temp1 = (d1 * x_c3)>>16
- vshr.s16 q14, q14, #1 ;d28:temp1 = ( c1 * x_c3)>>16; d29:temp1 = (d1 * x_c3)>>16
- vadd.s16 q10, q6, q10 ;d20:temp2 = ((c1 * x_c1)>>16) + c1; d21:temp2 = ((d1 * x_c1)>>16) + d1
- vadd.s16 q15, q12, q15 ;d30:temp2 = ((c1 * x_c1)>>16) + c1; d31:temp2 = ((d1 * x_c1)>>16) + d1
-
- vadd.s16 d2, d14, d16 ;a2 = ((temp1 * x_c2 )>>16) + temp1
- vadd.s16 d6, d22, d26 ;a2 = ((temp1 * x_c2 )>>16) + temp1
- vadd.s16 d4, d15, d17 ;c2 = ((temp2 * x_c2 )>>16) + temp2
- vadd.s16 d8, d23, d27 ;c2 = ((temp2 * x_c2 )>>16) + temp2
- vadd.s16 d3, d18, d21 ;b2 = temp1 + temp2 -- q is not necessary, just for protection
- vadd.s16 d7, d28, d31 ;b2 = temp1 + temp2 -- q is not necessary, just for protection
- vsub.s16 d5, d19, d20 ;d2 = temp1 - temp2
- vsub.s16 d9, d29, d30 ;d2 = temp1 - temp2
-
- vclt.s16 q5, q1, #0
- vclt.s16 q6, q2, #0
- vclt.s16 q7, q3, #0
- vclt.s16 q8, q4, #0
-
- vsub.s16 q1, q1, q5
- vsub.s16 q2, q2, q6
- vsub.s16 q3, q3, q7
- vsub.s16 q4, q4, q8
-
- vshr.s16 q1, q1, #1
- vshr.s16 q2, q2, #1
- vshr.s16 q3, q3, #1
- vshr.s16 q4, q4, #1
-
- vst1.16 {q1, q2}, [r1]!
- vst1.16 {q3, q4}, [r1]
-
- bx lr
-
- ENDP
-
-;-----------------
-
-_ffdct8_coeff_
- DCD ffdct8_coeff
-ffdct8_coeff
-; 60547 = 0xEC83
-; 46341 = 0xB505
-; 25080 = 0x61F8
- DCD 0xB505EC83, 0x000061F8
-
- END
diff --git a/vp8/encoder/arm/picklpf_arm.c b/vp8/encoder/arm/neon/picklpf_arm.c
index 3fb370c3d..3fb370c3d 100644
--- a/vp8/encoder/arm/picklpf_arm.c
+++ b/vp8/encoder/arm/neon/picklpf_arm.c
diff --git a/vp8/encoder/arm/neon/shortfdct_neon.asm b/vp8/encoder/arm/neon/shortfdct_neon.asm
index 1b7f36277..09dd011ec 100644
--- a/vp8/encoder/arm/neon/shortfdct_neon.asm
+++ b/vp8/encoder/arm/neon/shortfdct_neon.asm
@@ -11,134 +11,211 @@
EXPORT |vp8_short_fdct4x4_neon|
EXPORT |vp8_short_fdct8x4_neon|
+
ARM
REQUIRE8
PRESERVE8
+ AREA ||.text||, CODE, READONLY, ALIGN=4
+
- AREA ||.text||, CODE, READONLY, ALIGN=2
+ ALIGN 16 ; enable use of @128 bit aligned loads
+coeff
+ DCW 5352, 5352, 5352, 5352
+ DCW 2217, 2217, 2217, 2217
+ DCD 14500, 14500, 14500, 14500
+ DCD 7500, 7500, 7500, 7500
+ DCD 12000, 12000, 12000, 12000
+ DCD 51000, 51000, 51000, 51000
-; r0 short *input
-; r1 short *output
-; r2 int pitch
-; Input has a pitch, output is contiguous
+;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
|vp8_short_fdct4x4_neon| PROC
- ldr r12, _dct_matrix_
- vld1.16 d0, [r0], r2
- vld1.16 d1, [r0], r2
- vld1.16 d2, [r0], r2
- vld1.16 d3, [r0]
- vld1.16 {q2, q3}, [r12]
-
-;first stage
- vmull.s16 q11, d4, d0[0] ;i=0
- vmull.s16 q12, d4, d1[0] ;i=1
- vmull.s16 q13, d4, d2[0] ;i=2
- vmull.s16 q14, d4, d3[0] ;i=3
-
- vmlal.s16 q11, d5, d0[1]
- vmlal.s16 q12, d5, d1[1]
- vmlal.s16 q13, d5, d2[1]
- vmlal.s16 q14, d5, d3[1]
-
- vmlal.s16 q11, d6, d0[2]
- vmlal.s16 q12, d6, d1[2]
- vmlal.s16 q13, d6, d2[2]
- vmlal.s16 q14, d6, d3[2]
-
- vmlal.s16 q11, d7, d0[3] ;sumtemp for i=0
- vmlal.s16 q12, d7, d1[3] ;sumtemp for i=1
- vmlal.s16 q13, d7, d2[3] ;sumtemp for i=2
- vmlal.s16 q14, d7, d3[3] ;sumtemp for i=3
-
- ; rounding
- vrshrn.i32 d22, q11, #14
- vrshrn.i32 d24, q12, #14
- vrshrn.i32 d26, q13, #14
- vrshrn.i32 d28, q14, #14
-
-;second stage
- vmull.s16 q4, d22, d4[0] ;i=0
- vmull.s16 q5, d22, d4[1] ;i=1
- vmull.s16 q6, d22, d4[2] ;i=2
- vmull.s16 q7, d22, d4[3] ;i=3
-
- vmlal.s16 q4, d24, d5[0]
- vmlal.s16 q5, d24, d5[1]
- vmlal.s16 q6, d24, d5[2]
- vmlal.s16 q7, d24, d5[3]
-
- vmlal.s16 q4, d26, d6[0]
- vmlal.s16 q5, d26, d6[1]
- vmlal.s16 q6, d26, d6[2]
- vmlal.s16 q7, d26, d6[3]
-
- vmlal.s16 q4, d28, d7[0] ;sumtemp for i=0
- vmlal.s16 q5, d28, d7[1] ;sumtemp for i=1
- vmlal.s16 q6, d28, d7[2] ;sumtemp for i=2
- vmlal.s16 q7, d28, d7[3] ;sumtemp for i=3
-
- vrshr.s32 q0, q4, #16
- vrshr.s32 q1, q5, #16
- vrshr.s32 q2, q6, #16
- vrshr.s32 q3, q7, #16
-
- vmovn.i32 d0, q0
- vmovn.i32 d1, q1
- vmovn.i32 d2, q2
- vmovn.i32 d3, q3
-
- vst1.16 {q0, q1}, [r1]
+
+ ; Part one
+ vld1.16 {d0}, [r0@64], r2
+ adr r12, coeff
+ vld1.16 {d1}, [r0@64], r2
+ vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
+ vld1.16 {d2}, [r0@64], r2
+ vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
+ vld1.16 {d3}, [r0@64], r2
+
+ ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000
+ vtrn.16 d0, d1
+ vtrn.16 d2, d3
+
+ vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3]
+ vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2]
+ vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2]
+ vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3]
+
+ vshl.s16 q2, q2, #3 ; (a1, b1) << 3
+ vshl.s16 q3, q3, #3 ; (c1, d1) << 3
+
+ vadd.s16 d0, d4, d5 ; op[0] = a1 + b1
+ vsub.s16 d2, d4, d5 ; op[2] = a1 - b1
+
+ vmlal.s16 q9, d7, d16 ; d1*5352 + 14500
+ vmlal.s16 q10, d7, d17 ; d1*2217 + 7500
+ vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500
+ vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500
+
+ vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
+ vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12
+
+
+ ; Part two
+
+ ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vtrn.16 d0, d1
+ vtrn.16 d2, d3
+
+ vmov.s16 d26, #7
+
+ vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12]
+ vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8]
+ vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8]
+ vadd.s16 d4, d4, d26 ; a1 + 7
+ vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12]
+
+ vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7
+ vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7
+
+ vmlal.s16 q11, d7, d16 ; d1*5352 + 12000
+ vmlal.s16 q12, d7, d17 ; d1*2217 + 51000
+
+ vceq.s16 d4, d7, #0
+
+ vshr.s16 d0, d0, #4
+ vshr.s16 d2, d2, #4
+
+ vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000
+ vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000
+
+ vmvn.s16 d4, d4
+ vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
+ vsub.s16 d1, d1, d4 ; op[4] += (d1!=0)
+ vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
+
+ vst1.16 {q0, q1}, [r1@128]
bx lr
ENDP
-; r0 short *input
-; r1 short *output
-; r2 int pitch
+;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
|vp8_short_fdct8x4_neon| PROC
- ; Store link register and input before calling
- ; first 4x4 fdct. Do not need to worry about
- ; output or pitch because those pointers are not
- ; touched in the 4x4 fdct function
- stmdb sp!, {r0, lr}
- bl vp8_short_fdct4x4_neon
+ ; Part one
+
+ vld1.16 {q0}, [r0@128], r2
+ adr r12, coeff
+ vld1.16 {q1}, [r0@128], r2
+ vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
+ vld1.16 {q2}, [r0@128], r2
+ vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
+ vld1.16 {q3}, [r0@128], r2
+
+ ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
+ vtrn.32 q0, q2 ; [A0|B0]
+ vtrn.32 q1, q3 ; [A1|B1]
+ vtrn.16 q0, q1 ; [A2|B2]
+ vtrn.16 q2, q3 ; [A3|B3]
- ldmia sp!, {r0, lr}
+ vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3]
+ vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2]
+ vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2]
+ vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3]
- ; Move to the next block of data.
- add r0, r0, #8
- add r1, r1, #32
+ vshl.s16 q11, q11, #3 ; a1 << 3
+ vshl.s16 q12, q12, #3 ; b1 << 3
+ vshl.s16 q13, q13, #3 ; c1 << 3
+ vshl.s16 q14, q14, #3 ; d1 << 3
- ; Second time through do not store off the
- ; link register, just return from the 4x4 fdtc
- b vp8_short_fdct4x4_neon
+ vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1
+ vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1
+
+ vmov.s16 q11, q9 ; 14500
+ vmov.s16 q12, q10 ; 7500
+
+ vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500
+ vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500
+ vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500
+ vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500
+
+ vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500
+ vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500
+ vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500
+ vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500
+
+ vshrn.s32 d2, q9, #12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
+ vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12
+ vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
+ vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12
+
+
+ ; Part two
+ vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000
+
+ ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
+ vtrn.32 q0, q2 ; q0=[A0 | B0]
+ vtrn.32 q1, q3 ; q1=[A4 | B4]
+ vtrn.16 q0, q1 ; q2=[A8 | B8]
+ vtrn.16 q2, q3 ; q3=[A12|B12]
+
+ vmov.s16 q15, #7
+
+ vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12]
+ vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8]
+ vadd.s16 q11, q11, q15 ; a1 + 7
+ vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8]
+ vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12]
+
+ vadd.s16 q0, q11, q12 ; a1 + b1 + 7
+ vsub.s16 q1, q11, q12 ; a1 - b1 + 7
+
+ vmov.s16 q11, q9 ; 12000
+ vmov.s16 q12, q10 ; 51000
+
+ vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4
+ vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4
+ vshr.s16 d2, d2, #4 ; A[8] = (a1 + b1 + 7)>>4
+ vshr.s16 d6, d3, #4 ; B[8] = (a1 + b1 + 7)>>4
+
+
+ vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000
+ vmlal.s16 q10, d28, d17 ; A[12] = d1*2217 + 51000
+ vmlal.s16 q11, d29, d16 ; B[4] = d1*5352 + 12000
+ vmlal.s16 q12, d29, d17 ; B[12] = d1*2217 + 51000
+
+ vceq.s16 q14, q14, #0
+
+ vmlal.s16 q9, d26, d17 ; A[4] = c1*2217 + d1*5352 + 12000
+ vmlsl.s16 q10, d26, d16 ; A[12] = d1*2217 - c1*5352 + 51000
+ vmlal.s16 q11, d27, d17 ; B[4] = c1*2217 + d1*5352 + 12000
+ vmlsl.s16 q12, d27, d16 ; B[12] = d1*2217 - c1*5352 + 51000
+
+ vmvn.s16 q14, q14
+
+ vshrn.s32 d1, q9, #16 ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
+ vshrn.s32 d3, q10, #16 ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
+ vsub.s16 d1, d1, d28 ; A[4] += (d1!=0)
+
+ vshrn.s32 d5, q11, #16 ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
+ vshrn.s32 d7, q12, #16 ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
+ vsub.s16 d5, d5, d29 ; B[4] += (d1!=0)
+
+ vst1.16 {q0, q1}, [r1@128]! ; block A
+ vst1.16 {q2, q3}, [r1@128]! ; block B
- ; Should never get to this.
bx lr
ENDP
-;-----------------
-
-_dct_matrix_
- DCD dct_matrix
-dct_matrix
-; DCW 23170, 30274, 23170, 12540
-; DCW 23170, 12540, -23170,-30274
-; DCW 23170, -12540, -23170, 30274
-; DCW 23170, -30274, 23170,-12540
-; 23170 = 0x5a82
-; -23170 = 0xa57e
-; 30274 = 0x7642
-; -30274 = 0x89be
-; 12540 = 0x30fc
-; -12540 = 0xcf04
- DCD 0x76425a82, 0x30fc5a82
- DCD 0x30fc5a82, 0x89bea57e
- DCD 0xcf045a82, 0x7642a57e
- DCD 0x89be5a82, 0xcf045a82
-
END
+
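For reference, read as scalar C, the comments in the new vp8_short_fdct4x4_neon describe the forward 4x4 DCT below; the 5352/2217 multipliers and the 14500/7500/12000/51000 rounding terms come from the coeff table in the diff, and pitch is in bytes. The 8x4 version now applies the same math to two 4x4 blocks (A and B) in one pass instead of calling the 4x4 routine twice. This is a sketch transcribed from the assembly comments, not the tree's C source:

/* Scalar sketch of the forward 4x4 DCT described by the comments in
 * vp8_short_fdct4x4_neon above; illustrative only. */
static void short_fdct4x4_sketch(const short *input, short *output, int pitch)
{
    int i;
    short *op = output;

    /* Part one: rows, scaled by <<3 */
    for (i = 0; i < 4; i++)
    {
        const short *ip = input + i * (pitch >> 1);
        int a1 = (ip[0] + ip[3]) << 3;
        int b1 = (ip[1] + ip[2]) << 3;
        int c1 = (ip[1] - ip[2]) << 3;
        int d1 = (ip[0] - ip[3]) << 3;

        op[0] = a1 + b1;
        op[2] = a1 - b1;
        op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
        op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;
        op += 4;
    }

    /* Part two: columns, with final rounding */
    for (i = 0; i < 4; i++)
    {
        short *ip = output + i;
        int a1 = ip[0] + ip[12];
        int b1 = ip[4] + ip[8];
        int c1 = ip[4] - ip[8];
        int d1 = ip[0] - ip[12];

        ip[0]  = (a1 + b1 + 7) >> 4;
        ip[8]  = (a1 - b1 + 7) >> 4;
        ip[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
        ip[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;
    }
}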
diff --git a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
index ba3decf6c..22266297a 100644
--- a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
@@ -16,58 +16,85 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_short_walsh4x4_c(short *input, short *output, int pitch)
-
+;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
+; r0 short *input,
+; r1 short *output,
+; r2 int pitch
|vp8_short_walsh4x4_neon| PROC
- vld1.16 {d2}, [r0], r2 ;load input
- vld1.16 {d3}, [r0], r2
- vld1.16 {d4}, [r0], r2
- vld1.16 {d5}, [r0], r2
- ;First for-loop
- ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
- vtrn.32 d2, d4
- vtrn.32 d3, d5
- vtrn.16 d2, d3
- vtrn.16 d4, d5
+ vld1.16 {d0}, [r0@64], r2 ; load input
+ vld1.16 {d1}, [r0@64], r2
+ vld1.16 {d2}, [r0@64], r2
+ vld1.16 {d3}, [r0@64]
- vadd.s16 d6, d2, d5 ;a1 = ip[0]+ip[3]
- vadd.s16 d7, d3, d4 ;b1 = ip[1]+ip[2]
- vsub.s16 d8, d3, d4 ;c1 = ip[1]-ip[2]
- vsub.s16 d9, d2, d5 ;d1 = ip[0]-ip[3]
+ ;First for-loop
+ ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
- vadd.s16 d2, d6, d7 ;op[0] = a1 + b1
- vsub.s16 d4, d6, d7 ;op[2] = a1 - b1
- vadd.s16 d3, d8, d9 ;op[1] = c1 + d1
- vsub.s16 d5, d9, d8 ;op[3] = d1 - c1
+ vmov.s32 q15, #3 ; add 3 to all values
- ;Second for-loop
- ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
- vtrn.32 d2, d4
- vtrn.32 d3, d5
+ vtrn.16 d0, d1
vtrn.16 d2, d3
- vtrn.16 d4, d5
- vadd.s16 d6, d2, d5 ;a1 = ip[0]+ip[12]
- vadd.s16 d7, d3, d4 ;b1 = ip[4]+ip[8]
- vsub.s16 d8, d3, d4 ;c1 = ip[4]-ip[8]
- vsub.s16 d9, d2, d5 ;d1 = ip[0]-ip[12]
+ vadd.s16 d4, d0, d2 ; ip[0] + ip[2]
+ vadd.s16 d5, d1, d3 ; ip[1] + ip[3]
+ vsub.s16 d6, d1, d3 ; ip[1] - ip[3]
+ vsub.s16 d7, d0, d2 ; ip[0] - ip[2]
- vadd.s16 d2, d6, d7 ;a2 = a1 + b1;
- vsub.s16 d4, d6, d7 ;c2 = a1 - b1;
- vadd.s16 d3, d8, d9 ;b2 = c1 + d1;
- vsub.s16 d5, d9, d8 ;d2 = d1 - c1;
+ vshl.s16 d4, d4, #2 ; a1 = (ip[0] + ip[2]) << 2
+ vshl.s16 d5, d5, #2 ; d1 = (ip[1] + ip[3]) << 2
+ vshl.s16 d6, d6, #2 ; c1 = (ip[1] - ip[3]) << 2
+ vceq.s16 d16, d4, #0 ; a1 == 0
+ vshl.s16 d7, d7, #2 ; b1 = (ip[0] - ip[2]) << 2
- vcgt.s16 q3, q1, #0
- vcgt.s16 q4, q2, #0
+ vadd.s16 d0, d4, d5 ; a1 + d1
+ vmvn d16, d16 ; a1 != 0
+ vsub.s16 d3, d4, d5 ; op[3] = a1 - d1
+ vadd.s16 d1, d7, d6 ; op[1] = b1 + c1
+ vsub.s16 d2, d7, d6 ; op[2] = b1 - c1
+ vsub.s16 d0, d0, d16 ; op[0] = a1 + d1 + (a1 != 0)
- vsub.s16 q1, q1, q3
- vsub.s16 q2, q2, q4
-
- vshr.s16 q1, q1, #1
- vshr.s16 q2, q2, #1
-
- vst1.16 {q1, q2}, [r1]
+ ;Second for-loop
+ ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+ vtrn.32 d1, d3
+ vtrn.32 d0, d2
+ vtrn.16 d2, d3
+ vtrn.16 d0, d1
+
+ vaddl.s16 q8, d0, d2 ; a1 = ip[0]+ip[8]
+ vaddl.s16 q9, d1, d3 ; d1 = ip[4]+ip[12]
+ vsubl.s16 q10, d1, d3 ; c1 = ip[4]-ip[12]
+ vsubl.s16 q11, d0, d2 ; b1 = ip[0]-ip[8]
+
+ vadd.s32 q0, q8, q9 ; a2 = a1 + d1
+ vadd.s32 q1, q11, q10 ; b2 = b1 + c1
+ vsub.s32 q2, q11, q10 ; c2 = b1 - c1
+ vsub.s32 q3, q8, q9 ; d2 = a1 - d1
+
+ vclt.s32 q8, q0, #0
+ vclt.s32 q9, q1, #0
+ vclt.s32 q10, q2, #0
+ vclt.s32 q11, q3, #0
+
+ ; subtract -1 (or 0)
+ vsub.s32 q0, q0, q8 ; a2 += a2 < 0
+ vsub.s32 q1, q1, q9 ; b2 += b2 < 0
+ vsub.s32 q2, q2, q10 ; c2 += c2 < 0
+ vsub.s32 q3, q3, q11 ; d2 += d2 < 0
+
+ vadd.s32 q8, q0, q15 ; a2 + 3
+ vadd.s32 q9, q1, q15 ; b2 + 3
+ vadd.s32 q10, q2, q15 ; c2 + 3
+ vadd.s32 q11, q3, q15 ; d2 + 3
+
+ ; vrshrn? would add 1 << 3-1 = 2
+ vshrn.s32 d0, q8, #3
+ vshrn.s32 d1, q9, #3
+ vshrn.s32 d2, q10, #3
+ vshrn.s32 d3, q11, #3
+
+ vst1.16 {q0, q1}, [r1@128]
bx lr
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index e8dbd5d7e..f8f054a76 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -18,7 +18,6 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/arm_csystemdependent.c
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.h
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/encodemb_arm.h
-VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/picklpf_arm.c
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.c
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.h
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/variance_arm.c
@@ -36,7 +35,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_ar
#File list for armv6
# encoder
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_subtract_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_fdct4x4_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
@@ -49,9 +48,8 @@ VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/walsh_v6$(ASM)
#File list for neon
# encoder
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/fastfdct4x4_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/fastfdct8x4_neon$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/fastquantizeb_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/picklpf_arm.c
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/sad8_neon$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/sad16_neon$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/shortfdct_neon$(ASM)
diff --git a/vpx_scale/arm/yv12extend_arm.c b/vpx_scale/arm/neon/yv12extend_arm.c
index d7a8289a9..d7a8289a9 100644
--- a/vpx_scale/arm/yv12extend_arm.c
+++ b/vpx_scale/arm/neon/yv12extend_arm.c
diff --git a/vpx_scale/vpx_scale.mk b/vpx_scale/vpx_scale.mk
index edb5419c3..9680ded3c 100644
--- a/vpx_scale/vpx_scale.mk
+++ b/vpx_scale/vpx_scale.mk
@@ -11,7 +11,6 @@ SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/gen_scalers.c
#arm
SCALE_SRCS-$(HAVE_ARMV7) += arm/scalesystemdependent.c
-SCALE_SRCS-$(HAVE_ARMV7) += arm/yv12extend_arm.c
SCALE_SRCS_REMOVE-$(HAVE_ARMV7) += generic/scalesystemdependent.c
#neon
@@ -19,5 +18,6 @@ SCALE_SRCS-$(HAVE_ARMV7) += arm/neon/vp8_vpxyv12_copyframe_func_neon$(ASM)
SCALE_SRCS-$(HAVE_ARMV7) += arm/neon/vp8_vpxyv12_copyframeyonly_neon$(ASM)
SCALE_SRCS-$(HAVE_ARMV7) += arm/neon/vp8_vpxyv12_copysrcframe_func_neon$(ASM)
SCALE_SRCS-$(HAVE_ARMV7) += arm/neon/vp8_vpxyv12_extendframeborders_neon$(ASM)
+SCALE_SRCS-$(HAVE_ARMV7) += arm/neon/yv12extend_arm.c
SCALE_SRCS-no += $(SCALE_SRCS_REMOVE-yes)