summaryrefslogtreecommitdiff
path: root/vp8/common/arm/armv6
diff options
context:
space:
mode:
authorTaekhyun Kim <takim@nvidia.com>2011-06-08 12:12:45 -0700
committerTaekhyun Kim <takim@nvidia.com>2011-06-17 14:04:53 -0700
commit458fb8f4911a3f53f0601b883069f844c2e40fa5 (patch)
treebbe5220090bc5453ec7955a5447ca200f48f59ce /vp8/common/arm/armv6
parent04edde2b114da4ac94cea13bbe9a6fe0ed474576 (diff)
downloadlibvpx-458fb8f4911a3f53f0601b883069f844c2e40fa5.tar
libvpx-458fb8f4911a3f53f0601b883069f844c2e40fa5.tar.gz
libvpx-458fb8f4911a3f53f0601b883069f844c2e40fa5.tar.bz2
libvpx-458fb8f4911a3f53f0601b883069f844c2e40fa5.zip
utilize preload in ARMv6 MC/LPF/Copy routines
About 9~10% decoding perf improvement on non-Neon ARM cpus Change-Id: I7dc2a026764e84e9c2faf282b4ae113090326837
Diffstat (limited to 'vp8/common/arm/armv6')
-rw-r--r--vp8/common/arm/armv6/bilinearfilter_v6.asm13
-rw-r--r--vp8/common/arm/armv6/copymem16x16_v6.asm10
-rw-r--r--vp8/common/arm/armv6/filter_v6.asm215
-rw-r--r--vp8/common/arm/armv6/loopfilter_v6.asm14
-rw-r--r--vp8/common/arm/armv6/simpleloopfilter_v6.asm8
-rw-r--r--vp8/common/arm/armv6/sixtappredict8x4_v6.asm6
6 files changed, 233 insertions, 33 deletions
diff --git a/vp8/common/arm/armv6/bilinearfilter_v6.asm b/vp8/common/arm/armv6/bilinearfilter_v6.asm
index a86ed5d0a..9704b4210 100644
--- a/vp8/common/arm/armv6/bilinearfilter_v6.asm
+++ b/vp8/common/arm/armv6/bilinearfilter_v6.asm
@@ -30,11 +30,11 @@
ldr r4, [sp, #36] ; width
mov r12, r3 ; outer-loop counter
- sub r2, r2, r4 ; src increment for height loop
- ;;IF ARCHITECTURE=6
- pld [r0]
- ;;ENDIF
+ add r7, r2, r4 ; preload next row
+ pld [r0, r7]
+
+ sub r2, r2, r4 ; src increment for height loop
ldr r5, [r11] ; load up filter coefficients
@@ -96,9 +96,8 @@
add r0, r0, r2 ; move to next input row
subs r12, r12, #1
- ;;IF ARCHITECTURE=6
- pld [r0]
- ;;ENDIF
+ add r9, r2, r4, lsl #1 ; adding back block width
+ pld [r0, r9] ; preload next row
add r11, r11, #2 ; move over to next column
mov r1, r11
diff --git a/vp8/common/arm/armv6/copymem16x16_v6.asm b/vp8/common/arm/armv6/copymem16x16_v6.asm
index fca91a0db..abf048c2f 100644
--- a/vp8/common/arm/armv6/copymem16x16_v6.asm
+++ b/vp8/common/arm/armv6/copymem16x16_v6.asm
@@ -22,9 +22,7 @@
;push {r4-r7}
;preload
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
+ pld [r0, #31] ; preload for next 16x16 block
ands r4, r0, #15
beq copy_mem16x16_fast
@@ -90,6 +88,8 @@ copy_mem16x16_1_loop
ldrneb r6, [r0, #2]
ldrneb r7, [r0, #3]
+ pld [r0, #31] ; preload for next 16x16 block
+
bne copy_mem16x16_1_loop
ldmia sp!, {r4 - r7}
@@ -121,6 +121,8 @@ copy_mem16x16_4_loop
ldrne r6, [r0, #8]
ldrne r7, [r0, #12]
+ pld [r0, #31] ; preload for next 16x16 block
+
bne copy_mem16x16_4_loop
ldmia sp!, {r4 - r7}
@@ -148,6 +150,7 @@ copy_mem16x16_8_loop
add r2, r2, r3
+ pld [r0, #31] ; preload for next 16x16 block
bne copy_mem16x16_8_loop
ldmia sp!, {r4 - r7}
@@ -171,6 +174,7 @@ copy_mem16x16_fast_loop
;stm r2, {r4-r7}
add r2, r2, r3
+ pld [r0, #31] ; preload for next 16x16 block
bne copy_mem16x16_fast_loop
ldmia sp!, {r4 - r7}
diff --git a/vp8/common/arm/armv6/filter_v6.asm b/vp8/common/arm/armv6/filter_v6.asm
index 03b5bccd7..1ba91ddd6 100644
--- a/vp8/common/arm/armv6/filter_v6.asm
+++ b/vp8/common/arm/armv6/filter_v6.asm
@@ -10,6 +10,8 @@
EXPORT |vp8_filter_block2d_first_pass_armv6|
+ EXPORT |vp8_filter_block2d_first_pass_16x16_armv6|
+ EXPORT |vp8_filter_block2d_first_pass_8x8_armv6|
EXPORT |vp8_filter_block2d_second_pass_armv6|
EXPORT |vp8_filter4_block2d_second_pass_armv6|
EXPORT |vp8_filter_block2d_first_pass_only_armv6|
@@ -40,11 +42,6 @@
add r12, r3, #16 ; square off the output
sub sp, sp, #4
- ;;IF ARCHITECTURE=6
- ;pld [r0, #-2]
- ;;pld [r0, #30]
- ;;ENDIF
-
ldr r4, [r11] ; load up packed filter coefficients
ldr r5, [r11, #4]
ldr r6, [r11, #8]
@@ -101,15 +98,10 @@
bne width_loop_1st_6
- ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
- ;;IF ARCHITECTURE=6
- ;pld [r0, r2]
- ;;pld [r0, r9]
- ;;ENDIF
-
ldr r1, [sp] ; load and update dst address
subs r7, r7, #0x10000
add r0, r0, r2 ; move to next input line
+
add r1, r1, #2 ; move over to next column
str r1, [sp]
@@ -120,6 +112,192 @@
ENDP
+; --------------------------
+; 16x16 version
+; -----------------------------
+|vp8_filter_block2d_first_pass_16x16_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r7, [sp, #36] ; output height
+
+ add r4, r2, #18 ; preload next low
+ pld [r0, r4]
+
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
+ add r12, r3, #16 ; square off the output
+ sub sp, sp, #4
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r1, [sp] ; push destination to stack
+ mov r7, r7, lsl #16 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_16_6|
+ ldrb r8, [r0, #-2] ; load source data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+ orr r7, r7, r3, lsr #2 ; construct loop counter
+
+|width_loop_1st_16_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+ smuad lr, lr, r4 ; apply the filter
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ sub r7, r7, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r11, r10, r6, r8
+
+ ands r10, r7, #0xff ; test loop counter
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r11, r11, #0x40
+ ldrneb r9, [r0, #-1]
+ usat r11, #8, r11, asr #7
+
+ strh lr, [r1], r12 ; result is transposed and stored, which
+ ; will make second pass filtering easier.
+ ldrneb r10, [r0], #2
+ strh r11, [r1], r12
+
+ bne width_loop_1st_16_6
+
+ ldr r1, [sp] ; load and update dst address
+ subs r7, r7, #0x10000
+ add r0, r0, r2 ; move to next input line
+
+ add r11, r2, #34 ; adding back block width(=16)
+ pld [r0, r11] ; preload next low
+
+ add r1, r1, #2 ; move over to next column
+ str r1, [sp]
+
+ bne height_loop_1st_16_6
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+; --------------------------
+; 8x8 version
+; -----------------------------
+|vp8_filter_block2d_first_pass_8x8_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r7, [sp, #36] ; output height
+
+ add r4, r2, #10 ; preload next low
+ pld [r0, r4]
+
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
+ add r12, r3, #16 ; square off the output
+ sub sp, sp, #4
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r1, [sp] ; push destination to stack
+ mov r7, r7, lsl #16 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_8_6|
+ ldrb r8, [r0, #-2] ; load source data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+ orr r7, r7, r3, lsr #2 ; construct loop counter
+
+|width_loop_1st_8_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+ smuad lr, lr, r4 ; apply the filter
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ sub r7, r7, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r11, r10, r6, r8
+
+ ands r10, r7, #0xff ; test loop counter
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r11, r11, #0x40
+ ldrneb r9, [r0, #-1]
+ usat r11, #8, r11, asr #7
+
+ strh lr, [r1], r12 ; result is transposed and stored, which
+ ; will make second pass filtering easier.
+ ldrneb r10, [r0], #2
+ strh r11, [r1], r12
+
+ bne width_loop_1st_8_6
+
+ ldr r1, [sp] ; load and update dst address
+ subs r7, r7, #0x10000
+ add r0, r0, r2 ; move to next input line
+
+ add r11, r2, #18 ; adding back block width(=8)
+ pld [r0, r11] ; preload next low
+
+ add r1, r1, #2 ; move over to next column
+ str r1, [sp]
+
+ bne height_loop_1st_8_6
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
;---------------------------------
; r0 short *src_ptr,
; r1 unsigned char *output_ptr,
@@ -262,6 +440,10 @@
|vp8_filter_block2d_first_pass_only_armv6| PROC
stmdb sp!, {r4 - r11, lr}
+ add r7, r2, r3 ; preload next low
+ add r7, r7, #2
+ pld [r0, r7]
+
ldr r4, [sp, #36] ; output pitch
ldr r11, [sp, #40] ; HFilter address
sub sp, sp, #8
@@ -330,16 +512,15 @@
bne width_loop_1st_only_6
- ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
- ;;IF ARCHITECTURE=6
- ;pld [r0, r2]
- ;;pld [r0, r9]
- ;;ENDIF
-
ldr lr, [sp] ; load back output pitch
ldr r12, [sp, #4] ; load back output pitch
subs r7, r7, #1
add r0, r0, r12 ; updata src for next loop
+
+ add r11, r12, r3 ; preload next low
+ add r11, r11, #2
+ pld [r0, r11]
+
add r1, r1, lr ; update dst for next loop
bne height_loop_1st_only_6
diff --git a/vp8/common/arm/armv6/loopfilter_v6.asm b/vp8/common/arm/armv6/loopfilter_v6.asm
index b6417dee6..c7441b055 100644
--- a/vp8/common/arm/armv6/loopfilter_v6.asm
+++ b/vp8/common/arm/armv6/loopfilter_v6.asm
@@ -253,12 +253,6 @@ count RN r5
subs count, count, #1
- ;pld [src]
- ;pld [src, pstep]
- ;pld [src, pstep, lsl #1]
- ;pld [src, pstep, lsl #2]
- ;pld [src, pstep, lsl #3]
-
ldrne r9, [src], pstep ; p3
ldrne r10, [src], pstep ; p2
ldrne r11, [src], pstep ; p1
@@ -857,15 +851,19 @@ count RN r5
sub src, src, #4 ; move src pointer down by 4
ldr count, [sp, #40] ; count for 8-in-parallel
ldr r12, [sp, #36] ; load thresh address
+ pld [src, #23] ; preload for next block
sub sp, sp, #16 ; create temp buffer
ldr r6, [src], pstep ; load source data
ldr r4, [r2], #4 ; flimit
+ pld [src, #23]
ldr r7, [src], pstep
ldr r2, [r3], #4 ; limit
+ pld [src, #23]
ldr r8, [src], pstep
uadd8 r4, r4, r4 ; flimit * 2
ldr r3, [r12], #4 ; thresh
+ pld [src, #23]
ldr lr, [src], pstep
mov count, count, lsl #1 ; 4-in-parallel
uadd8 r4, r4, r2 ; flimit * 2 + limit
@@ -1242,9 +1240,13 @@ count RN r5
sub src, src, #4
subs count, count, #1
+ pld [src, #23] ; preload for next block
ldrne r6, [src], pstep ; load source data
+ pld [src, #23]
ldrne r7, [src], pstep
+ pld [src, #23]
ldrne r8, [src], pstep
+ pld [src, #23]
ldrne lr, [src], pstep
bne MBVnext8
diff --git a/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
index 013712036..40a71f49d 100644
--- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm
+++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
@@ -154,22 +154,26 @@ pstep RN r1
; load soure data to r7, r8, r9, r10
ldrh r3, [src, #-2]
+ pld [src, #23] ; preload for next block
ldrh r4, [src], pstep
uadd8 r12, r12, r12 ; flimit * 2
ldrh r5, [src, #-2]
+ pld [src, #23]
ldrh r6, [src], pstep
uadd8 r12, r12, r7 ; flimit * 2 + limit
pkhbt r7, r3, r4, lsl #16
ldrh r3, [src, #-2]
+ pld [src, #23]
ldrh r4, [src], pstep
ldr r11, [sp, #40] ; count (r11) for 8-in-parallel
pkhbt r8, r5, r6, lsl #16
ldrh r5, [src, #-2]
+ pld [src, #23]
ldrh r6, [src], pstep
mov r11, r11, lsl #1 ; 4-in-parallel
@@ -259,19 +263,23 @@ pstep RN r1
; load soure data to r7, r8, r9, r10
ldrneh r3, [src, #-2]
+ pld [src, #23] ; preload for next block
ldrneh r4, [src], pstep
ldrneh r5, [src, #-2]
+ pld [src, #23]
ldrneh r6, [src], pstep
pkhbt r7, r3, r4, lsl #16
ldrneh r3, [src, #-2]
+ pld [src, #23]
ldrneh r4, [src], pstep
pkhbt r8, r5, r6, lsl #16
ldrneh r5, [src, #-2]
+ pld [src, #23]
ldrneh r6, [src], pstep
bne simple_vnext8
diff --git a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
index 029e02aa0..3fda1cefa 100644
--- a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
+++ b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -35,6 +35,9 @@
ldr r12, _filter8_coeff_
sub r0, r0, r1, lsl #1
+ add r3, r1, #10 ; preload next low
+ pld [r0, r3]
+
add r2, r12, r2, lsl #4 ;calculate filter location
add r0, r0, #3 ;adjust src only for loading convinience
@@ -110,6 +113,9 @@
add r0, r0, r1 ; move to next input line
+ add r11, r1, #18 ; preload next low. adding back block width(=8), which is subtracted earlier
+ pld [r0, r11]
+
bne first_pass_hloop_v6
;second pass filter