author    Yunqing Wang <yunqingwang@google.com>    2011-02-22 18:01:08 -0500
committer Yunqing Wang <yunqingwang@google.com>    2011-02-28 11:25:55 -0500
commit    d96ba65a23f383c64ea2b244503636c96d1e8437 (patch)
tree      b91c9c86e4b584c2ed87ddde4e4a3db078928df2 /vp8
parent    945dad277d3c09708956d60bf7844d47e0eeed1f (diff)
Add prefetch before variance calculation
This improved encoding performance by 0.5% (good, speed 1) to 1.5% (good, speed 5).

Change-Id: I843d72a0d68a90b5f694adf770943e4a4618f50e
Diffstat (limited to 'vp8')
-rw-r--r--  vp8/encoder/x86/variance_impl_sse2.asm | 32
1 file changed, 28 insertions(+), 4 deletions(-)
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index 7178e7e31..6cdc47bc9 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -85,10 +85,9 @@ sym(vp8_get16x16var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
+ push rbx
push rsi
push rdi
- sub rsp, 16
; end prolog
mov rsi, arg(0) ;[src_ptr]
@@ -97,6 +96,29 @@ sym(vp8_get16x16var_sse2):
movsxd rax, DWORD PTR arg(1) ;[source_stride]
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ ; Prefetch data
+ lea rcx, [rax+rax*2]
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+rax]
+ prefetcht0 [rsi+rax*2]
+ prefetcht0 [rsi+rcx]
+ lea rbx, [rsi+rax*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax*2]
+ prefetcht0 [rbx+rcx]
+
+ lea rcx, [rdx+rdx*2]
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+rdx]
+ prefetcht0 [rdi+rdx*2]
+ prefetcht0 [rdi+rcx]
+ lea rbx, [rdi+rdx*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx*2]
+ prefetcht0 [rbx+rcx]
+
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
@@ -107,6 +129,9 @@ var16loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
+ prefetcht0 [rsi+rax*8]
+ prefetcht0 [rdi+rdx*8]
+
movdqa xmm3, xmm1
movdqa xmm4, xmm2
@@ -178,10 +203,9 @@ var16loop:
; begin epilog
- add rsp, 16
pop rdi
pop rsi
- RESTORE_GOT
+ pop rbx
UNSHADOW_ARGS
pop rbp
ret
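
The change boils down to two software-prefetch steps around the 16x16 variance kernel: before the accumulation loop, prefetcht0 touches the first eight rows of both the source and reconstruction buffers (rcx holds 3*stride and rbx the address four rows in, so the eight row addresses can be formed cheaply), and inside the loop it keeps fetching eight rows ahead of the row currently being processed. Below is a minimal C sketch of the same access pattern using the _mm_prefetch intrinsic; the function name and the scalar accumulation are illustrative stand-ins for the SSE2 assembly, not the actual vp8_get16x16var_sse2 implementation.

#include <xmmintrin.h>   /* _mm_prefetch, _MM_HINT_T0 */

/* Illustrative C counterpart of the prefetch pattern added to
 * vp8_get16x16var_sse2: accumulate the sum and sum of squared differences
 * over a 16x16 block, prefetching the first 8 rows of both buffers up front
 * and then 8 rows ahead of the current row inside the loop.  The name and
 * scalar math are a sketch only, not the real SSE2 implementation. */
static void get16x16var_prefetch_sketch(const unsigned char *src, int src_stride,
                                        const unsigned char *ref, int ref_stride,
                                        unsigned int *sse, int *sum)
{
    int i, j, s = 0;
    unsigned int ss = 0;

    /* Matches the prefetcht0 block inserted before the loop: warm the cache
     * with rows 0..7 of the source and reconstruction buffers. */
    for (i = 0; i < 8; i++) {
        _mm_prefetch((const char *)(src + i * src_stride), _MM_HINT_T0);
        _mm_prefetch((const char *)(ref + i * ref_stride), _MM_HINT_T0);
    }

    for (i = 0; i < 16; i++) {
        /* Matches prefetcht0 [rsi+rax*8] / [rdi+rdx*8] in the loop body:
         * stay 8 rows ahead of the row being accumulated. */
        _mm_prefetch((const char *)(src + 8 * src_stride), _MM_HINT_T0);
        _mm_prefetch((const char *)(ref + 8 * ref_stride), _MM_HINT_T0);

        for (j = 0; j < 16; j++) {
            const int diff = src[j] - ref[j];
            s  += diff;
            ss += (unsigned int)(diff * diff);
        }
        src += src_stride;
        ref += ref_stride;
    }

    *sum = s;
    *sse = ss;
}

The up-front prefetches hide the latency of the first cache misses before any arithmetic starts, while the in-loop prefetches stay eight rows ahead of the streaming reads; hints issued past the end of the block are harmless, since prefetcht0 never faults.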