diff options
author | Yunqing Wang <yunqingwang@google.com> | 2011-02-22 18:01:08 -0500 |
---|---|---|
committer | Yunqing Wang <yunqingwang@google.com> | 2011-02-28 11:25:55 -0500 |
commit | d96ba65a23f383c64ea2b244503636c96d1e8437 (patch) | |
tree | b91c9c86e4b584c2ed87ddde4e4a3db078928df2 /vp8/encoder | |
parent | 945dad277d3c09708956d60bf7844d47e0eeed1f (diff) | |
download | libvpx-d96ba65a23f383c64ea2b244503636c96d1e8437.tar libvpx-d96ba65a23f383c64ea2b244503636c96d1e8437.tar.gz libvpx-d96ba65a23f383c64ea2b244503636c96d1e8437.tar.bz2 libvpx-d96ba65a23f383c64ea2b244503636c96d1e8437.zip |
Add prefetch before variance calculation
This improved encoding performance by 0.5% (good, speed 1) to
1.5% (good, speed 5).
Change-Id: I843d72a0d68a90b5f694adf770943e4a4618f50e
Diffstat (limited to 'vp8/encoder')
-rw-r--r-- | vp8/encoder/x86/variance_impl_sse2.asm | 32 |
1 files changed, 28 insertions, 4 deletions
```diff
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index 7178e7e31..6cdc47bc9 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -85,10 +85,9 @@ sym(vp8_get16x16var_sse2):
     push rbp
     mov rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    GET_GOT rbx
+    push rbx
     push rsi
     push rdi
-    sub rsp, 16
     ; end prolog
 
     mov rsi, arg(0) ;[src_ptr]
@@ -97,6 +96,29 @@ sym(vp8_get16x16var_sse2):
     movsxd rax, DWORD PTR arg(1) ;[source_stride]
     movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
 
+    ; Prefetch data
+    lea rcx, [rax+rax*2]
+    prefetcht0 [rsi]
+    prefetcht0 [rsi+rax]
+    prefetcht0 [rsi+rax*2]
+    prefetcht0 [rsi+rcx]
+    lea rbx, [rsi+rax*4]
+    prefetcht0 [rbx]
+    prefetcht0 [rbx+rax]
+    prefetcht0 [rbx+rax*2]
+    prefetcht0 [rbx+rcx]
+
+    lea rcx, [rdx+rdx*2]
+    prefetcht0 [rdi]
+    prefetcht0 [rdi+rdx]
+    prefetcht0 [rdi+rdx*2]
+    prefetcht0 [rdi+rcx]
+    lea rbx, [rdi+rdx*4]
+    prefetcht0 [rbx]
+    prefetcht0 [rbx+rdx]
+    prefetcht0 [rbx+rdx*2]
+    prefetcht0 [rbx+rcx]
+
     pxor xmm0, xmm0 ; clear xmm0 for unpack
     pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
 
@@ -107,6 +129,9 @@ var16loop:
     movdqu xmm1, XMMWORD PTR [rsi]
     movdqu xmm2, XMMWORD PTR [rdi]
 
+    prefetcht0 [rsi+rax*8]
+    prefetcht0 [rdi+rdx*8]
+
     movdqa xmm3, xmm1
     movdqa xmm4, xmm2
 
@@ -178,10 +203,9 @@ var16loop:
 
 
     ; begin epilog
-    add rsp, 16
     pop rdi
     pop rsi
-    RESTORE_GOT
+    pop rbx
     UNSHADOW_ARGS
     pop rbp
     ret
```