author    Yunqing Wang <yunqingwang@google.com>    2011-02-22 18:01:08 -0500
committer Yunqing Wang <yunqingwang@google.com>    2011-02-28 11:25:55 -0500
commit    d96ba65a23f383c64ea2b244503636c96d1e8437 (patch)
tree      b91c9c86e4b584c2ed87ddde4e4a3db078928df2 /vp8
parent    945dad277d3c09708956d60bf7844d47e0eeed1f (diff)
Add prefetch before variance calculation
This improved encoding performance by 0.5% (good, speed 1) to 1.5% (good, speed 5).

Change-Id: I843d72a0d68a90b5f694adf770943e4a4618f50e
Diffstat (limited to 'vp8')
-rw-r--r--  vp8/encoder/x86/variance_impl_sse2.asm | 32
1 file changed, 28 insertions(+), 4 deletions(-)
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index 7178e7e31..6cdc47bc9 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -85,10 +85,9 @@ sym(vp8_get16x16var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
+ push rbx
push rsi
push rdi
- sub rsp, 16
; end prolog
mov rsi, arg(0) ;[src_ptr]
@@ -97,6 +96,29 @@ sym(vp8_get16x16var_sse2):
movsxd rax, DWORD PTR arg(1) ;[source_stride]
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ ; Prefetch data
+ lea rcx, [rax+rax*2]
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+rax]
+ prefetcht0 [rsi+rax*2]
+ prefetcht0 [rsi+rcx]
+ lea rbx, [rsi+rax*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax*2]
+ prefetcht0 [rbx+rcx]
+
+ lea rcx, [rdx+rdx*2]
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+rdx]
+ prefetcht0 [rdi+rdx*2]
+ prefetcht0 [rdi+rcx]
+ lea rbx, [rdi+rdx*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx*2]
+ prefetcht0 [rbx+rcx]
+
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
@@ -107,6 +129,9 @@ var16loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
+ prefetcht0 [rsi+rax*8]
+ prefetcht0 [rdi+rdx*8]
+
movdqa xmm3, xmm1
movdqa xmm4, xmm2
@@ -178,10 +203,9 @@ var16loop:
; begin epilog
- add rsp, 16
pop rdi
pop rsi
- RESTORE_GOT
+ pop rbx
UNSHADOW_ARGS
pop rbp
ret
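
The change boils down to two software-prefetch steps around the 16x16 variance kernel: before the accumulation loop, prefetcht0 touches the first eight rows of both the source and reconstruction buffers (rcx holds 3*stride and rbx the address four rows in, so the eight row addresses can be formed cheaply), and inside the loop it keeps fetching eight rows ahead of the row currently being processed. Below is a minimal C sketch of the same access pattern using the _mm_prefetch intrinsic; the function name and the scalar accumulation are illustrative stand-ins for the SSE2 assembly, not the actual vp8_get16x16var_sse2 implementation.

#include <xmmintrin.h>   /* _mm_prefetch, _MM_HINT_T0 */

/* Illustrative C counterpart of the prefetch pattern added to
 * vp8_get16x16var_sse2: accumulate the sum and sum of squared differences
 * over a 16x16 block, prefetching the first 8 rows of both buffers up front
 * and then 8 rows ahead of the current row inside the loop.  The name and
 * scalar math are a sketch only, not the real SSE2 implementation. */
static void get16x16var_prefetch_sketch(const unsigned char *src, int src_stride,
                                        const unsigned char *ref, int ref_stride,
                                        unsigned int *sse, int *sum)
{
    int i, j, s = 0;
    unsigned int ss = 0;

    /* Matches the prefetcht0 block inserted before the loop: warm the cache
     * with rows 0..7 of the source and reconstruction buffers. */
    for (i = 0; i < 8; i++) {
        _mm_prefetch((const char *)(src + i * src_stride), _MM_HINT_T0);
        _mm_prefetch((const char *)(ref + i * ref_stride), _MM_HINT_T0);
    }

    for (i = 0; i < 16; i++) {
        /* Matches prefetcht0 [rsi+rax*8] / [rdi+rdx*8] in the loop body:
         * stay 8 rows ahead of the row being accumulated. */
        _mm_prefetch((const char *)(src + 8 * src_stride), _MM_HINT_T0);
        _mm_prefetch((const char *)(ref + 8 * ref_stride), _MM_HINT_T0);

        for (j = 0; j < 16; j++) {
            const int diff = src[j] - ref[j];
            s  += diff;
            ss += (unsigned int)(diff * diff);
        }
        src += src_stride;
        ref += ref_stride;
    }

    *sum = s;
    *sse = ss;
}

The up-front prefetches hide the latency of the first cache misses before any arithmetic starts, while the in-loop prefetches stay eight rows ahead of the streaming reads; hints issued past the end of the block are harmless, since prefetcht0 never faults.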