summaryrefslogtreecommitdiff
path: root/vp8/encoder/x86/sad_sse2.asm
diff options
context:
space:
mode:
authorJohann <johannkoenig@google.com>2011-04-07 13:17:22 -0400
committerJohann <johannkoenig@google.com>2011-04-18 16:30:38 -0400
commitc7cfde42a9ec05b72d15ebaa9a59cefed4cd323a (patch)
tree395d38ba42df5e8be5abe33baa028bc937226155 /vp8/encoder/x86/sad_sse2.asm
parentd889035fe6802b64567c2ed250c1dff0eb377acf (diff)
downloadlibvpx-c7cfde42a9ec05b72d15ebaa9a59cefed4cd323a.tar
libvpx-c7cfde42a9ec05b72d15ebaa9a59cefed4cd323a.tar.gz
libvpx-c7cfde42a9ec05b72d15ebaa9a59cefed4cd323a.tar.bz2
libvpx-c7cfde42a9ec05b72d15ebaa9a59cefed4cd323a.zip
Add save/restore xmm registers in x86 assembly code
Went through the code and fixed it. Verified on Windows. Where possible, remove dependencies on xmm[67]. Current code relies on pushing rbp to the stack to get 16-byte alignment. This broke when rbp wasn't pushed (vp8/encoder/x86/sad_sse3.asm). Work around this by using unaligned memory accesses. Revisit this and the offsets in vp8/encoder/x86/sad_sse3.asm in another change to SAVE_XMM. Change-Id: I5f940994d3ebfd977c3d68446cef20fd78b07877
Diffstat (limited to 'vp8/encoder/x86/sad_sse2.asm')
-rw-r--r--vp8/encoder/x86/sad_sse2.asm18
1 file changed, 10 insertions, 8 deletions
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
index cc6bc3cd9..d9ac3ff4f 100644
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -21,6 +21,7 @@ sym(vp8_sad16x16_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM ; 6
push rsi
push rdi
; end prolog
@@ -34,7 +35,7 @@ sym(vp8_sad16x16_wmt):
lea rcx, [rsi+rax*8]
lea rcx, [rcx+rax*8]
- pxor xmm7, xmm7
+ pxor xmm6, xmm6
x16x16sad_wmt_loop:
@@ -52,32 +53,33 @@ x16x16sad_wmt_loop:
punpcklbw xmm1, xmm3
psadbw xmm0, xmm1
- movq xmm6, QWORD PTR [rsi+rax+8]
+ movq xmm2, QWORD PTR [rsi+rax+8]
movq xmm3, QWORD PTR [rdi+rdx+8]
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
- punpcklbw xmm4, xmm6
+ punpcklbw xmm4, xmm2
punpcklbw xmm5, xmm3
psadbw xmm4, xmm5
- paddw xmm7, xmm0
- paddw xmm7, xmm4
+ paddw xmm6, xmm0
+ paddw xmm6, xmm4
cmp rsi, rcx
jne x16x16sad_wmt_loop
- movq xmm0, xmm7
- psrldq xmm7, 8
+ movq xmm0, xmm6
+ psrldq xmm6, 8
- paddw xmm0, xmm7
+ paddw xmm0, xmm6
movq rax, xmm0
; begin epilog
pop rdi
pop rsi
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret