diff options
| author | Johann <johannkoenig@google.com> | 2011-04-07 13:17:22 -0400 |
|---|---|---|
| committer | Johann <johannkoenig@google.com> | 2011-04-18 16:30:38 -0400 |
| commit | c7cfde42a9ec05b72d15ebaa9a59cefed4cd323a (patch) | |
| tree | 395d38ba42df5e8be5abe33baa028bc937226155 /vp8/encoder/x86/sad_sse2.asm | |
| parent | d889035fe6802b64567c2ed250c1dff0eb377acf (diff) | |
| download | libvpx-c7cfde42a9ec05b72d15ebaa9a59cefed4cd323a.tar libvpx-c7cfde42a9ec05b72d15ebaa9a59cefed4cd323a.tar.gz libvpx-c7cfde42a9ec05b72d15ebaa9a59cefed4cd323a.tar.bz2 libvpx-c7cfde42a9ec05b72d15ebaa9a59cefed4cd323a.zip |
Add save/restore xmm registers in x86 assembly code
Went through the code and fixed it. Verified on Windows.
Where possible, remove dependencies on xmm[67]
Current code relies on pushing rbp to the stack to get 16 byte
alignment. This broke when rbp wasn't pushed
(vp8/encoder/x86/sad_sse3.asm). Work around this by using unaligned
memory accesses. Revisit this and the offsets in
vp8/encoder/x86/sad_sse3.asm in another change to SAVE_XMM.
Change-Id: I5f940994d3ebfd977c3d68446cef20fd78b07877
Diffstat (limited to 'vp8/encoder/x86/sad_sse2.asm')
-rw-r--r-- | vp8/encoder/x86/sad_sse2.asm | 18 |
1 file changed, 10 insertions, 8 deletions
```diff
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
index cc6bc3cd9..d9ac3ff4f 100644
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -21,6 +21,7 @@ sym(vp8_sad16x16_wmt):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
+    SAVE_XMM ; 6
     push        rsi
     push        rdi
     ; end prolog
@@ -34,7 +35,7 @@ sym(vp8_sad16x16_wmt):
     lea         rcx, [rsi+rax*8]

     lea         rcx, [rcx+rax*8]
-    pxor        xmm7, xmm7
+    pxor        xmm6, xmm6

 x16x16sad_wmt_loop:
@@ -52,32 +53,33 @@ x16x16sad_wmt_loop:
     punpcklbw   xmm1, xmm3

     psadbw      xmm0, xmm1
-    movq        xmm6, QWORD PTR [rsi+rax+8]
+    movq        xmm2, QWORD PTR [rsi+rax+8]

     movq        xmm3, QWORD PTR [rdi+rdx+8]
     lea         rsi, [rsi+rax*2]

     lea         rdi, [rdi+rdx*2]
-    punpcklbw   xmm4, xmm6
+    punpcklbw   xmm4, xmm2

     punpcklbw   xmm5, xmm3
     psadbw      xmm4, xmm5

-    paddw       xmm7, xmm0
-    paddw       xmm7, xmm4
+    paddw       xmm6, xmm0
+    paddw       xmm6, xmm4

     cmp         rsi, rcx
     jne         x16x16sad_wmt_loop

-    movq        xmm0, xmm7
-    psrldq      xmm7, 8
+    movq        xmm0, xmm6
+    psrldq      xmm6, 8

-    paddw       xmm0, xmm7
+    paddw       xmm0, xmm6
     movq        rax, xmm0

     ; begin epilog
     pop rdi
     pop rsi
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop rbp
     ret
```