From c7cfde42a9ec05b72d15ebaa9a59cefed4cd323a Mon Sep 17 00:00:00 2001
From: Johann
Date: Thu, 7 Apr 2011 13:17:22 -0400
Subject: Add save/restore xmm registers in x86 assembly code

On Windows x64, xmm6-xmm15 are callee-saved, so any function that
clobbers them must save and restore their contents. Went through the
code and fixed it. Verified on Windows.

Where possible, remove dependencies on xmm6 and xmm7 so the registers
do not need to be saved at all.

Current code relies on pushing rbp to the stack to get 16-byte
alignment. This broke when rbp wasn't pushed
(vp8/encoder/x86/sad_sse3.asm). Work around this by using unaligned
memory accesses.

Revisit this and the offsets in vp8/encoder/x86/sad_sse3.asm in
another change to SAVE_XMM.

Change-Id: I5f940994d3ebfd977c3d68446cef20fd78b07877
---
 vp8/common/x86/idctllm_sse2.asm   | 50 +++++++++++++++++++++------------------
 vp8/common/x86/subpixel_ssse3.asm | 12 ++++++++++
 2 files changed, 39 insertions(+), 23 deletions(-)

(limited to 'vp8/common')

diff --git a/vp8/common/x86/idctllm_sse2.asm b/vp8/common/x86/idctllm_sse2.asm
index edee1578e..c873869ab 100644
--- a/vp8/common/x86/idctllm_sse2.asm
+++ b/vp8/common/x86/idctllm_sse2.asm
@@ -32,9 +32,6 @@ sym(idct_dequant_0_2x_sse2):
     mov         rdx,            arg(1) ; dequant
     mov         rax,            arg(0) ; qcoeff
 
-    ; Zero out xmm7, for use unpacking
-    pxor        xmm7,           xmm7
-
     movd        xmm4,           [rax]
     movd        xmm5,           [rdx]
 
@@ -43,9 +40,12 @@ sym(idct_dequant_0_2x_sse2):
 
     pmullw      xmm4,           xmm5
 
+    ; Zero out xmm5, for use unpacking
+    pxor        xmm5,           xmm5
+
     ; clear coeffs
-    movd        [rax],          xmm7
-    movd        [rax+32],       xmm7
+    movd        [rax],          xmm5
+    movd        [rax+32],       xmm5
 ;pshufb
     pshuflw     xmm4,           xmm4,       00000000b
     pshufhw     xmm4,           xmm4,       00000000b
@@ -62,10 +62,10 @@ sym(idct_dequant_0_2x_sse2):
     lea         rcx,            [3*rcx]
     movq        xmm3,           [rax+rcx]
 
-    punpcklbw   xmm0,           xmm7
-    punpcklbw   xmm1,           xmm7
-    punpcklbw   xmm2,           xmm7
-    punpcklbw   xmm3,           xmm7
+    punpcklbw   xmm0,           xmm5
+    punpcklbw   xmm1,           xmm5
+    punpcklbw   xmm2,           xmm5
+    punpcklbw   xmm3,           xmm5
 
     mov         rax,            arg(3) ; dst
     movsxd      rdx,            dword ptr arg(4) ; dst_stride
@@ -77,10 +77,10 @@ sym(idct_dequant_0_2x_sse2):
     paddw       xmm3,           xmm4
 
     ; pack up before storing
-    packuswb    xmm0,           xmm7
-    packuswb    xmm1,           xmm7
-    packuswb    xmm2,           xmm7
-    packuswb    xmm3,           xmm7
+    packuswb    xmm0,           xmm5
+    packuswb    xmm1,           xmm5
+    packuswb    xmm2,           xmm5
+    packuswb    xmm3,           xmm5
 
     ; store blocks back out
     movq        [rax],          xmm0
@@ -102,6 +102,7 @@ sym(idct_dequant_full_2x_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -347,6 +348,7 @@ sym(idct_dequant_full_2x_sse2):
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -377,8 +379,8 @@ sym(idct_dequant_dc_0_2x_sse2):
     mov         rdi,            arg(3) ; dst
     mov         rdx,            arg(5) ; dc
 
-    ; Zero out xmm7, for use unpacking
-    pxor        xmm7,           xmm7
+    ; Zero out xmm5, for use unpacking
+    pxor        xmm5,           xmm5
 
     ; load up 2 dc words here == 2*16 = doubleword
     movd        xmm4,           [rdx]
@@ -398,10 +400,10 @@
     psraw       xmm4,           3
 
     ; Predict buffer needs to be expanded from bytes to words
-    punpcklbw   xmm0,           xmm7
-    punpcklbw   xmm1,           xmm7
-    punpcklbw   xmm2,           xmm7
-    punpcklbw   xmm3,           xmm7
+    punpcklbw   xmm0,           xmm5
+    punpcklbw   xmm1,           xmm5
+    punpcklbw   xmm2,           xmm5
+    punpcklbw   xmm3,           xmm5
 
     ; Add to predict buffer
     paddw       xmm0,           xmm4
@@ -410,10 +412,10 @@
     paddw       xmm3,           xmm4
 
     ; pack up before storing
-    packuswb    xmm0,           xmm7
-    packuswb    xmm1,           xmm7
-    packuswb    xmm2,           xmm7
-    packuswb    xmm3,           xmm7
+    packuswb    xmm0,           xmm5
+    packuswb    xmm1,           xmm5
+    packuswb    xmm2,           xmm5
+    packuswb    xmm3,           xmm5
 
     ; Load destination stride before writing out,
     ;   doesn't need to persist
@@ -441,6 +443,7 @@ sym(idct_dequant_dc_full_2x_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -692,6 +695,7 @@ sym(idct_dequant_dc_full_2x_sse2):
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm
index 0ec18de76..1db3d629c 100644
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -39,6 +39,7 @@ sym(vp8_filter_block1d8_h6_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -107,6 +108,7 @@ filter_block1d8_h6_rowloop_ssse3:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -162,6 +164,7 @@ filter_block1d8_h4_rowloop_ssse3:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -286,6 +289,7 @@ sym(vp8_filter_block1d4_h6_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -393,6 +397,7 @@ filter_block1d4_h4_rowloop_ssse3:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -413,6 +418,7 @@ sym(vp8_filter_block1d16_v6_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -508,6 +514,7 @@ vp8_filter_block1d16_v6_ssse3_loop:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -580,6 +587,7 @@ vp8_filter_block1d16_v4_ssse3_loop:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -598,6 +606,7 @@ sym(vp8_filter_block1d8_v6_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -670,6 +679,7 @@ vp8_filter_block1d8_v6_ssse3_loop:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -718,6 +728,7 @@ vp8_filter_block1d8_v4_ssse3_loop:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -808,6 +819,7 @@ vp8_filter_block1d4_v6_ssse3_loop:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
--
cgit v1.2.3
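
For context: the SAVE_XMM / RESTORE_XMM macros used above live in
vpx_ports/x86_abi_support.asm, alongside SHADOW_ARGS_TO_STACK and
GET_GOT. The following is only a sketch of the idea, assuming yasm/nasm
syntax and that only xmm6/xmm7 need preserving (the pair this commit
saves); the in-tree definitions may differ, and the message above
already flags a follow-up change to SAVE_XMM. On the Microsoft x64 ABI,
xmm6-xmm15 are callee-saved, so the macros spill the registers to
scratch stack space in the prologue and reload them in the epilogue,
using unaligned stores because 16-byte alignment of rsp cannot be
assumed when rbp has not been pushed.

    ; Sketch only -- not the exact in-tree definition.
    %macro SAVE_XMM 0
    %ifidn __OUTPUT_FORMAT__,x64
        sub     rsp, 32                       ; scratch space for two xmm registers
        movdqu  XMMWORD PTR [rsp],    xmm6    ; movdqu, not movdqa: rsp may not
        movdqu  XMMWORD PTR [rsp+16], xmm7    ; be 16-byte aligned at this point
    %endif
    %endmacro

    %macro RESTORE_XMM 0
    %ifidn __OUTPUT_FORMAT__,x64
        movdqu  xmm6, XMMWORD PTR [rsp]       ; reload the callee-saved registers
        movdqu  xmm7, XMMWORD PTR [rsp+16]
        add     rsp, 32                       ; release the scratch space
    %endif
    %endmacro

On other targets both macros expand to nothing, since the SysV ABI
treats all xmm registers as caller-saved. Note that each SAVE_XMM in
the hunks above is paired with a RESTORE_XMM at the same stack depth
(after SHADOW_ARGS_TO_STACK, before UNSHADOW_ARGS), so rsp stays
balanced on every path.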