diff options
Diffstat (limited to 'vp8/encoder')
-rw-r--r-- | vp8/encoder/x86/dct_sse2.asm | 2 | ||||
-rw-r--r-- | vp8/encoder/x86/encodeopt.asm | 82 | ||||
-rw-r--r-- | vp8/encoder/x86/sad_sse2.asm | 18 | ||||
-rw-r--r-- | vp8/encoder/x86/sad_sse3.asm | 10 | ||||
-rw-r--r-- | vp8/encoder/x86/sad_ssse3.asm | 4 | ||||
-rw-r--r-- | vp8/encoder/x86/variance_impl_sse2.asm | 12 |
6 files changed, 77 insertions, 51 deletions
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm index 652dd9804..287ad482f 100644 --- a/vp8/encoder/x86/dct_sse2.asm +++ b/vp8/encoder/x86/dct_sse2.asm @@ -33,6 +33,7 @@ %define input rcx %define output rdx %define pitch r8 + SAVE_XMM %else %define input rdi %define output rsi @@ -53,6 +54,7 @@ pop rbp %else %ifidn __OUTPUT_FORMAT__,x64 + RESTORE_XMM %endif %endif ret diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm index c0f06bbbb..e142a7573 100644 --- a/vp8/encoder/x86/encodeopt.asm +++ b/vp8/encoder/x86/encodeopt.asm @@ -22,33 +22,33 @@ sym(vp8_block_error_xmm): ; end prologue mov rsi, arg(0) ;coeff_ptr - mov rdi, arg(1) ;dcoef_ptr - movdqa xmm3, [rsi] - movdqa xmm4, [rdi] - movdqa xmm5, [rsi+16] + movdqa xmm0, [rsi] + movdqa xmm1, [rdi] + + movdqa xmm2, [rsi+16] + movdqa xmm3, [rdi+16] - movdqa xmm6, [rdi+16] - psubw xmm3, xmm4 + psubw xmm0, xmm1 + psubw xmm2, xmm3 - psubw xmm5, xmm6 - pmaddwd xmm3, xmm3 - pmaddwd xmm5, xmm5 + pmaddwd xmm0, xmm0 + pmaddwd xmm2, xmm2 - paddd xmm3, xmm5 + paddd xmm0, xmm2 - pxor xmm7, xmm7 - movdqa xmm0, xmm3 + pxor xmm5, xmm5 + movdqa xmm1, xmm0 - punpckldq xmm0, xmm7 - punpckhdq xmm3, xmm7 + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 - paddd xmm0, xmm3 - movdqa xmm3, xmm0 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 psrldq xmm0, 8 - paddd xmm0, xmm3 + paddd xmm0, xmm1 movq rax, xmm0 @@ -208,53 +208,54 @@ sym(vp8_mbblock_error_xmm_impl): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 3 + SAVE_XMM ; 6 push rsi push rdi ; end prolog mov rsi, arg(0) ;coeff_ptr - pxor xmm7, xmm7 + pxor xmm6, xmm6 mov rdi, arg(1) ;dcoef_ptr - pxor xmm2, xmm2 + pxor xmm4, xmm4 - movd xmm1, dword ptr arg(2) ;dc - por xmm1, xmm2 + movd xmm5, dword ptr arg(2) ;dc + por xmm5, xmm4 - pcmpeqw xmm1, xmm7 + pcmpeqw xmm5, xmm6 mov rcx, 16 mberror_loop: - movdqa xmm3, [rsi] - movdqa xmm4, [rdi] + movdqa xmm0, [rsi] + movdqa xmm1, [rdi] - movdqa xmm5, [rsi+16] - movdqa xmm6, [rdi+16] + movdqa xmm2, [rsi+16] + movdqa xmm3, [rdi+16] - psubw xmm5, xmm6 - pmaddwd xmm5, xmm5 + psubw xmm2, xmm3 + pmaddwd xmm2, xmm2 - psubw xmm3, xmm4 - pand xmm3, xmm1 + psubw xmm0, xmm1 + pand xmm0, xmm5 - pmaddwd xmm3, xmm3 + pmaddwd xmm0, xmm0 add rsi, 32 add rdi, 32 sub rcx, 1 - paddd xmm2, xmm5 + paddd xmm4, xmm2 - paddd xmm2, xmm3 + paddd xmm4, xmm0 jnz mberror_loop - movdqa xmm0, xmm2 - punpckldq xmm0, xmm7 + movdqa xmm0, xmm4 + punpckldq xmm0, xmm6 - punpckhdq xmm2, xmm7 - paddd xmm0, xmm2 + punpckhdq xmm4, xmm6 + paddd xmm0, xmm4 movdqa xmm1, xmm0 psrldq xmm0, 8 @@ -265,6 +266,7 @@ mberror_loop: pop rdi pop rsi ; begin epilog + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -342,7 +344,7 @@ sym(vp8_mbuverror_xmm_impl): mov rdi, arg(1) ;d_ptr mov rcx, 16 - pxor xmm7, xmm7 + pxor xmm3, xmm3 mbuverror_loop: @@ -352,7 +354,7 @@ mbuverror_loop: psubw xmm1, xmm2 pmaddwd xmm1, xmm1 - paddd xmm7, xmm1 + paddd xmm3, xmm1 add rsi, 16 add rdi, 16 @@ -361,7 +363,7 @@ mbuverror_loop: jnz mbuverror_loop pxor xmm0, xmm0 - movdqa xmm1, xmm7 + movdqa xmm1, xmm3 movdqa xmm2, xmm1 punpckldq xmm1, xmm0 diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm index cc6bc3cd9..d9ac3ff4f 100644 --- a/vp8/encoder/x86/sad_sse2.asm +++ b/vp8/encoder/x86/sad_sse2.asm @@ -21,6 +21,7 @@ sym(vp8_sad16x16_wmt): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 + SAVE_XMM ; 6 push rsi push rdi ; end prolog @@ -34,7 +35,7 @@ sym(vp8_sad16x16_wmt): lea rcx, [rsi+rax*8] lea rcx, [rcx+rax*8] - pxor xmm7, xmm7 + pxor xmm6, xmm6 x16x16sad_wmt_loop: @@ -52,32 +53,33 @@ x16x16sad_wmt_loop: punpcklbw xmm1, xmm3 psadbw xmm0, xmm1 - movq xmm6, QWORD PTR [rsi+rax+8] + movq xmm2, QWORD PTR [rsi+rax+8] movq xmm3, QWORD PTR [rdi+rdx+8] lea rsi, [rsi+rax*2] lea rdi, [rdi+rdx*2] - punpcklbw xmm4, xmm6 + punpcklbw xmm4, xmm2 punpcklbw xmm5, xmm3 psadbw xmm4, xmm5 - paddw xmm7, xmm0 - paddw xmm7, xmm4 + paddw xmm6, xmm0 + paddw xmm6, xmm4 cmp rsi, rcx jne x16x16sad_wmt_loop - movq xmm0, xmm7 - psrldq xmm7, 8 + movq xmm0, xmm6 + psrldq xmm6, 8 - paddw xmm0, xmm7 + paddw xmm0, xmm6 movq rax, xmm0 ; begin epilog pop rdi pop rsi + RESTORE_XMM UNSHADOW_ARGS pop rbp ret diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm index f0336ab17..666879267 100644 --- a/vp8/encoder/x86/sad_sse3.asm +++ b/vp8/encoder/x86/sad_sse3.asm @@ -39,8 +39,9 @@ %define ref_stride r9 %define end_ptr r10 %define ret_var r11 - %define result_ptr [rsp+8+4*8] - %define max_err [rsp+8+4*8] + %define result_ptr [rsp+40+4*8] + %define max_err [rsp+40+4*8] + SAVE_XMM %else %define src_ptr rdi %define src_stride rsi @@ -72,6 +73,7 @@ pop rbp %else %ifidn __OUTPUT_FORMAT__,x64 + RESTORE_XMM %endif %endif ret @@ -113,7 +115,8 @@ %define r2_ptr r11 %define r3_ptr r8 %define ref_stride r9 - %define result_ptr [rsp+16+4*8] + %define result_ptr [rsp+48+4*8] + SAVE_XMM push rsi LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr @@ -151,6 +154,7 @@ %else %ifidn __OUTPUT_FORMAT__,x64 pop rsi + RESTORE_XMM %endif %endif ret diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm index 69c5eaedc..7c7cd0ade 100644 --- a/vp8/encoder/x86/sad_ssse3.asm +++ b/vp8/encoder/x86/sad_ssse3.asm @@ -157,6 +157,7 @@ sym(vp8_sad16x16x3_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 + SAVE_XMM push rsi push rdi push rcx @@ -253,6 +254,7 @@ vp8_sad16x16x3_ssse3_store_off: pop rcx pop rdi pop rsi + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -268,6 +270,7 @@ sym(vp8_sad16x8x3_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 + SAVE_XMM push rsi push rdi push rcx @@ -361,6 +364,7 @@ vp8_sad16x8x3_ssse3_store_off: pop rcx pop rdi pop rsi + RESTORE_XMM UNSHADOW_ARGS pop rbp ret diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm index c2c30deb2..2c0e170d8 100644 --- a/vp8/encoder/x86/variance_impl_sse2.asm +++ b/vp8/encoder/x86/variance_impl_sse2.asm @@ -85,6 +85,7 @@ sym(vp8_get16x16var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 + SAVE_XMM push rbx push rsi push rdi @@ -206,6 +207,7 @@ var16loop: pop rdi pop rsi pop rbx + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -223,6 +225,7 @@ sym(vp8_get16x16pred_error_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 + SAVE_XMM GET_GOT rbx push rsi push rdi @@ -321,6 +324,7 @@ var16peloop: pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -341,6 +345,7 @@ sym(vp8_get8x8var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 + SAVE_XMM GET_GOT rbx push rsi push rdi @@ -506,6 +511,7 @@ sym(vp8_get8x8var_sse2): pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -805,6 +811,7 @@ sym(vp8_half_horiz_vert_variance8x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 + SAVE_XMM GET_GOT rbx push rsi push rdi @@ -906,6 +913,7 @@ vp8_half_horiz_vert_variance8x_h_1: pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -1041,6 +1049,7 @@ sym(vp8_half_vert_variance8x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 + SAVE_XMM GET_GOT rbx push rsi push rdi @@ -1127,6 +1136,7 @@ vp8_half_vert_variance8x_h_1: pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -1254,6 +1264,7 @@ sym(vp8_half_horiz_variance8x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 + SAVE_XMM GET_GOT rbx push rsi push rdi @@ -1338,6 +1349,7 @@ vp8_half_horiz_variance8x_h_1: pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret |