From a337725625207758e56c1820c3a06bc00ca22b1a Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Thu, 29 Mar 2012 14:24:53 -0400 Subject: Updated vp8_build_intra_predictors_mby_s(sse2/ssse3) to work with the latest code. Patch Set 2: aligned the above_row buffers to fix crash Change-Id: I7a6992a20ed079ccd302f8c26215cf3057f8b70c --- vp8/common/x86/recon_sse2.asm | 138 +++++++++++++++++++++++------------------- 1 file changed, 76 insertions(+), 62 deletions(-) (limited to 'vp8/common/x86/recon_sse2.asm') diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm index d371ebd74..7b6e3cffe 100644 --- a/vp8/common/x86/recon_sse2.asm +++ b/vp8/common/x86/recon_sse2.asm @@ -133,22 +133,20 @@ sym(vp8_intra_pred_uv_dc_mmx2): ; end prolog ; from top - mov rsi, arg(2) ;above; - pxor mm0, mm0 - movq mm1, [rsi] - psadbw mm1, mm0 - - ; from left + mov rdi, arg(2) ;above; mov rsi, arg(3) ;left; movsxd rax, dword ptr arg(4) ;left_stride; + pxor mm0, mm0 + movq mm1, [rdi] lea rdi, [rax*3] + psadbw mm1, mm0 + ; from left movzx ecx, byte [rsi] movzx edx, byte [rsi+rax*1] add ecx, edx movzx edx, byte [rsi+rax*2] add ecx, edx - movzx edx, byte [rsi+rdi] lea rsi, [rsi+rax*4] add ecx, edx @@ -166,23 +164,23 @@ sym(vp8_intra_pred_uv_dc_mmx2): lea edx, [edx+ecx+8] sar edx, 4 movd mm1, edx + movsxd rcx, dword ptr arg(1) ;dst_stride pshufw mm1, mm1, 0x0 + mov rdi, arg(0) ;dst; packuswb mm1, mm1 ; write out - mov rdi, arg(0) ;dst; - movsxd rcx, dword ptr arg(1) ;dst_stride lea rax, [rcx*3] + lea rdx, [rdi+rcx*4] movq [rdi ], mm1 movq [rdi+rcx ], mm1 movq [rdi+rcx*2], mm1 movq [rdi+rax ], mm1 - lea rdi, [rdi+rcx*4] - movq [rdi ], mm1 - movq [rdi+rcx ], mm1 - movq [rdi+rcx*2], mm1 - movq [rdi+rax ], mm1 + movq [rdx ], mm1 + movq [rdx+rcx ], mm1 + movq [rdx+rcx*2], mm1 + movq [rdx+rax ], mm1 ; begin epilog pop rdi @@ -478,7 +476,7 @@ sym(vp8_intra_pred_uv_ve_mmx): ; int dst_stride ; unsigned char *above, ; unsigned char *left, -; int left_stride, +; int left_stride ; ) %macro vp8_intra_pred_uv_ho 1 global sym(vp8_intra_pred_uv_ho_%1) @@ -575,38 +573,43 @@ vp8_intra_pred_uv_ho ssse3 ;void vp8_intra_pred_y_dc_sse2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride ; ) global sym(vp8_intra_pred_y_dc_sse2) sym(vp8_intra_pred_y_dc_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi push rdi ; end prolog ; from top - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rsi, rax + mov rdi, arg(2) ;above + mov rsi, arg(3) ;left + movsxd rax, dword ptr arg(4) ;left_stride; + pxor xmm0, xmm0 - movdqa xmm1, [rsi] + movdqa xmm1, [rdi] psadbw xmm1, xmm0 movq xmm2, xmm1 punpckhqdq xmm1, xmm1 paddw xmm1, xmm2 ; from left - dec rsi lea rdi, [rax*3] - movzx ecx, byte [rsi+rax] + + movzx ecx, byte [rsi] + movzx edx, byte [rsi+rax] + add ecx, edx movzx edx, byte [rsi+rax*2] add ecx, edx movzx edx, byte [rsi+rdi] add ecx, edx lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] add ecx, edx movzx edx, byte [rsi+rax] @@ -616,6 +619,7 @@ sym(vp8_intra_pred_y_dc_sse2): movzx edx, byte [rsi+rdi] add ecx, edx lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] add ecx, edx movzx edx, byte [rsi+rax] @@ -625,6 +629,7 @@ sym(vp8_intra_pred_y_dc_sse2): movzx edx, byte [rsi+rdi] add ecx, edx lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] add ecx, edx movzx edx, byte [rsi+rax] @@ -633,8 +638,6 @@ sym(vp8_intra_pred_y_dc_sse2): add ecx, edx movzx edx, byte [rsi+rdi] add ecx, edx - movzx edx, byte [rsi+rax*4] - add ecx, edx ; add up pextrw edx, xmm1, 0x0 @@ -676,22 +679,23 @@ sym(vp8_intra_pred_y_dc_sse2): ;void vp8_intra_pred_y_dctop_sse2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride ; ) global sym(vp8_intra_pred_y_dctop_sse2) sym(vp8_intra_pred_y_dctop_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi GET_GOT rbx ; end prolog + ;arg(3), arg(4) not used + ; from top - mov rcx, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rcx, rax + mov rcx, arg(2) ;above; pxor xmm0, xmm0 movdqa xmm1, [rcx] psadbw xmm1, xmm0 @@ -737,22 +741,25 @@ sym(vp8_intra_pred_y_dctop_sse2): ;void vp8_intra_pred_y_dcleft_sse2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride ; ) global sym(vp8_intra_pred_y_dcleft_sse2) sym(vp8_intra_pred_y_dcleft_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi push rdi ; end prolog + ;arg(2) not used + ; from left - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - dec rsi + mov rsi, arg(3) ;left; + movsxd rax, dword ptr arg(4) ;left_stride; + lea rdi, [rax*3] movzx ecx, byte [rsi] movzx edx, byte [rsi+rax] @@ -827,18 +834,21 @@ sym(vp8_intra_pred_y_dcleft_sse2): ;void vp8_intra_pred_y_dc128_sse2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride ; ) global sym(vp8_intra_pred_y_dc128_sse2) sym(vp8_intra_pred_y_dc128_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi GET_GOT rbx ; end prolog + ;arg(2), arg(3), arg(4) not used + ; write out mov rsi, 2 movdqa xmm1, [GLOBAL(dc_128)] @@ -870,15 +880,16 @@ sym(vp8_intra_pred_y_dc128_sse2): ;void vp8_intra_pred_y_tm_sse2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride ; ) %macro vp8_intra_pred_y_tm 1 global sym(vp8_intra_pred_y_tm_%1) sym(vp8_intra_pred_y_tm_%1): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi push rdi GET_GOT rbx @@ -886,9 +897,8 @@ sym(vp8_intra_pred_y_tm_%1): ; read top row mov edx, 8 - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rsi, rax + mov rsi, arg(2) ;above + movsxd rax, dword ptr arg(4) ;left_stride; pxor xmm0, xmm0 %ifidn %1, ssse3 movdqa xmm3, [GLOBAL(dc_1024)] @@ -900,7 +910,7 @@ sym(vp8_intra_pred_y_tm_%1): ; set up left ptrs ans subtract topleft movd xmm4, [rsi-1] - lea rsi, [rsi+rax-1] + mov rsi, arg(3) ;left %ifidn %1, sse2 punpcklbw xmm4, xmm0 pshuflw xmm4, xmm4, 0x0 @@ -958,27 +968,29 @@ vp8_intra_pred_y_tm ssse3 ;void vp8_intra_pred_y_ve_sse2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride ; ) global sym(vp8_intra_pred_y_ve_sse2) sym(vp8_intra_pred_y_ve_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi ; end prolog + ;arg(3), arg(4) not used + + mov rax, arg(2) ;above; + mov rsi, 2 + movsxd rdx, dword ptr arg(1) ;dst_stride + ; read from top - mov rax, arg(2) ;src; - movsxd rdx, dword ptr arg(3) ;src_stride; - sub rax, rdx movdqa xmm1, [rax] ; write out - mov rsi, 2 mov rax, arg(0) ;dst; - movsxd rdx, dword ptr arg(1) ;dst_stride lea rcx, [rdx*3] .label @@ -1004,25 +1016,27 @@ sym(vp8_intra_pred_y_ve_sse2): ;void vp8_intra_pred_y_ho_sse2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride, ; ) global sym(vp8_intra_pred_y_ho_sse2) sym(vp8_intra_pred_y_ho_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi push rdi ; end prolog + ;arg(2) not used + ; read from left and write out mov edx, 8 - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; + mov rsi, arg(3) ;left; + movsxd rax, dword ptr arg(4) ;left_stride; mov rdi, arg(0) ;dst; movsxd rcx, dword ptr arg(1) ;dst_stride - dec rsi vp8_intra_pred_y_ho_sse2_loop: movd xmm0, [rsi] -- cgit v1.2.3