diff options
Diffstat (limited to 'vp8/common/x86/recon_sse2.asm')
-rw-r--r-- | vp8/common/x86/recon_sse2.asm | 101 |
1 files changed, 57 insertions, 44 deletions
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm index 4b68ef5f2..d371ebd74 100644 --- a/vp8/common/x86/recon_sse2.asm +++ b/vp8/common/x86/recon_sse2.asm @@ -119,35 +119,39 @@ sym(vp8_copy_mem16x16_sse2): ;void vp8_intra_pred_uv_dc_mmx2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride, ; ) global sym(vp8_intra_pred_uv_dc_mmx2) sym(vp8_intra_pred_uv_dc_mmx2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi push rdi ; end prolog ; from top - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rsi, rax + mov rsi, arg(2) ;above; pxor mm0, mm0 movq mm1, [rsi] psadbw mm1, mm0 ; from left - dec rsi + mov rsi, arg(3) ;left; + movsxd rax, dword ptr arg(4) ;left_stride; lea rdi, [rax*3] - movzx ecx, byte [rsi+rax] + movzx ecx, byte [rsi] + movzx edx, byte [rsi+rax*1] + add ecx, edx movzx edx, byte [rsi+rax*2] add ecx, edx + + movzx edx, byte [rsi+rdi] - add ecx, edx lea rsi, [rsi+rax*4] + add ecx, edx movzx edx, byte [rsi] add ecx, edx movzx edx, byte [rsi+rax] @@ -156,8 +160,6 @@ sym(vp8_intra_pred_uv_dc_mmx2): add ecx, edx movzx edx, byte [rsi+rdi] add ecx, edx - movzx edx, byte [rsi+rax*4] - add ecx, edx ; add up pextrw edx, mm1, 0x0 @@ -192,23 +194,24 @@ sym(vp8_intra_pred_uv_dc_mmx2): ;void vp8_intra_pred_uv_dctop_mmx2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride, ; ) global sym(vp8_intra_pred_uv_dctop_mmx2) sym(vp8_intra_pred_uv_dctop_mmx2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 GET_GOT rbx push rsi push rdi ; end prolog + ;arg(3), arg(4) not used + ; from top - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rsi, rax + mov rsi, arg(2) ;above; pxor mm0, mm0 movq mm1, [rsi] psadbw mm1, mm0 @@ -245,22 +248,24 @@ sym(vp8_intra_pred_uv_dctop_mmx2): ;void vp8_intra_pred_uv_dcleft_mmx2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride, ; ) global sym(vp8_intra_pred_uv_dcleft_mmx2) sym(vp8_intra_pred_uv_dcleft_mmx2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi push rdi ; end prolog + ;arg(2) not used + ; from left - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - dec rsi + mov rsi, arg(3) ;left; + movsxd rax, dword ptr arg(4) ;left_stride; lea rdi, [rax*3] movzx ecx, byte [rsi] movzx edx, byte [rsi+rax] @@ -310,17 +315,20 @@ sym(vp8_intra_pred_uv_dcleft_mmx2): ;void vp8_intra_pred_uv_dc128_mmx( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride, ; ) global sym(vp8_intra_pred_uv_dc128_mmx) sym(vp8_intra_pred_uv_dc128_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 GET_GOT rbx ; end prolog + ;arg(2), arg(3), arg(4) not used + ; write out movq mm1, [GLOBAL(dc_128)] mov rax, arg(0) ;dst; @@ -346,15 +354,16 @@ sym(vp8_intra_pred_uv_dc128_mmx): ;void vp8_intra_pred_uv_tm_sse2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride, ; ) %macro vp8_intra_pred_uv_tm 1 global sym(vp8_intra_pred_uv_tm_%1) sym(vp8_intra_pred_uv_tm_%1): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 GET_GOT rbx push rsi push rdi @@ -362,9 +371,8 @@ sym(vp8_intra_pred_uv_tm_%1): ; read top row mov edx, 4 - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rsi, rax + mov rsi, arg(2) ;above + movsxd rax, dword ptr arg(4) ;left_stride; pxor xmm0, xmm0 %ifidn %1, ssse3 movdqa xmm2, [GLOBAL(dc_1024)] @@ -374,7 +382,7 @@ sym(vp8_intra_pred_uv_tm_%1): ; set up left ptrs ans subtract topleft movd xmm3, [rsi-1] - lea rsi, [rsi+rax-1] + mov rsi, arg(3) ;left; %ifidn %1, sse2 punpcklbw xmm3, xmm0 pshuflw xmm3, xmm3, 0x0 @@ -427,20 +435,22 @@ vp8_intra_pred_uv_tm ssse3 ;void vp8_intra_pred_uv_ve_mmx( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride, ; ) global sym(vp8_intra_pred_uv_ve_mmx) sym(vp8_intra_pred_uv_ve_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 ; end prolog + ; arg(3), arg(4) not used + ; read from top mov rax, arg(2) ;src; - movsxd rdx, dword ptr arg(3) ;src_stride; - sub rax, rdx + movq mm1, [rax] ; write out @@ -466,15 +476,16 @@ sym(vp8_intra_pred_uv_ve_mmx): ;void vp8_intra_pred_uv_ho_mmx2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride, ; ) %macro vp8_intra_pred_uv_ho 1 global sym(vp8_intra_pred_uv_ho_%1) sym(vp8_intra_pred_uv_ho_%1): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi push rdi %ifidn %1, ssse3 @@ -485,12 +496,14 @@ sym(vp8_intra_pred_uv_ho_%1): %endif ; end prolog + ;arg(2) not used + ; read from left and write out %ifidn %1, mmx2 mov edx, 4 %endif - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; + mov rsi, arg(3) ;left + movsxd rax, dword ptr arg(4) ;left_stride; mov rdi, arg(0) ;dst; movsxd rcx, dword ptr arg(1) ;dst_stride %ifidn %1, ssse3 @@ -498,7 +511,7 @@ sym(vp8_intra_pred_uv_ho_%1): movdqa xmm2, [GLOBAL(dc_00001111)] lea rbx, [rax*3] %endif - dec rsi + %ifidn %1, mmx2 .vp8_intra_pred_uv_ho_%1_loop: movd mm0, [rsi] |