From 54d48955f6b80de5d58e8da52dba97813296b713 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Wed, 30 Sep 2015 11:07:35 -0400 Subject: vp8: change build_intra_predictors_mby_s to use vpx_dsp. Change-Id: I2000820e0c04de2c975d370a0cf7145330289bb2 --- vp8/common/x86/recon_sse2.asm | 509 +----------------------------------------- 1 file changed, 1 insertion(+), 508 deletions(-) (limited to 'vp8/common/x86/recon_sse2.asm') diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm index 7141f8324..e9c5dc72b 100644 --- a/vp8/common/x86/recon_sse2.asm +++ b/vp8/common/x86/recon_sse2.asm @@ -593,520 +593,13 @@ sym(vp8_intra_pred_uv_ho_%1): vp8_intra_pred_uv_ho mmx2 vp8_intra_pred_uv_ho ssse3 -;void vp8_intra_pred_y_dc_sse2( -; unsigned char *dst, -; int dst_stride -; unsigned char *above, -; unsigned char *left, -; int left_stride -; ) -global sym(vp8_intra_pred_y_dc_sse2) PRIVATE -sym(vp8_intra_pred_y_dc_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - ; from top - mov rdi, arg(2) ;above - mov rsi, arg(3) ;left - movsxd rax, dword ptr arg(4) ;left_stride; - - pxor xmm0, xmm0 - movdqa xmm1, [rdi] - psadbw xmm1, xmm0 - movq xmm2, xmm1 - punpckhqdq xmm1, xmm1 - paddw xmm1, xmm2 - - ; from left - lea rdi, [rax*3] - - movzx ecx, byte [rsi] - movzx edx, byte [rsi+rax] - add ecx, edx - movzx edx, byte [rsi+rax*2] - add ecx, edx - movzx edx, byte [rsi+rdi] - add ecx, edx - lea rsi, [rsi+rax*4] - - movzx edx, byte [rsi] - add ecx, edx - movzx edx, byte [rsi+rax] - add ecx, edx - movzx edx, byte [rsi+rax*2] - add ecx, edx - movzx edx, byte [rsi+rdi] - add ecx, edx - lea rsi, [rsi+rax*4] - - movzx edx, byte [rsi] - add ecx, edx - movzx edx, byte [rsi+rax] - add ecx, edx - movzx edx, byte [rsi+rax*2] - add ecx, edx - movzx edx, byte [rsi+rdi] - add ecx, edx - lea rsi, [rsi+rax*4] - - movzx edx, byte [rsi] - add ecx, edx - movzx edx, byte [rsi+rax] - add ecx, edx - movzx edx, byte [rsi+rax*2] - add ecx, edx - movzx edx, byte [rsi+rdi] - add ecx, edx - - ; add up - pextrw edx, xmm1, 0x0 - lea edx, [edx+ecx+16] - sar edx, 5 - movd xmm1, edx - ; FIXME use pshufb for ssse3 version - pshuflw xmm1, xmm1, 0x0 - punpcklqdq xmm1, xmm1 - packuswb xmm1, xmm1 - - ; write out - mov rsi, 2 - mov rdi, arg(0) ;dst; - movsxd rcx, dword ptr arg(1) ;dst_stride - lea rax, [rcx*3] - -.label - movdqa [rdi ], xmm1 - movdqa [rdi+rcx ], xmm1 - movdqa [rdi+rcx*2], xmm1 - movdqa [rdi+rax ], xmm1 - lea rdi, [rdi+rcx*4] - movdqa [rdi ], xmm1 - movdqa [rdi+rcx ], xmm1 - movdqa [rdi+rcx*2], xmm1 - movdqa [rdi+rax ], xmm1 - lea rdi, [rdi+rcx*4] - dec rsi - jnz .label - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;void vp8_intra_pred_y_dctop_sse2( -; unsigned char *dst, -; int dst_stride -; unsigned char *above, -; unsigned char *left, -; int left_stride -; ) -global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE -sym(vp8_intra_pred_y_dctop_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - GET_GOT rbx - ; end prolog - - ;arg(3), arg(4) not used - - ; from top - mov rcx, arg(2) ;above; - pxor xmm0, xmm0 - movdqa xmm1, [rcx] - psadbw xmm1, xmm0 - movdqa xmm2, xmm1 - punpckhqdq xmm1, xmm1 - paddw xmm1, xmm2 - - ; add up - paddw xmm1, [GLOBAL(dc_8)] - psraw xmm1, 4 - ; FIXME use pshufb for ssse3 version - pshuflw xmm1, xmm1, 0x0 - punpcklqdq xmm1, xmm1 - packuswb xmm1, xmm1 - - ; write out - mov rsi, 2 - mov rdx, arg(0) ;dst; - movsxd rcx, dword ptr arg(1) ;dst_stride - lea rax, [rcx*3] - -.label - movdqa [rdx ], xmm1 - movdqa [rdx+rcx ], xmm1 - movdqa [rdx+rcx*2], xmm1 - movdqa [rdx+rax ], xmm1 - lea rdx, [rdx+rcx*4] - movdqa [rdx ], xmm1 - movdqa [rdx+rcx ], xmm1 - movdqa [rdx+rcx*2], xmm1 - movdqa [rdx+rax ], xmm1 - lea rdx, [rdx+rcx*4] - dec rsi - jnz .label - - ; begin epilog - RESTORE_GOT - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;void vp8_intra_pred_y_dcleft_sse2( -; unsigned char *dst, -; int dst_stride -; unsigned char *above, -; unsigned char *left, -; int left_stride -; ) -global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE -sym(vp8_intra_pred_y_dcleft_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - ;arg(2) not used - - ; from left - mov rsi, arg(3) ;left; - movsxd rax, dword ptr arg(4) ;left_stride; - - lea rdi, [rax*3] - movzx ecx, byte [rsi] - movzx edx, byte [rsi+rax] - add ecx, edx - movzx edx, byte [rsi+rax*2] - add ecx, edx - movzx edx, byte [rsi+rdi] - add ecx, edx - lea rsi, [rsi+rax*4] - movzx edx, byte [rsi] - add ecx, edx - movzx edx, byte [rsi+rax] - add ecx, edx - movzx edx, byte [rsi+rax*2] - add ecx, edx - movzx edx, byte [rsi+rdi] - add ecx, edx - lea rsi, [rsi+rax*4] - movzx edx, byte [rsi] - add ecx, edx - movzx edx, byte [rsi+rax] - add ecx, edx - movzx edx, byte [rsi+rax*2] - add ecx, edx - movzx edx, byte [rsi+rdi] - add ecx, edx - lea rsi, [rsi+rax*4] - movzx edx, byte [rsi] - add ecx, edx - movzx edx, byte [rsi+rax] - add ecx, edx - movzx edx, byte [rsi+rax*2] - add ecx, edx - movzx edx, byte [rsi+rdi] - lea edx, [ecx+edx+8] - - ; add up - shr edx, 4 - movd xmm1, edx - ; FIXME use pshufb for ssse3 version - pshuflw xmm1, xmm1, 0x0 - punpcklqdq xmm1, xmm1 - packuswb xmm1, xmm1 - - ; write out - mov rsi, 2 - mov rdi, arg(0) ;dst; - movsxd rcx, dword ptr arg(1) ;dst_stride - lea rax, [rcx*3] - -.label - movdqa [rdi ], xmm1 - movdqa [rdi+rcx ], xmm1 - movdqa [rdi+rcx*2], xmm1 - movdqa [rdi+rax ], xmm1 - lea rdi, [rdi+rcx*4] - movdqa [rdi ], xmm1 - movdqa [rdi+rcx ], xmm1 - movdqa [rdi+rcx*2], xmm1 - movdqa [rdi+rax ], xmm1 - lea rdi, [rdi+rcx*4] - dec rsi - jnz .label - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;void vp8_intra_pred_y_dc128_sse2( -; unsigned char *dst, -; int dst_stride -; unsigned char *above, -; unsigned char *left, -; int left_stride -; ) -global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE -sym(vp8_intra_pred_y_dc128_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - GET_GOT rbx - ; end prolog - - ;arg(2), arg(3), arg(4) not used - - ; write out - mov rsi, 2 - movdqa xmm1, [GLOBAL(dc_128)] - mov rax, arg(0) ;dst; - movsxd rdx, dword ptr arg(1) ;dst_stride - lea rcx, [rdx*3] - -.label - movdqa [rax ], xmm1 - movdqa [rax+rdx ], xmm1 - movdqa [rax+rdx*2], xmm1 - movdqa [rax+rcx ], xmm1 - lea rax, [rax+rdx*4] - movdqa [rax ], xmm1 - movdqa [rax+rdx ], xmm1 - movdqa [rax+rdx*2], xmm1 - movdqa [rax+rcx ], xmm1 - lea rax, [rax+rdx*4] - dec rsi - jnz .label - - ; begin epilog - RESTORE_GOT - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;void vp8_intra_pred_y_tm_sse2( -; unsigned char *dst, -; int dst_stride -; unsigned char *above, -; unsigned char *left, -; int left_stride -; ) -%macro vp8_intra_pred_y_tm 1 -global sym(vp8_intra_pred_y_tm_%1) PRIVATE -sym(vp8_intra_pred_y_tm_%1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - push rbx - GET_GOT rbx - ; end prolog - - ; read top row - mov edx, 8 - mov rsi, arg(2) ;above - movsxd rax, dword ptr arg(4) ;left_stride; - pxor xmm0, xmm0 -%ifidn %1, ssse3 - movdqa xmm3, [GLOBAL(dc_1024)] -%endif - movdqa xmm1, [rsi] - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm2, xmm0 - - ; set up left ptrs ans subtract topleft - movd xmm4, [rsi-1] - mov rsi, arg(3) ;left -%ifidn %1, sse2 - punpcklbw xmm4, xmm0 - pshuflw xmm4, xmm4, 0x0 - punpcklqdq xmm4, xmm4 -%else - pshufb xmm4, xmm3 -%endif - psubw xmm1, xmm4 - psubw xmm2, xmm4 - - ; set up dest ptrs - mov rdi, arg(0) ;dst; - movsxd rcx, dword ptr arg(1) ;dst_stride -vp8_intra_pred_y_tm_%1_loop: - mov bl, [rsi] - movd xmm4, ebx - - mov bl, [rsi+rax] - movd xmm5, ebx -%ifidn %1, sse2 - punpcklbw xmm4, xmm0 - punpcklbw xmm5, xmm0 - pshuflw xmm4, xmm4, 0x0 - pshuflw xmm5, xmm5, 0x0 - punpcklqdq xmm4, xmm4 - punpcklqdq xmm5, xmm5 -%else - pshufb xmm4, xmm3 - pshufb xmm5, xmm3 -%endif - movdqa xmm6, xmm4 - movdqa xmm7, xmm5 - paddw xmm4, xmm1 - paddw xmm6, xmm2 - paddw xmm5, xmm1 - paddw xmm7, xmm2 - packuswb xmm4, xmm6 - packuswb xmm5, xmm7 - movdqa [rdi ], xmm4 - movdqa [rdi+rcx], xmm5 - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rcx*2] - dec edx - jnz vp8_intra_pred_y_tm_%1_loop - - ; begin epilog - RESTORE_GOT - pop rbx - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -%endmacro - -vp8_intra_pred_y_tm sse2 -vp8_intra_pred_y_tm ssse3 - -;void vp8_intra_pred_y_ve_sse2( -; unsigned char *dst, -; int dst_stride -; unsigned char *above, -; unsigned char *left, -; int left_stride -; ) -global sym(vp8_intra_pred_y_ve_sse2) PRIVATE -sym(vp8_intra_pred_y_ve_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - ; end prolog - - ;arg(3), arg(4) not used - - mov rax, arg(2) ;above; - mov rsi, 2 - movsxd rdx, dword ptr arg(1) ;dst_stride - - ; read from top - movdqa xmm1, [rax] - - ; write out - mov rax, arg(0) ;dst; - lea rcx, [rdx*3] - -.label - movdqa [rax ], xmm1 - movdqa [rax+rdx ], xmm1 - movdqa [rax+rdx*2], xmm1 - movdqa [rax+rcx ], xmm1 - lea rax, [rax+rdx*4] - movdqa [rax ], xmm1 - movdqa [rax+rdx ], xmm1 - movdqa [rax+rdx*2], xmm1 - movdqa [rax+rcx ], xmm1 - lea rax, [rax+rdx*4] - dec rsi - jnz .label - - ; begin epilog - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;void vp8_intra_pred_y_ho_sse2( -; unsigned char *dst, -; int dst_stride -; unsigned char *above, -; unsigned char *left, -; int left_stride, -; ) -global sym(vp8_intra_pred_y_ho_sse2) PRIVATE -sym(vp8_intra_pred_y_ho_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - push rbx - ; end prolog - - ;arg(2) not used - - ; read from left and write out - mov edx, 8 - mov rsi, arg(3) ;left; - movsxd rax, dword ptr arg(4) ;left_stride; - mov rdi, arg(0) ;dst; - movsxd rcx, dword ptr arg(1) ;dst_stride - -vp8_intra_pred_y_ho_sse2_loop: - mov bl, [rsi] - movd xmm0, ebx - mov bl, [rsi+rax] - movd xmm1, ebx - - ; FIXME use pshufb for ssse3 version - punpcklbw xmm0, xmm0 - punpcklbw xmm1, xmm1 - pshuflw xmm0, xmm0, 0x0 - pshuflw xmm1, xmm1, 0x0 - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - movdqa [rdi ], xmm0 - movdqa [rdi+rcx], xmm1 - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rcx*2] - dec edx - jnz vp8_intra_pred_y_ho_sse2_loop - - ; begin epilog - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - SECTION_RODATA align 16 dc_128: - times 16 db 128 + times 8 db 128 dc_4: times 4 dw 4 align 16 -dc_8: - times 8 dw 8 -align 16 dc_1024: times 8 dw 0x400 align 16 -- cgit v1.2.3