diff options
Diffstat (limited to 'vp9/encoder/x86/vp9_sad_sse3.asm')
-rw-r--r-- | vp9/encoder/x86/vp9_sad_sse3.asm | 542 |
1 files changed, 0 insertions, 542 deletions
diff --git a/vp9/encoder/x86/vp9_sad_sse3.asm b/vp9/encoder/x86/vp9_sad_sse3.asm index 8d98f6901..75e9d0ca4 100644 --- a/vp9/encoder/x86/vp9_sad_sse3.asm +++ b/vp9/encoder/x86/vp9_sad_sse3.asm @@ -83,87 +83,6 @@ ret %endmacro -%macro STACK_FRAME_CREATE_X4 0 -%if ABI_IS_32BIT - %define src_ptr rsi - %define src_stride rax - %define r0_ptr rcx - %define r1_ptr rdx - %define r2_ptr rbx - %define r3_ptr rdi - %define ref_stride rbp - %define result_ptr arg(4) - push rbp - mov rbp, rsp - push rsi - push rdi - push rbx - - push rbp - mov rdi, arg(2) ; ref_ptr_base - - LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi - - mov rsi, arg(0) ; src_ptr - - movsxd rbx, dword ptr arg(1) ; src_stride - movsxd rbp, dword ptr arg(3) ; ref_stride - - xchg rbx, rax -%else - %ifidn __OUTPUT_FORMAT__,x64 - SAVE_XMM 7, u - %define src_ptr rcx - %define src_stride rdx - %define r0_ptr rsi - %define r1_ptr r10 - %define r2_ptr r11 - %define r3_ptr r8 - %define ref_stride r9 - %define result_ptr [rsp+xmm_stack_space+16+4*8] - push rsi - - LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr - %else - %define src_ptr rdi - %define src_stride rsi - %define r0_ptr r9 - %define r1_ptr r10 - %define r2_ptr r11 - %define r3_ptr rdx - %define ref_stride rcx - %define result_ptr r8 - - LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr - - %endif -%endif -%endmacro - -%macro STACK_FRAME_DESTROY_X4 0 - %define src_ptr - %define src_stride - %define r0_ptr - %define r1_ptr - %define r2_ptr - %define r3_ptr - %define ref_stride - %define result_ptr - -%if ABI_IS_32BIT - pop rbx - pop rdi - pop rsi - pop rbp -%else - %ifidn __OUTPUT_FORMAT__,x64 - pop rsi - RESTORE_XMM - %endif -%endif - ret -%endmacro - %macro PROCESS_16X2X3 5 %if %1==0 movdqa xmm0, XMMWORD PTR [%2] @@ -250,130 +169,6 @@ paddw mm7, mm3 %endmacro -%macro LOAD_X4_ADDRESSES 5 - mov %2, [%1+REG_SZ_BYTES*0] - mov %3, [%1+REG_SZ_BYTES*1] - - mov %4, [%1+REG_SZ_BYTES*2] - mov %5, [%1+REG_SZ_BYTES*3] -%endmacro - -%macro PROCESS_16X2X4 8-9 0 -%if %1==0 || %1==3 - movdqa xmm0, XMMWORD PTR [%2+%9] - lddqu xmm4, XMMWORD PTR [%3+%9] - lddqu xmm5, XMMWORD PTR [%4+%9] - lddqu xmm6, XMMWORD PTR [%5+%9] - lddqu xmm7, XMMWORD PTR [%6+%9] - - psadbw xmm4, xmm0 - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [%2+%9] - lddqu xmm1, XMMWORD PTR [%3+%9] - lddqu xmm2, XMMWORD PTR [%4+%9] - lddqu xmm3, XMMWORD PTR [%5+%9] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddd xmm4, xmm1 - lddqu xmm1, XMMWORD PTR [%6+%9] - paddd xmm5, xmm2 - paddd xmm6, xmm3 - - psadbw xmm1, xmm0 - paddd xmm7, xmm1 -%endif - movdqa xmm0, XMMWORD PTR [%2+%7+%9] - lddqu xmm1, XMMWORD PTR [%3+%8+%9] - lddqu xmm2, XMMWORD PTR [%4+%8+%9] - lddqu xmm3, XMMWORD PTR [%5+%8+%9] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddd xmm4, xmm1 - lddqu xmm1, XMMWORD PTR [%6+%8+%9] - paddd xmm5, xmm2 - paddd xmm6, xmm3 - -%if %1==0 || %1==1 - lea %2, [%2+%7*2] - lea %3, [%3+%8*2] - - lea %4, [%4+%8*2] - lea %5, [%5+%8*2] - - lea %6, [%6+%8*2] -%endif - psadbw xmm1, xmm0 - paddd xmm7, xmm1 - -%endmacro - -%macro PROCESS_8X2X4 8 -%if %1==0 - movq mm0, QWORD PTR [%2] - movq mm4, QWORD PTR [%3] - movq mm5, QWORD PTR [%4] - movq mm6, QWORD PTR [%5] - movq mm7, QWORD PTR [%6] - - psadbw mm4, mm0 - psadbw mm5, mm0 - psadbw mm6, mm0 - psadbw mm7, mm0 -%else - movq mm0, QWORD PTR [%2] - movq mm1, QWORD PTR [%3] - movq mm2, QWORD PTR [%4] - movq mm3, QWORD PTR [%5] - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm4, mm1 - movq mm1, QWORD PTR [%6] - paddw mm5, mm2 - paddw mm6, mm3 - - psadbw mm1, mm0 - paddw mm7, mm1 -%endif - movq mm0, QWORD PTR [%2+%7] - movq mm1, QWORD PTR [%3+%8] - movq mm2, QWORD PTR [%4+%8] - movq mm3, QWORD PTR [%5+%8] - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm4, mm1 - movq mm1, QWORD PTR [%6+%8] - paddw mm5, mm2 - paddw mm6, mm3 - -%if %1==0 || %1==1 - lea %2, [%2+%7*2] - lea %3, [%3+%8*2] - - lea %4, [%4+%8*2] - lea %5, [%5+%8*2] - - lea %6, [%6+%8*2] -%endif - psadbw mm1, mm0 - paddw mm7, mm1 - -%endmacro - ;void int vp9_sad16x16x3_sse3( ; unsigned char *src_ptr, ; int src_stride, @@ -642,340 +437,3 @@ sym(vp9_copy32xn_sse3): .copy_is_done: STACK_FRAME_DESTROY_X3 - -;void vp9_sad64x64x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr_base, -; int ref_stride, -; int *results) -global sym(vp9_sad64x64x4d_sse3) PRIVATE -sym(vp9_sad64x64x4d_sse3): - - STACK_FRAME_CREATE_X4 - -%macro PROCESS_64X8X4 2-3+ - PROCESS_16X2X4 %1, %3 - PROCESS_16X2X4 2, %3, 16 - PROCESS_16X2X4 2, %3, 32 - PROCESS_16X2X4 1, %3, 48 - PROCESS_16X2X4 2, %3 - PROCESS_16X2X4 2, %3, 16 - PROCESS_16X2X4 2, %3, 32 - PROCESS_16X2X4 1, %3, 48 - PROCESS_16X2X4 2, %3 - PROCESS_16X2X4 2, %3, 16 - PROCESS_16X2X4 2, %3, 32 - PROCESS_16X2X4 1, %3, 48 - PROCESS_16X2X4 2, %3 - PROCESS_16X2X4 2, %3, 16 - PROCESS_16X2X4 2, %3, 32 - PROCESS_16X2X4 %2, %3, 48 -%endmacro - - PROCESS_64X8X4 3, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_64X8X4 2, 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - -%macro STORE_4D_RESULTS 0 -%if ABI_IS_32BIT - pop rbp -%endif - mov rcx, result_ptr - - movq xmm0, xmm4 - psrldq xmm4, 8 - - paddd xmm0, xmm4 - movd [rcx], xmm0 -;- - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddd xmm0, xmm5 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddd xmm0, xmm6 - movd [rcx+8], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddd xmm0, xmm7 - movd [rcx+12], xmm0 -%endmacro - - STORE_4D_RESULTS - STACK_FRAME_DESTROY_X4 - -;void vp9_sad32x32x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr_base, -; int ref_stride, -; int *results) -global sym(vp9_sad32x32x4d_sse3) PRIVATE -sym(vp9_sad32x32x4d_sse3): - - STACK_FRAME_CREATE_X4 - -%macro PROCESS_32X4X4 2-3+ - PROCESS_16X2X4 %1, %3 - PROCESS_16X2X4 1, %3, 16 - PROCESS_16X2X4 2, %3 - PROCESS_16X2X4 %2, %3, 16 -%endmacro - - PROCESS_32X4X4 3, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_32X4X4 2, 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - - STORE_4D_RESULTS - STACK_FRAME_DESTROY_X4 - -;void vp9_sad16x16x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr_base, -; int ref_stride, -; int *results) -global sym(vp9_sad16x16x4d_sse3) PRIVATE -sym(vp9_sad16x16x4d_sse3): - - STACK_FRAME_CREATE_X4 - - PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - - STORE_4D_RESULTS - STACK_FRAME_DESTROY_X4 - -;void vp9_sad16x8x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr_base, -; int ref_stride, -; int *results) -global sym(vp9_sad16x8x4d_sse3) PRIVATE -sym(vp9_sad16x8x4d_sse3): - - STACK_FRAME_CREATE_X4 - - PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - -%if ABI_IS_32BIT - pop rbp -%endif - mov rcx, result_ptr - - movq xmm0, xmm4 - psrldq xmm4, 8 - - paddw xmm0, xmm4 - movd [rcx], xmm0 -;- - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+8], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+12], xmm0 - - STACK_FRAME_DESTROY_X4 - -;void int vp9_sad8x16x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp9_sad8x16x4d_sse3) PRIVATE -sym(vp9_sad8x16x4d_sse3): - - STACK_FRAME_CREATE_X4 - - PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - -%if ABI_IS_32BIT - pop rbp -%endif - mov rcx, result_ptr - - punpckldq mm4, mm5 - punpckldq mm6, mm7 - - movq [rcx], mm4 - movq [rcx+8], mm6 - - STACK_FRAME_DESTROY_X4 - -;void int vp9_sad8x8x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp9_sad8x8x4d_sse3) PRIVATE -sym(vp9_sad8x8x4d_sse3): - - STACK_FRAME_CREATE_X4 - - PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - -%if ABI_IS_32BIT - pop rbp -%endif - mov rcx, result_ptr - - punpckldq mm4, mm5 - punpckldq mm6, mm7 - - movq [rcx], mm4 - movq [rcx+8], mm6 - - STACK_FRAME_DESTROY_X4 - -;void int vp9_sad4x4x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp9_sad4x4x4d_sse3) PRIVATE -sym(vp9_sad4x4x4d_sse3): - - STACK_FRAME_CREATE_X4 - - movd mm0, DWORD PTR [src_ptr] - movd mm1, DWORD PTR [r0_ptr] - - movd mm2, DWORD PTR [src_ptr+src_stride] - movd mm3, DWORD PTR [r0_ptr+ref_stride] - - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - - movd mm4, DWORD PTR [r1_ptr] - movd mm5, DWORD PTR [r2_ptr] - - movd mm6, DWORD PTR [r3_ptr] - movd mm2, DWORD PTR [r1_ptr+ref_stride] - - movd mm3, DWORD PTR [r2_ptr+ref_stride] - movd mm7, DWORD PTR [r3_ptr+ref_stride] - - psadbw mm1, mm0 - - punpcklbw mm4, mm2 - punpcklbw mm5, mm3 - - punpcklbw mm6, mm7 - psadbw mm4, mm0 - - psadbw mm5, mm0 - psadbw mm6, mm0 - - - - lea src_ptr, [src_ptr+src_stride*2] - lea r0_ptr, [r0_ptr+ref_stride*2] - - lea r1_ptr, [r1_ptr+ref_stride*2] - lea r2_ptr, [r2_ptr+ref_stride*2] - - lea r3_ptr, [r3_ptr+ref_stride*2] - - movd mm0, DWORD PTR [src_ptr] - movd mm2, DWORD PTR [r0_ptr] - - movd mm3, DWORD PTR [src_ptr+src_stride] - movd mm7, DWORD PTR [r0_ptr+ref_stride] - - punpcklbw mm0, mm3 - punpcklbw mm2, mm7 - - movd mm3, DWORD PTR [r1_ptr] - movd mm7, DWORD PTR [r2_ptr] - - psadbw mm2, mm0 -%if ABI_IS_32BIT - mov rax, rbp - - pop rbp -%define ref_stride rax -%endif - mov rsi, result_ptr - - paddw mm1, mm2 - movd [rsi], mm1 - - movd mm2, DWORD PTR [r1_ptr+ref_stride] - movd mm1, DWORD PTR [r2_ptr+ref_stride] - - punpcklbw mm3, mm2 - punpcklbw mm7, mm1 - - psadbw mm3, mm0 - psadbw mm7, mm0 - - movd mm2, DWORD PTR [r3_ptr] - movd mm1, DWORD PTR [r3_ptr+ref_stride] - - paddw mm3, mm4 - paddw mm7, mm5 - - movd [rsi+4], mm3 - punpcklbw mm2, mm1 - - movd [rsi+8], mm7 - psadbw mm2, mm0 - - paddw mm2, mm6 - movd [rsi+12], mm2 - - - STACK_FRAME_DESTROY_X4 - |