summary | refs | log | tree | commit | diff
path: root/vp9/encoder/x86/vp9_sad_sse3.asm
diff options
context:
space:
mode:
Diffstat (limited to 'vp9/encoder/x86/vp9_sad_sse3.asm')
-rw-r--r-- vp9/encoder/x86/vp9_sad_sse3.asm 542
1 files changed, 0 insertions, 542 deletions
diff --git a/vp9/encoder/x86/vp9_sad_sse3.asm b/vp9/encoder/x86/vp9_sad_sse3.asm
index 8d98f6901..75e9d0ca4 100644
--- a/vp9/encoder/x86/vp9_sad_sse3.asm
+++ b/vp9/encoder/x86/vp9_sad_sse3.asm
@@ -83,87 +83,6 @@
ret
%endmacro
-%macro STACK_FRAME_CREATE_X4 0
-%if ABI_IS_32BIT
- %define src_ptr rsi
- %define src_stride rax
- %define r0_ptr rcx
- %define r1_ptr rdx
- %define r2_ptr rbx
- %define r3_ptr rdi
- %define ref_stride rbp
- %define result_ptr arg(4)
- push rbp
- mov rbp, rsp
- push rsi
- push rdi
- push rbx
-
- push rbp
- mov rdi, arg(2) ; ref_ptr_base
-
- LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
-
- mov rsi, arg(0) ; src_ptr
-
- movsxd rbx, dword ptr arg(1) ; src_stride
- movsxd rbp, dword ptr arg(3) ; ref_stride
-
- xchg rbx, rax
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- SAVE_XMM 7, u
- %define src_ptr rcx
- %define src_stride rdx
- %define r0_ptr rsi
- %define r1_ptr r10
- %define r2_ptr r11
- %define r3_ptr r8
- %define ref_stride r9
- %define result_ptr [rsp+xmm_stack_space+16+4*8]
- push rsi
-
- LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
- %else
- %define src_ptr rdi
- %define src_stride rsi
- %define r0_ptr r9
- %define r1_ptr r10
- %define r2_ptr r11
- %define r3_ptr rdx
- %define ref_stride rcx
- %define result_ptr r8
-
- LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
-
- %endif
-%endif
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X4 0
- %define src_ptr
- %define src_stride
- %define r0_ptr
- %define r1_ptr
- %define r2_ptr
- %define r3_ptr
- %define ref_stride
- %define result_ptr
-
-%if ABI_IS_32BIT
- pop rbx
- pop rdi
- pop rsi
- pop rbp
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- pop rsi
- RESTORE_XMM
- %endif
-%endif
- ret
-%endmacro
-
%macro PROCESS_16X2X3 5
%if %1==0
movdqa xmm0, XMMWORD PTR [%2]
@@ -250,130 +169,6 @@
paddw mm7, mm3
%endmacro
-%macro LOAD_X4_ADDRESSES 5
- mov %2, [%1+REG_SZ_BYTES*0]
- mov %3, [%1+REG_SZ_BYTES*1]
-
- mov %4, [%1+REG_SZ_BYTES*2]
- mov %5, [%1+REG_SZ_BYTES*3]
-%endmacro
-
-%macro PROCESS_16X2X4 8-9 0
-%if %1==0 || %1==3
- movdqa xmm0, XMMWORD PTR [%2+%9]
- lddqu xmm4, XMMWORD PTR [%3+%9]
- lddqu xmm5, XMMWORD PTR [%4+%9]
- lddqu xmm6, XMMWORD PTR [%5+%9]
- lddqu xmm7, XMMWORD PTR [%6+%9]
-
- psadbw xmm4, xmm0
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [%2+%9]
- lddqu xmm1, XMMWORD PTR [%3+%9]
- lddqu xmm2, XMMWORD PTR [%4+%9]
- lddqu xmm3, XMMWORD PTR [%5+%9]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddd xmm4, xmm1
- lddqu xmm1, XMMWORD PTR [%6+%9]
- paddd xmm5, xmm2
- paddd xmm6, xmm3
-
- psadbw xmm1, xmm0
- paddd xmm7, xmm1
-%endif
- movdqa xmm0, XMMWORD PTR [%2+%7+%9]
- lddqu xmm1, XMMWORD PTR [%3+%8+%9]
- lddqu xmm2, XMMWORD PTR [%4+%8+%9]
- lddqu xmm3, XMMWORD PTR [%5+%8+%9]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddd xmm4, xmm1
- lddqu xmm1, XMMWORD PTR [%6+%8+%9]
- paddd xmm5, xmm2
- paddd xmm6, xmm3
-
-%if %1==0 || %1==1
- lea %2, [%2+%7*2]
- lea %3, [%3+%8*2]
-
- lea %4, [%4+%8*2]
- lea %5, [%5+%8*2]
-
- lea %6, [%6+%8*2]
-%endif
- psadbw xmm1, xmm0
- paddd xmm7, xmm1
-
-%endmacro
-
-%macro PROCESS_8X2X4 8
-%if %1==0
- movq mm0, QWORD PTR [%2]
- movq mm4, QWORD PTR [%3]
- movq mm5, QWORD PTR [%4]
- movq mm6, QWORD PTR [%5]
- movq mm7, QWORD PTR [%6]
-
- psadbw mm4, mm0
- psadbw mm5, mm0
- psadbw mm6, mm0
- psadbw mm7, mm0
-%else
- movq mm0, QWORD PTR [%2]
- movq mm1, QWORD PTR [%3]
- movq mm2, QWORD PTR [%4]
- movq mm3, QWORD PTR [%5]
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm4, mm1
- movq mm1, QWORD PTR [%6]
- paddw mm5, mm2
- paddw mm6, mm3
-
- psadbw mm1, mm0
- paddw mm7, mm1
-%endif
- movq mm0, QWORD PTR [%2+%7]
- movq mm1, QWORD PTR [%3+%8]
- movq mm2, QWORD PTR [%4+%8]
- movq mm3, QWORD PTR [%5+%8]
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm4, mm1
- movq mm1, QWORD PTR [%6+%8]
- paddw mm5, mm2
- paddw mm6, mm3
-
-%if %1==0 || %1==1
- lea %2, [%2+%7*2]
- lea %3, [%3+%8*2]
-
- lea %4, [%4+%8*2]
- lea %5, [%5+%8*2]
-
- lea %6, [%6+%8*2]
-%endif
- psadbw mm1, mm0
- paddw mm7, mm1
-
-%endmacro
-
;void int vp9_sad16x16x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
@@ -642,340 +437,3 @@ sym(vp9_copy32xn_sse3):
.copy_is_done:
STACK_FRAME_DESTROY_X3
-
-;void vp9_sad64x64x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr_base,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad64x64x4d_sse3) PRIVATE
-sym(vp9_sad64x64x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
-%macro PROCESS_64X8X4 2-3+
- PROCESS_16X2X4 %1, %3
- PROCESS_16X2X4 2, %3, 16
- PROCESS_16X2X4 2, %3, 32
- PROCESS_16X2X4 1, %3, 48
- PROCESS_16X2X4 2, %3
- PROCESS_16X2X4 2, %3, 16
- PROCESS_16X2X4 2, %3, 32
- PROCESS_16X2X4 1, %3, 48
- PROCESS_16X2X4 2, %3
- PROCESS_16X2X4 2, %3, 16
- PROCESS_16X2X4 2, %3, 32
- PROCESS_16X2X4 1, %3, 48
- PROCESS_16X2X4 2, %3
- PROCESS_16X2X4 2, %3, 16
- PROCESS_16X2X4 2, %3, 32
- PROCESS_16X2X4 %2, %3, 48
-%endmacro
-
- PROCESS_64X8X4 3, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_64X8X4 2, 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%macro STORE_4D_RESULTS 0
-%if ABI_IS_32BIT
- pop rbp
-%endif
- mov rcx, result_ptr
-
- movq xmm0, xmm4
- psrldq xmm4, 8
-
- paddd xmm0, xmm4
- movd [rcx], xmm0
-;-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddd xmm0, xmm5
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddd xmm0, xmm6
- movd [rcx+8], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddd xmm0, xmm7
- movd [rcx+12], xmm0
-%endmacro
-
- STORE_4D_RESULTS
- STACK_FRAME_DESTROY_X4
-
-;void vp9_sad32x32x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr_base,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad32x32x4d_sse3) PRIVATE
-sym(vp9_sad32x32x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
-%macro PROCESS_32X4X4 2-3+
- PROCESS_16X2X4 %1, %3
- PROCESS_16X2X4 1, %3, 16
- PROCESS_16X2X4 2, %3
- PROCESS_16X2X4 %2, %3, 16
-%endmacro
-
- PROCESS_32X4X4 3, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_32X4X4 2, 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
- STORE_4D_RESULTS
- STACK_FRAME_DESTROY_X4
-
-;void vp9_sad16x16x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr_base,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad16x16x4d_sse3) PRIVATE
-sym(vp9_sad16x16x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
- STORE_4D_RESULTS
- STACK_FRAME_DESTROY_X4
-
-;void vp9_sad16x8x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr_base,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad16x8x4d_sse3) PRIVATE
-sym(vp9_sad16x8x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
- pop rbp
-%endif
- mov rcx, result_ptr
-
- movq xmm0, xmm4
- psrldq xmm4, 8
-
- paddw xmm0, xmm4
- movd [rcx], xmm0
-;-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+8], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+12], xmm0
-
- STACK_FRAME_DESTROY_X4
-
-;void int vp9_sad8x16x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad8x16x4d_sse3) PRIVATE
-sym(vp9_sad8x16x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
- pop rbp
-%endif
- mov rcx, result_ptr
-
- punpckldq mm4, mm5
- punpckldq mm6, mm7
-
- movq [rcx], mm4
- movq [rcx+8], mm6
-
- STACK_FRAME_DESTROY_X4
-
-;void int vp9_sad8x8x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad8x8x4d_sse3) PRIVATE
-sym(vp9_sad8x8x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
- pop rbp
-%endif
- mov rcx, result_ptr
-
- punpckldq mm4, mm5
- punpckldq mm6, mm7
-
- movq [rcx], mm4
- movq [rcx+8], mm6
-
- STACK_FRAME_DESTROY_X4
-
-;void int vp9_sad4x4x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad4x4x4d_sse3) PRIVATE
-sym(vp9_sad4x4x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm1, DWORD PTR [r0_ptr]
-
- movd mm2, DWORD PTR [src_ptr+src_stride]
- movd mm3, DWORD PTR [r0_ptr+ref_stride]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- movd mm4, DWORD PTR [r1_ptr]
- movd mm5, DWORD PTR [r2_ptr]
-
- movd mm6, DWORD PTR [r3_ptr]
- movd mm2, DWORD PTR [r1_ptr+ref_stride]
-
- movd mm3, DWORD PTR [r2_ptr+ref_stride]
- movd mm7, DWORD PTR [r3_ptr+ref_stride]
-
- psadbw mm1, mm0
-
- punpcklbw mm4, mm2
- punpcklbw mm5, mm3
-
- punpcklbw mm6, mm7
- psadbw mm4, mm0
-
- psadbw mm5, mm0
- psadbw mm6, mm0
-
-
-
- lea src_ptr, [src_ptr+src_stride*2]
- lea r0_ptr, [r0_ptr+ref_stride*2]
-
- lea r1_ptr, [r1_ptr+ref_stride*2]
- lea r2_ptr, [r2_ptr+ref_stride*2]
-
- lea r3_ptr, [r3_ptr+ref_stride*2]
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm2, DWORD PTR [r0_ptr]
-
- movd mm3, DWORD PTR [src_ptr+src_stride]
- movd mm7, DWORD PTR [r0_ptr+ref_stride]
-
- punpcklbw mm0, mm3
- punpcklbw mm2, mm7
-
- movd mm3, DWORD PTR [r1_ptr]
- movd mm7, DWORD PTR [r2_ptr]
-
- psadbw mm2, mm0
-%if ABI_IS_32BIT
- mov rax, rbp
-
- pop rbp
-%define ref_stride rax
-%endif
- mov rsi, result_ptr
-
- paddw mm1, mm2
- movd [rsi], mm1
-
- movd mm2, DWORD PTR [r1_ptr+ref_stride]
- movd mm1, DWORD PTR [r2_ptr+ref_stride]
-
- punpcklbw mm3, mm2
- punpcklbw mm7, mm1
-
- psadbw mm3, mm0
- psadbw mm7, mm0
-
- movd mm2, DWORD PTR [r3_ptr]
- movd mm1, DWORD PTR [r3_ptr+ref_stride]
-
- paddw mm3, mm4
- paddw mm7, mm5
-
- movd [rsi+4], mm3
- punpcklbw mm2, mm1
-
- movd [rsi+8], mm7
- psadbw mm2, mm0
-
- paddw mm2, mm6
- movd [rsi+12], mm2
-
-
- STACK_FRAME_DESTROY_X4
-