summary refs log tree commit diff
path: root/vpx_dsp/x86
diff options
context:
space:
mode:
author Johann Koenig <johannkoenig@google.com> 2022-03-31 00:45:31 +0000
committer Gerrit Code Review <noreply-gerritcodereview@google.com> 2022-03-31 00:45:31 +0000
commit 6d1844e54d132c6c2078f529b511ab443bc910ac (patch)
tree dc74fd5cadbd3164d16f6bd37e1252a02fbe4e6a /vpx_dsp/x86
parent 2200039d33c49a9f7a5c438656df143755b022c4 (diff)
parent afd60bd07d41e5d20a0b11eeeb104846d9517c65 (diff)
download libvpx-6d1844e54d132c6c2078f529b511ab443bc910ac.tar
libvpx-6d1844e54d132c6c2078f529b511ab443bc910ac.tar.gz
libvpx-6d1844e54d132c6c2078f529b511ab443bc910ac.tar.bz2
libvpx-6d1844e54d132c6c2078f529b511ab443bc910ac.zip
Merge "remove sad x3,x8 specializations" into main
Diffstat (limited to 'vpx_dsp/x86')
-rw-r--r-- vpx_dsp/x86/sad4d_avx2.c 57
-rw-r--r-- vpx_dsp/x86/sad_sse3.asm 376
-rw-r--r-- vpx_dsp/x86/sad_sse4.asm 361
-rw-r--r-- vpx_dsp/x86/sad_ssse3.asm 372
4 files changed, 0 insertions, 1166 deletions
diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c
index 81f1a916f..399b67b3f 100644
--- a/vpx_dsp/x86/sad4d_avx2.c
+++ b/vpx_dsp/x86/sad4d_avx2.c
@@ -73,63 +73,6 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
calc_final_4(sums, sad_array);
}
-void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- uint32_t sad_array[8]) {
- int i;
- __m256i sums[8];
-
- sums[0] = _mm256_setzero_si256();
- sums[1] = _mm256_setzero_si256();
- sums[2] = _mm256_setzero_si256();
- sums[3] = _mm256_setzero_si256();
- sums[4] = _mm256_setzero_si256();
- sums[5] = _mm256_setzero_si256();
- sums[6] = _mm256_setzero_si256();
- sums[7] = _mm256_setzero_si256();
-
- for (i = 0; i < 32; i++) {
- __m256i r[8];
-
- // load src and all ref[]
- const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
- r[0] = _mm256_loadu_si256((const __m256i *)&ref_ptr[0]);
- r[1] = _mm256_loadu_si256((const __m256i *)&ref_ptr[1]);
- r[2] = _mm256_loadu_si256((const __m256i *)&ref_ptr[2]);
- r[3] = _mm256_loadu_si256((const __m256i *)&ref_ptr[3]);
- r[4] = _mm256_loadu_si256((const __m256i *)&ref_ptr[4]);
- r[5] = _mm256_loadu_si256((const __m256i *)&ref_ptr[5]);
- r[6] = _mm256_loadu_si256((const __m256i *)&ref_ptr[6]);
- r[7] = _mm256_loadu_si256((const __m256i *)&ref_ptr[7]);
-
- // sum of the absolute differences between every ref[] to src
- r[0] = _mm256_sad_epu8(r[0], s);
- r[1] = _mm256_sad_epu8(r[1], s);
- r[2] = _mm256_sad_epu8(r[2], s);
- r[3] = _mm256_sad_epu8(r[3], s);
- r[4] = _mm256_sad_epu8(r[4], s);
- r[5] = _mm256_sad_epu8(r[5], s);
- r[6] = _mm256_sad_epu8(r[6], s);
- r[7] = _mm256_sad_epu8(r[7], s);
-
- // sum every ref[]
- sums[0] = _mm256_add_epi32(sums[0], r[0]);
- sums[1] = _mm256_add_epi32(sums[1], r[1]);
- sums[2] = _mm256_add_epi32(sums[2], r[2]);
- sums[3] = _mm256_add_epi32(sums[3], r[3]);
- sums[4] = _mm256_add_epi32(sums[4], r[4]);
- sums[5] = _mm256_add_epi32(sums[5], r[5]);
- sums[6] = _mm256_add_epi32(sums[6], r[6]);
- sums[7] = _mm256_add_epi32(sums[7], r[7]);
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- }
-
- calc_final_4(sums, sad_array);
- calc_final_4(sums + 4, sad_array + 4);
-}
-
void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
diff --git a/vpx_dsp/x86/sad_sse3.asm b/vpx_dsp/x86/sad_sse3.asm
deleted file mode 100644
index acbd2e4fa..000000000
--- a/vpx_dsp/x86/sad_sse3.asm
+++ /dev/null
@@ -1,376 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
- %define src_ptr rsi
- %define src_stride rax
- %define ref_ptr rdi
- %define ref_stride rdx
- %define end_ptr rcx
- %define ret_var rbx
- %define result_ptr arg(4)
- %define height dword ptr arg(4)
- push rbp
- mov rbp, rsp
- push rsi
- push rdi
- push rbx
-
- mov rsi, arg(0) ; src_ptr
- mov rdi, arg(2) ; ref_ptr
-
- movsxd rax, dword ptr arg(1) ; src_stride
- movsxd rdx, dword ptr arg(3) ; ref_stride
-%else
- %if LIBVPX_YASM_WIN64
- SAVE_XMM 7, u
- %define src_ptr rcx
- %define src_stride rdx
- %define ref_ptr r8
- %define ref_stride r9
- %define end_ptr r10
- %define ret_var r11
- %define result_ptr [rsp+xmm_stack_space+8+4*8]
- %define height dword ptr [rsp+xmm_stack_space+8+4*8]
- %else
- %define src_ptr rdi
- %define src_stride rsi
- %define ref_ptr rdx
- %define ref_stride rcx
- %define end_ptr r9
- %define ret_var r10
- %define result_ptr r8
- %define height r8
- %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
- %define src_ptr
- %define src_stride
- %define ref_ptr
- %define ref_stride
- %define end_ptr
- %define ret_var
- %define result_ptr
- %define height
-
-%if ABI_IS_32BIT
- pop rbx
- pop rdi
- pop rsi
- pop rbp
-%else
- %if LIBVPX_YASM_WIN64
- RESTORE_XMM
- %endif
-%endif
- ret
-%endmacro
-
-%macro PROCESS_16X2X3 5
-%if %1==0
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm5, XMMWORD PTR [%3]
- lddqu xmm6, XMMWORD PTR [%3+1]
- lddqu xmm7, XMMWORD PTR [%3+2]
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm1, XMMWORD PTR [%3]
- lddqu xmm2, XMMWORD PTR [%3+1]
- lddqu xmm3, XMMWORD PTR [%3+2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [%2+%4]
- lddqu xmm1, XMMWORD PTR [%3+%5]
- lddqu xmm2, XMMWORD PTR [%3+%5+1]
- lddqu xmm3, XMMWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
- lea %2, [%2+%4*2]
- lea %3, [%3+%5*2]
-%endif
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_8X2X3 5
-%if %1==0
- movq mm0, QWORD PTR [%2]
- movq mm5, QWORD PTR [%3]
- movq mm6, QWORD PTR [%3+1]
- movq mm7, QWORD PTR [%3+2]
-
- psadbw mm5, mm0
- psadbw mm6, mm0
- psadbw mm7, mm0
-%else
- movq mm0, QWORD PTR [%2]
- movq mm1, QWORD PTR [%3]
- movq mm2, QWORD PTR [%3+1]
- movq mm3, QWORD PTR [%3+2]
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm5, mm1
- paddw mm6, mm2
- paddw mm7, mm3
-%endif
- movq mm0, QWORD PTR [%2+%4]
- movq mm1, QWORD PTR [%3+%5]
- movq mm2, QWORD PTR [%3+%5+1]
- movq mm3, QWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
- lea %2, [%2+%4*2]
- lea %3, [%3+%5*2]
-%endif
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm5, mm1
- paddw mm6, mm2
- paddw mm7, mm3
-%endmacro
-
-SECTION .text
-
-;void int vpx_sad16x16x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad16x16x3_sse3)
-sym(vpx_sad16x16x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+8], xmm0
-
- STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad16x8x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad16x8x3_sse3)
-sym(vpx_sad16x8x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+8], xmm0
-
- STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad8x16x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad8x16x3_sse3)
-sym(vpx_sad8x16x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- punpckldq mm5, mm6
-
- movq [rcx], mm5
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad8x8x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad8x8x3_sse3)
-sym(vpx_sad8x8x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- punpckldq mm5, mm6
-
- movq [rcx], mm5
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad4x4x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad4x4x3_sse3)
-sym(vpx_sad4x4x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm1, DWORD PTR [ref_ptr]
-
- movd mm2, DWORD PTR [src_ptr+src_stride]
- movd mm3, DWORD PTR [ref_ptr+ref_stride]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- movd mm4, DWORD PTR [ref_ptr+1]
- movd mm5, DWORD PTR [ref_ptr+2]
-
- movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
- movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
-
- psadbw mm1, mm0
-
- punpcklbw mm4, mm2
- punpcklbw mm5, mm3
-
- psadbw mm4, mm0
- psadbw mm5, mm0
-
- lea src_ptr, [src_ptr+src_stride*2]
- lea ref_ptr, [ref_ptr+ref_stride*2]
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm2, DWORD PTR [ref_ptr]
-
- movd mm3, DWORD PTR [src_ptr+src_stride]
- movd mm6, DWORD PTR [ref_ptr+ref_stride]
-
- punpcklbw mm0, mm3
- punpcklbw mm2, mm6
-
- movd mm3, DWORD PTR [ref_ptr+1]
- movd mm7, DWORD PTR [ref_ptr+2]
-
- psadbw mm2, mm0
-
- paddw mm1, mm2
-
- movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
- movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
-
- punpcklbw mm3, mm2
- punpcklbw mm7, mm6
-
- psadbw mm3, mm0
- psadbw mm7, mm0
-
- paddw mm3, mm4
- paddw mm7, mm5
-
- mov rcx, result_ptr
-
- punpckldq mm1, mm3
-
- movq [rcx], mm1
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
diff --git a/vpx_dsp/x86/sad_sse4.asm b/vpx_dsp/x86/sad_sse4.asm
deleted file mode 100644
index 0818ed5f0..000000000
--- a/vpx_dsp/x86/sad_sse4.asm
+++ /dev/null
@@ -1,361 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X8 1
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- movq xmm2, MMWORD PTR [rdi+16]
- punpcklqdq xmm1, xmm3
- punpcklqdq xmm3, xmm2
-
- movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
-
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm1, xmm2
- paddw xmm1, xmm3
- paddw xmm1, xmm4
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- movq xmm2, MMWORD PTR [rdi+16]
- punpcklqdq xmm5, xmm3
- punpcklqdq xmm3, xmm2
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
-
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm5, xmm2
- paddw xmm5, xmm3
- paddw xmm5, xmm4
-
- paddw xmm1, xmm5
-%endif
- movdqa xmm0, XMMWORD PTR [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- movq xmm2, MMWORD PTR [rdi+ rdx+16]
- punpcklqdq xmm5, xmm3
- punpcklqdq xmm3, xmm2
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm5, xmm2
- paddw xmm5, xmm3
- paddw xmm5, xmm4
-
- paddw xmm1, xmm5
-%endmacro
-
-%macro PROCESS_8X2X8 1
-%if %1
- movq xmm0, MMWORD PTR [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm1, xmm3
-
- movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm1, xmm2
-%else
- movq xmm0, MMWORD PTR [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm5, xmm3
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm5, xmm2
-
- paddw xmm1, xmm5
-%endif
- movq xmm0, MMWORD PTR [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- punpcklqdq xmm5, xmm3
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm5, xmm2
-
- paddw xmm1, xmm5
-%endmacro
-
-%macro PROCESS_4X2X8 1
-%if %1
- movd xmm0, [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm1, xmm3
-
- mpsadbw xmm1, xmm0, 0x0
-%else
- movd xmm0, [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm5, xmm3
-
- mpsadbw xmm5, xmm0, 0x0
-
- paddw xmm1, xmm5
-%endif
- movd xmm0, [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- punpcklqdq xmm5, xmm3
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- mpsadbw xmm5, xmm0, 0x0
-
- paddw xmm1, xmm5
-%endmacro
-
-%macro WRITE_AS_INTS 0
- mov rdi, arg(4) ;Results
- pxor xmm0, xmm0
- movdqa xmm2, xmm1
- punpcklwd xmm1, xmm0
- punpckhwd xmm2, xmm0
-
- movdqa [rdi], xmm1
- movdqa [rdi + 16], xmm2
-%endmacro
-
-SECTION .text
-
-;void vpx_sad16x16x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array);
-globalsym(vpx_sad16x16x8_sse4_1)
-sym(vpx_sad16x16x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_16X2X8 1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_sad16x8x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-globalsym(vpx_sad16x8x8_sse4_1)
-sym(vpx_sad16x8x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_16X2X8 1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_sad8x8x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-globalsym(vpx_sad8x8x8_sse4_1)
-sym(vpx_sad8x8x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_8X2X8 1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_sad8x16x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-globalsym(vpx_sad8x16x8_sse4_1)
-sym(vpx_sad8x16x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_8X2X8 1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_sad4x4x8_sse4_1(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-globalsym(vpx_sad4x4x8_sse4_1)
-sym(vpx_sad4x4x8_sse4_1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_4X2X8 1
- PROCESS_4X2X8 0
-
- WRITE_AS_INTS
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-
diff --git a/vpx_dsp/x86/sad_ssse3.asm b/vpx_dsp/x86/sad_ssse3.asm
deleted file mode 100644
index a5bc6d730..000000000
--- a/vpx_dsp/x86/sad_ssse3.asm
+++ /dev/null
@@ -1,372 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X3 1
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm5, XMMWORD PTR [rdi]
- lddqu xmm6, XMMWORD PTR [rdi+1]
- lddqu xmm7, XMMWORD PTR [rdi+2]
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm1, XMMWORD PTR [rdi]
- lddqu xmm2, XMMWORD PTR [rdi+1]
- lddqu xmm3, XMMWORD PTR [rdi+2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [rsi+rax]
- lddqu xmm1, XMMWORD PTR [rdi+rdx]
- lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
- lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_16X2X3_OFFSET 2
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- movdqa xmm4, XMMWORD PTR [rdi]
- movdqa xmm7, XMMWORD PTR [rdi+16]
-
- movdqa xmm5, xmm7
- palignr xmm5, xmm4, %2
-
- movdqa xmm6, xmm7
- palignr xmm6, xmm4, (%2+1)
-
- palignr xmm7, xmm4, (%2+2)
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- movdqa xmm4, XMMWORD PTR [rdi]
- movdqa xmm3, XMMWORD PTR [rdi+16]
-
- movdqa xmm1, xmm3
- palignr xmm1, xmm4, %2
-
- movdqa xmm2, xmm3
- palignr xmm2, xmm4, (%2+1)
-
- palignr xmm3, xmm4, (%2+2)
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [rsi+rax]
- movdqa xmm4, XMMWORD PTR [rdi+rdx]
- movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
-
- movdqa xmm1, xmm3
- palignr xmm1, xmm4, %2
-
- movdqa xmm2, xmm3
- palignr xmm2, xmm4, (%2+1)
-
- palignr xmm3, xmm4, (%2+2)
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_16X16X3_OFFSET 2
-%2_aligned_by_%1:
-
- sub rdi, %1
-
- PROCESS_16X2X3_OFFSET 1, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
-
- jmp %2_store_off
-
-%endmacro
-
-%macro PROCESS_16X8X3_OFFSET 2
-%2_aligned_by_%1:
-
- sub rdi, %1
-
- PROCESS_16X2X3_OFFSET 1, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
-
- jmp %2_store_off
-
-%endmacro
-
-SECTION .text
-
-;void int vpx_sad16x16x3_ssse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad16x16x3_ssse3)
-sym(vpx_sad16x16x3_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- push rcx
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rdx, 0xf
- and rdx, rdi
-
- jmp .vpx_sad16x16x3_ssse3_skiptable
-.vpx_sad16x16x3_ssse3_jumptable:
- dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_6 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
- dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
-.vpx_sad16x16x3_ssse3_skiptable:
-
- call .vpx_sad16x16x3_ssse3_do_jump
-.vpx_sad16x16x3_ssse3_do_jump:
- pop rcx ; get the address of do_jump
- mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable
-
- movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
- add rcx, rax
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- jmp rcx
-
- PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3
-
-.vpx_sad16x16x3_ssse3_aligned_by_15:
- PROCESS_16X2X3 1
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
-
-.vpx_sad16x16x3_ssse3_store_off:
- mov rdi, arg(4) ;Results
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rdi], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rdi+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rdi+8], xmm0
-
- ; begin epilog
- pop rcx
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void int vpx_sad16x8x3_ssse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-globalsym(vpx_sad16x8x3_ssse3)
-sym(vpx_sad16x8x3_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- push rcx
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rdx, 0xf
- and rdx, rdi
-
- jmp .vpx_sad16x8x3_ssse3_skiptable
-.vpx_sad16x8x3_ssse3_jumptable:
- dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
- dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
-.vpx_sad16x8x3_ssse3_skiptable:
-
- call .vpx_sad16x8x3_ssse3_do_jump
-.vpx_sad16x8x3_ssse3_do_jump:
- pop rcx ; get the address of do_jump
- mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable
-
- movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
- add rcx, rax
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- jmp rcx
-
- PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3
-
-.vpx_sad16x8x3_ssse3_aligned_by_15:
-
- PROCESS_16X2X3 1
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
-
-.vpx_sad16x8x3_ssse3_store_off:
- mov rdi, arg(4) ;Results
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rdi], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rdi+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rdi+8], xmm0
-
- ; begin epilog
- pop rcx
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret