summaryrefslogtreecommitdiff
path: root/vp8/common/x86
diff options
context:
space:
mode:
Diffstat (limited to 'vp8/common/x86')
-rw-r--r--vp8/common/x86/subpixel_ssse3.asm928
-rw-r--r--vp8/common/x86/subpixel_x86.h32
-rw-r--r--vp8/common/x86/vp8_asm_stubs.c191
-rw-r--r--vp8/common/x86/x86_systemdependent.c11
4 files changed, 0 insertions, 1162 deletions
diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm
deleted file mode 100644
index 6b3f9994f..000000000
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ /dev/null
@@ -1,928 +0,0 @@
-;
-; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP8_FILTER_WEIGHT 128
-%define VP8_FILTER_SHIFT 7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;*************************************************************************************/
-;void vp8_filter_block1d8_h6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; unsigned int vp8_filter_index
-;)
-global sym(vp8_filter_block1d8_h6_ssse3)
-sym(vp8_filter_block1d8_h6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4
-
- movdqa xmm7, [rd GLOBAL]
-
- lea rax, [k0_k5 GLOBAL]
- add rax, rdx
- mov rdi, arg(2) ;output_ptr
-
- cmp esi, DWORD PTR [rax]
- je vp8_filter_block1d8_h4_ssse3
-
- movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
- sub rdi, rdx
-;xmm3 free
-filter_block1d8_h6_rowloop_ssse3:
- movdqu xmm0, XMMWORD PTR [rsi - 2]
-
- movdqa xmm1, xmm0
- pshufb xmm0, [shuf1b GLOBAL]
-
- movdqa xmm2, xmm1
- pshufb xmm1, [shuf2b GLOBAL]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm5
-
- pshufb xmm2, [shuf3b GLOBAL]
- add rdi, rdx
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
- dec rcx
- paddsw xmm0, xmm1
- paddsw xmm0, xmm7
- paddsw xmm0, xmm2
- psraw xmm0, 7
- packuswb xmm0, xmm0
-
- movq MMWORD Ptr [rdi], xmm0
- jnz filter_block1d8_h6_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-vp8_filter_block1d8_h4_ssse3:
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- movdqa xmm3, XMMWORD PTR [shuf2b GLOBAL]
- movdqa xmm4, XMMWORD PTR [shuf3b GLOBAL]
-
- mov rsi, arg(0) ;src_ptr
-
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
- sub rdi, rdx
-;xmm3 free
-filter_block1d8_h4_rowloop_ssse3:
- movdqu xmm0, XMMWORD PTR [rsi - 2]
-
- movdqa xmm2, xmm0
- pshufb xmm0, xmm3 ;[shuf2b GLOBAL]
- pshufb xmm2, xmm4 ;[shuf3b GLOBAL]
-
- pmaddubsw xmm0, xmm5
- add rdi, rdx
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
- dec rcx
- paddsw xmm0, xmm7
- paddsw xmm0, xmm2
- psraw xmm0, 7
- packuswb xmm0, xmm0
-
- movq MMWORD Ptr [rdi], xmm0
-
- jnz filter_block1d8_h4_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-;void vp8_filter_block1d16_h6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; unsigned int vp8_filter_index
-;)
-global sym(vp8_filter_block1d16_h6_ssse3)
-sym(vp8_filter_block1d16_h6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [k0_k5 GLOBAL]
- add rax, rdx
-
- mov rdi, arg(2) ;output_ptr
- movdqa xmm7, [rd GLOBAL]
-
-;;
-;; cmp esi, DWORD PTR [rax]
-;; je vp8_filter_block1d16_h4_ssse3
-
- mov rsi, arg(0) ;src_ptr
-
- movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
-filter_block1d16_h6_rowloop_ssse3:
- movdqu xmm0, XMMWORD PTR [rsi - 2]
-
- movdqa xmm1, xmm0
- pshufb xmm0, [shuf1b GLOBAL]
- movdqa xmm2, xmm1
- pmaddubsw xmm0, xmm4
- pshufb xmm1, [shuf2b GLOBAL]
- pshufb xmm2, [shuf3b GLOBAL]
- pmaddubsw xmm1, xmm5
-
- movdqu xmm3, XMMWORD PTR [rsi + 6]
-
- pmaddubsw xmm2, xmm6
- paddsw xmm0, xmm1
- movdqa xmm1, xmm3
- pshufb xmm3, [shuf1b GLOBAL]
- paddsw xmm0, xmm7
- pmaddubsw xmm3, xmm4
- paddsw xmm0, xmm2
- movdqa xmm2, xmm1
- pshufb xmm1, [shuf2b GLOBAL]
- pshufb xmm2, [shuf3b GLOBAL]
- pmaddubsw xmm1, xmm5
- pmaddubsw xmm2, xmm6
-
- psraw xmm0, 7
- packuswb xmm0, xmm0
- lea rsi, [rsi + rax]
- paddsw xmm3, xmm1
- paddsw xmm3, xmm7
- paddsw xmm3, xmm2
- psraw xmm3, 7
- packuswb xmm3, xmm3
-
- punpcklqdq xmm0, xmm3
-
- movdqa XMMWORD Ptr [rdi], xmm0
-
- add rdi, rdx
- dec rcx
- jnz filter_block1d16_h6_rowloop_ssse3
-
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-vp8_filter_block1d16_h4_ssse3:
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
-filter_block1d16_h4_rowloop_ssse3:
- movdqu xmm1, XMMWORD PTR [rsi - 2]
-
- movdqa xmm2, xmm1
- pshufb xmm1, [shuf2b GLOBAL]
- pshufb xmm2, [shuf3b GLOBAL]
- pmaddubsw xmm1, xmm5
-
- movdqu xmm3, XMMWORD PTR [rsi + 6]
-
- pmaddubsw xmm2, xmm6
- movdqa xmm0, xmm3
- pshufb xmm3, [shuf3b GLOBAL]
- pshufb xmm0, [shuf2b GLOBAL]
-
- paddsw xmm1, xmm7
- paddsw xmm1, xmm2
-
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm3, xmm6
-
- psraw xmm1, 7
- packuswb xmm1, xmm1
- lea rsi, [rsi + rax]
- paddsw xmm3, xmm0
- paddsw xmm3, xmm7
- psraw xmm3, 7
- packuswb xmm3, xmm3
-
- punpcklqdq xmm1, xmm3
-
- movdqa XMMWORD Ptr [rdi], xmm1
-
- add rdi, rdx
- dec rcx
- jnz filter_block1d16_h4_rowloop_ssse3
-
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_filter_block1d4_h6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; unsigned int vp8_filter_index
-;)
-global sym(vp8_filter_block1d4_h6_ssse3)
-sym(vp8_filter_block1d4_h6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [k0_k5 GLOBAL]
- add rax, rdx
- movdqa xmm7, [rd GLOBAL]
-
- cmp esi, DWORD PTR [rax]
- je vp8_filter_block1d4_h4_ssse3
-
- movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
-;xmm3 free
-filter_block1d4_h6_rowloop_ssse3:
- movdqu xmm0, XMMWORD PTR [rsi - 2]
-
- movdqa xmm1, xmm0
- pshufb xmm0, [shuf1b GLOBAL]
-
- movdqa xmm2, xmm1
- pshufb xmm1, [shuf2b GLOBAL]
- pmaddubsw xmm0, xmm4
- pshufb xmm2, [shuf3b GLOBAL]
- pmaddubsw xmm1, xmm5
-
-;--
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
-;--
- paddsw xmm0, xmm1
- paddsw xmm0, xmm7
- pxor xmm1, xmm1
- paddsw xmm0, xmm2
- psraw xmm0, 7
- packuswb xmm0, xmm0
-
- movd DWORD PTR [rdi], xmm0
-
- add rdi, rdx
- dec rcx
- jnz filter_block1d4_h6_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-vp8_filter_block1d4_h4_ssse3:
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
- movdqa xmm0, XMMWORD PTR [shuf2b GLOBAL]
- movdqa xmm3, XMMWORD PTR [shuf3b GLOBAL]
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
-filter_block1d4_h4_rowloop_ssse3:
- movdqu xmm1, XMMWORD PTR [rsi - 2]
-
- movdqa xmm2, xmm1
- pshufb xmm1, xmm0 ;;[shuf2b GLOBAL]
- pshufb xmm2, xmm3 ;;[shuf3b GLOBAL]
- pmaddubsw xmm1, xmm5
-
-;--
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
-;--
- paddsw xmm1, xmm7
- paddsw xmm1, xmm2
- psraw xmm1, 7
- packuswb xmm1, xmm1
-
- movd DWORD PTR [rdi], xmm1
-
- add rdi, rdx
- dec rcx
- jnz filter_block1d4_h4_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;void vp8_filter_block1d16_v6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; unsigned int vp8_filter_index
-;)
-global sym(vp8_filter_block1d16_v6_ssse3)
-sym(vp8_filter_block1d16_v6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [k0_k5 GLOBAL]
- add rax, rdx
-
- cmp esi, DWORD PTR [rax]
- je vp8_filter_block1d16_v4_ssse3
-
- movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ;out_pitch
-%endif
- mov rax, rsi
- movsxd rcx, DWORD PTR arg(4) ;output_height
- add rax, rdx
-
-
-vp8_filter_block1d16_v6_ssse3_loop:
- movq xmm1, MMWORD PTR [rsi] ;A
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
-
- pmaddubsw xmm3, xmm6
- punpcklbw xmm1, xmm0 ;A F
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm5
-
- paddsw xmm2, xmm3
- paddsw xmm2, xmm1
- paddsw xmm2, [rd GLOBAL]
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi], xmm2 ;store the results
-
- movq xmm1, MMWORD PTR [rsi + 8] ;A
- movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F
- pmaddubsw xmm3, xmm6
- punpcklbw xmm1, xmm0 ;A F
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm5
-
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm2, xmm3
- paddsw xmm2, xmm1
- paddsw xmm2, [rd GLOBAL]
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi+8], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;out_pitch
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz vp8_filter_block1d16_v6_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-vp8_filter_block1d16_v4_ssse3:
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ;out_pitch
-%endif
- mov rax, rsi
- movsxd rcx, DWORD PTR arg(4) ;output_height
- add rax, rdx
-
-vp8_filter_block1d16_v4_ssse3_loop:
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- pmaddubsw xmm3, xmm6
- pmaddubsw xmm2, xmm7
- movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B
- movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
-
- paddsw xmm2, [rd GLOBAL]
- paddsw xmm2, xmm3
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- punpcklbw xmm5, xmm4 ;B D
- punpcklbw xmm1, xmm0 ;C E
-
- pmaddubsw xmm1, xmm6
- pmaddubsw xmm5, xmm7
-
- movdqa xmm4, [rd GLOBAL]
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm5, xmm1
- paddsw xmm5, xmm4
- psraw xmm5, 7
- packuswb xmm5, xmm5
-
- punpcklqdq xmm2, xmm5
-
- movdqa XMMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;out_pitch
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz vp8_filter_block1d16_v4_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_filter_block1d8_v6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; unsigned int vp8_filter_index
-;)
-global sym(vp8_filter_block1d8_v6_ssse3)
-sym(vp8_filter_block1d8_v6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [k0_k5 GLOBAL]
- add rax, rdx
-
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ; out_pitch
-%endif
- movsxd rcx, DWORD PTR arg(4) ;[output_height]
-
- cmp esi, DWORD PTR [rax]
- je vp8_filter_block1d8_v4_ssse3
-
- movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-vp8_filter_block1d8_v6_ssse3_loop:
- movq xmm1, MMWORD PTR [rsi] ;A
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
- movdqa xmm4, [rd GLOBAL]
-
- pmaddubsw xmm3, xmm6
- punpcklbw xmm1, xmm0 ;A F
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm5
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm2, xmm3
- paddsw xmm2, xmm1
- paddsw xmm2, xmm4
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz vp8_filter_block1d8_v6_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-vp8_filter_block1d8_v4_ssse3:
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
- movdqa xmm5, [rd GLOBAL]
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-vp8_filter_block1d8_v4_ssse3_loop:
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- pmaddubsw xmm3, xmm6
- pmaddubsw xmm2, xmm7
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm2, xmm3
- paddsw xmm2, xmm5
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz vp8_filter_block1d8_v4_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-;void vp8_filter_block1d4_v6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; unsigned int vp8_filter_index
-;)
-global sym(vp8_filter_block1d4_v6_ssse3)
-sym(vp8_filter_block1d4_v6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [k0_k5 GLOBAL]
- add rax, rdx
-
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ; out_pitch
-%endif
- movsxd rcx, DWORD PTR arg(4) ;[output_height]
-
- cmp esi, DWORD PTR [rax]
- je vp8_filter_block1d4_v4_ssse3
-
- movq mm5, MMWORD PTR [rax] ;k0_k5
- movq mm6, MMWORD PTR [rax+256] ;k2_k4
- movq mm7, MMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-vp8_filter_block1d4_v6_ssse3_loop:
- movd mm1, DWORD PTR [rsi] ;A
- movd mm2, DWORD PTR [rsi + rdx] ;B
- movd mm3, DWORD PTR [rsi + rdx * 2] ;C
- movd mm4, DWORD PTR [rax + rdx * 2] ;D
- movd mm0, DWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw mm2, mm4 ;B D
- punpcklbw mm3, mm0 ;C E
-
- movd mm0, DWORD PTR [rax + rdx * 4] ;F
-
- movq mm4, [rd GLOBAL]
-
- pmaddubsw mm3, mm6
- punpcklbw mm1, mm0 ;A F
- pmaddubsw mm2, mm7
- pmaddubsw mm1, mm5
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw mm2, mm3
- paddsw mm2, mm1
- paddsw mm2, mm4
- psraw mm2, 7
- packuswb mm2, mm2
-
- movd DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz vp8_filter_block1d4_v6_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-vp8_filter_block1d4_v4_ssse3:
- movq mm6, MMWORD PTR [rax+256] ;k2_k4
- movq mm7, MMWORD PTR [rax+128] ;k1_k3
- movq mm5, MMWORD PTR [rd GLOBAL]
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-vp8_filter_block1d4_v4_ssse3_loop:
- movd mm2, DWORD PTR [rsi + rdx] ;B
- movd mm3, DWORD PTR [rsi + rdx * 2] ;C
- movd mm4, DWORD PTR [rax + rdx * 2] ;D
- movd mm0, DWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw mm2, mm4 ;B D
- punpcklbw mm3, mm0 ;C E
-
- pmaddubsw mm3, mm6
- pmaddubsw mm2, mm7
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw mm2, mm3
- paddsw mm2, mm5
- psraw mm2, 7
- packuswb mm2, mm2
-
- movd DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz vp8_filter_block1d4_v4_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-shuf1b:
- db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
-shuf2b:
- db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
-shuf3b:
- db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
-
-align 16
-rd:
- times 8 dw 0x40
-
-align 16
-k0_k5:
- times 8 db 0, 0 ;placeholder
- times 8 db 0, 0
- times 8 db 2, 1
- times 8 db 0, 0
- times 8 db 3, 3
- times 8 db 0, 0
- times 8 db 1, 2
- times 8 db 0, 0
-k1_k3:
- times 8 db 0, 0 ;placeholder
- times 8 db -6, 12
- times 8 db -11, 36
- times 8 db -9, 50
- times 8 db -16, 77
- times 8 db -6, 93
- times 8 db -8, 108
- times 8 db -1, 123
-k2_k4:
- times 8 db 128, 0 ;placeholder
- times 8 db 123, -1
- times 8 db 108, -8
- times 8 db 93, -6
- times 8 db 77, -16
- times 8 db 50, -9
- times 8 db 36, -11
- times 8 db 12, -6
-
diff --git a/vp8/common/x86/subpixel_x86.h b/vp8/common/x86/subpixel_x86.h
index b371892c9..9d6c8e53b 100644
--- a/vp8/common/x86/subpixel_x86.h
+++ b/vp8/common/x86/subpixel_x86.h
@@ -86,37 +86,5 @@ extern prototype_subpixel_predict(vp8_bilinear_predict8x8_sse2);
#endif
#endif
-#if HAVE_SSSE3
-extern prototype_subpixel_predict(vp8_sixtap_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp8_sixtap_predict8x8_ssse3);
-extern prototype_subpixel_predict(vp8_sixtap_predict8x4_ssse3);
-extern prototype_subpixel_predict(vp8_sixtap_predict4x4_ssse3);
-//extern prototype_subpixel_predict(vp8_bilinear_predict16x16_sse2);
-//extern prototype_subpixel_predict(vp8_bilinear_predict8x8_sse2);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_subpix_sixtap16x16
-#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_ssse3
-
-#undef vp8_subpix_sixtap8x8
-#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_ssse3
-
-#undef vp8_subpix_sixtap8x4
-#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_ssse3
-
-#undef vp8_subpix_sixtap4x4
-#define vp8_subpix_sixtap4x4 vp8_sixtap_predict4x4_ssse3
-
-
-//#undef vp8_subpix_bilinear16x16
-//#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_sse2
-
-//#undef vp8_subpix_bilinear8x8
-//#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_sse2
-
-#endif
-#endif
-
-
#endif
diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c
index 8b54b2327..0d7e095d7 100644
--- a/vp8/common/x86/vp8_asm_stubs.c
+++ b/vp8/common/x86/vp8_asm_stubs.c
@@ -360,194 +360,3 @@ void vp8_sixtap_predict8x4_sse2
#endif
-#if HAVE_SSSE3
-
-extern void vp8_filter_block1d8_h6_ssse3
-(
- unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp8_filter_index
-);
-
-extern void vp8_filter_block1d16_h6_ssse3
-(
- unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp8_filter_index
-);
-
-extern void vp8_filter_block1d16_v6_ssse3
-(
- unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp8_filter_index
-);
-
-extern void vp8_filter_block1d8_v6_ssse3
-(
- unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp8_filter_index
-);
-
-extern void vp8_filter_block1d4_h6_ssse3
-(
- unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp8_filter_index
-);
-
-extern void vp8_filter_block1d4_v6_ssse3
-(
- unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp8_filter_index
-);
-
-void vp8_sixtap_predict16x16_ssse3
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-
-)
-{
- DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24);
-
- if (xoffset)
- {
- if (yoffset)
- {
- vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 16, 21, xoffset);
- vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch, 16, yoffset);
- }
- else
- {
- // First-pass only
- vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, xoffset);
- }
- }
- else
- {
- // Second-pass only
- vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line) , src_pixels_per_line, dst_ptr, dst_pitch, 16, yoffset);
- }
-}
-
-void vp8_sixtap_predict8x8_ssse3
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-)
-{
- DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
-
- if (xoffset)
- {
- if (yoffset)
- {
- vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 8, 13, xoffset);
- vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
- }
- else
- {
- vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, xoffset);
- }
- }
- else
- {
- // Second-pass only
- vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, yoffset);
- }
-}
-
-
-void vp8_sixtap_predict8x4_ssse3
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-)
-{
- DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
-
- if (xoffset)
- {
- if (yoffset)
- {
- vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 8, 9, xoffset);
- vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
- }
- else
- {
- // First-pass only
- vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, xoffset);
- }
- }
- else
- {
- // Second-pass only
- vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset);
- }
-}
-
-void vp8_sixtap_predict4x4_ssse3
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-)
-{
- DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9);
-
- if (xoffset)
- {
- if (yoffset)
- {
- vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 4, 9, xoffset);
- vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
- }
- else
- {
- vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, xoffset);
- }
- }
- else
- {
- vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset);
- }
-
-}
-
-#endif
diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c
index ce487ff9f..021be92b6 100644
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -116,16 +116,5 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
#endif
-#if HAVE_SSSE3
-
- if (SSSE3Enabled)
- {
- rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_ssse3;
- rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_ssse3;
- rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_ssse3;
- rtcd->subpix.sixtap4x4 = vp8_sixtap_predict4x4_ssse3;
- }
-#endif
-
#endif
}