From d9fc451666c9895a0614df2f5a7ec60839a1c717 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Mon, 17 Jun 2013 16:54:09 -0700 Subject: Move subpixel variance function from common/ to encoder/. This seems to only be used in the encoder. Also remove an empty wrapper file that contained forward declarations for this function, but didn't actually define any actual functions. Change-Id: Ifc561eef7ebe374a7d03698055e51e105f6d614b --- vp9/common/x86/vp9_subpel_variance_impl_sse2.asm | 645 ---------------------- vp9/common/x86/vp9_subpixel_variance_sse2.c | 45 -- vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm | 645 ++++++++++++++++++++++ vp9/vp9_common.mk | 2 - vp9/vp9cx.mk | 1 + 5 files changed, 646 insertions(+), 692 deletions(-) delete mode 100644 vp9/common/x86/vp9_subpel_variance_impl_sse2.asm delete mode 100644 vp9/common/x86/vp9_subpixel_variance_sse2.c create mode 100644 vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm (limited to 'vp9') diff --git a/vp9/common/x86/vp9_subpel_variance_impl_sse2.asm b/vp9/common/x86/vp9_subpel_variance_impl_sse2.asm deleted file mode 100644 index 8a2a471f5..000000000 --- a/vp9/common/x86/vp9_subpel_variance_impl_sse2.asm +++ /dev/null @@ -1,645 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define xmm_filter_shift 7 - -;void vp9_filter_block2d_bil_var_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int xoffset, -; int yoffset, -; int *sum, -; unsigned int *sumsquared;; -; -;) -global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE -sym(vp9_filter_block2d_bil_var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - push rbx - ; end prolog - - pxor xmm6, xmm6 ; - pxor xmm7, xmm7 ; - - lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding - movdqa xmm4, XMMWORD PTR [rsi] - - lea rcx, [GLOBAL(bilinear_filters_sse2)] - movsxd rax, dword ptr arg(5) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je filter_block2d_bil_var_sse2_sp_only - - shl rax, 5 ; point to filter coeff with xoffset - lea rax, [rax + rcx] ; HFilter - - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; skip second_pass filter if yoffset=0 - je filter_block2d_bil_var_sse2_fp_only - - shl rdx, 5 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - - pxor xmm0, xmm0 ; - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - movdqa xmm5, xmm1 - - movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line - lea rsi, [rsi + rbx] -%if ABI_IS_32BIT=0 - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - -filter_block2d_bil_var_sse2_loop: - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 ; - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movdqa xmm3, xmm5 ; - movdqa xmm5, xmm1 ; - - pmullw xmm3, [rdx] ; - pmullw xmm1, [rdx+16] ; - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - lea rsi, [rsi + rbx] ;ref_pixels_per_line -%if ABI_IS_32BIT - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 ; - jnz filter_block2d_bil_var_sse2_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_sp_only: - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0 - je filter_block2d_bil_var_sse2_full_pixel - - shl rdx, 5 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; - movq xmm1, QWORD PTR [rsi] ; - punpcklbw xmm1, xmm0 ; - - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - lea rsi, [rsi + rax] - -filter_block2d_bil_sp_only_loop: - movq xmm3, QWORD PTR [rsi] ; - punpcklbw xmm3, xmm0 ; - movdqa xmm5, xmm3 - - pmullw xmm1, [rdx] ; - pmullw xmm3, [rdx+16] ; - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - movdqa xmm1, xmm5 ; - lea rsi, [rsi + rax] ;ref_pixels_per_line - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz filter_block2d_bil_sp_only_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_full_pixel: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - pxor xmm0, xmm0 ; - -filter_block2d_bil_full_pixel_loop: - movq xmm1, QWORD PTR [rsi] ; - punpcklbw xmm1, xmm0 ; - - movq xmm2, QWORD PTR [rdi] ; - punpcklbw xmm2, xmm0 ; - - psubw xmm1, xmm2 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - lea rsi, [rsi + rax] ;ref_pixels_per_line - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz filter_block2d_bil_full_pixel_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_fp_only: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - -filter_block2d_bil_fp_only_loop: - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 ; - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - lea rsi, [rsi + rdx] - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz filter_block2d_bil_fp_only_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_variance: - movdq2q mm6, xmm6 ; - movdq2q mm7, xmm7 ; - - psrldq xmm6, 8 - psrldq xmm7, 8 - - movdq2q mm2, xmm6 - movdq2q mm3, xmm7 - - paddw mm6, mm2 - paddd mm7, mm3 - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rsi, arg(7) ; sum - mov rdi, arg(8) ; sumsquared - - movd [rsi], mm2 ; xsum - movd [rdi], mm4 ; xxsum - - ; begin epilog - pop rbx - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - - -;void vp9_half_horiz_vert_variance16x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE -sym(vp9_half_horiz_vert_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rdx, dword ptr arg(3) ;src_pixels_per_line - - pxor xmm0, xmm0 ; - - movdqu xmm5, XMMWORD PTR [rsi] - movdqu xmm3, XMMWORD PTR [rsi+1] - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 - - lea rsi, [rsi + rax] - -.half_horiz_vert_variance16x_h_1: - movdqu xmm1, XMMWORD PTR [rsi] ; - movdqu xmm2, XMMWORD PTR [rsi+1] ; - pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 - - pavgb xmm5, xmm1 ; xmm = vertical average of the above - - movdqa xmm4, xmm5 - punpcklbw xmm5, xmm0 ; xmm5 = words of above - punpckhbw xmm4, xmm0 - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - psubw xmm5, xmm3 ; xmm5 -= xmm3 - - movq xmm3, QWORD PTR [rdi+8] - punpcklbw xmm3, xmm0 - psubw xmm4, xmm3 - - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm4 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm4, xmm4 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm4 - - movdqa xmm5, xmm1 ; save xmm1 for use on the next row - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 ; - jnz .half_horiz_vert_variance16x_h_1 ; - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_half_vert_variance16x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE -sym(vp9_half_vert_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr - - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rdx, dword ptr arg(3) ;src_pixels_per_line - - movdqu xmm5, XMMWORD PTR [rsi] - lea rsi, [rsi + rax ] - pxor xmm0, xmm0 - -.half_vert_variance16x_h_1: - movdqu xmm3, XMMWORD PTR [rsi] - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - movdqa xmm4, xmm5 - punpcklbw xmm5, xmm0 - punpckhbw xmm4, xmm0 - - movq xmm2, QWORD PTR [rdi] - punpcklbw xmm2, xmm0 - psubw xmm5, xmm2 - movq xmm2, QWORD PTR [rdi+8] - punpcklbw xmm2, xmm0 - psubw xmm4, xmm2 - - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm4 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm4, xmm4 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm4 - - movdqa xmm5, xmm3 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 - jnz .half_vert_variance16x_h_1 - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_half_horiz_variance16x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE -sym(vp9_half_horiz_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rdx, dword ptr arg(3) ;src_pixels_per_line - - pxor xmm0, xmm0 ; - -.half_horiz_variance16x_h_1: - movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15 - movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16 - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - movdqa xmm1, xmm5 - punpcklbw xmm5, xmm0 ; xmm5 = words of above - punpckhbw xmm1, xmm0 - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - movq xmm2, QWORD PTR [rdi+8] - punpcklbw xmm2, xmm0 - - psubw xmm5, xmm3 ; xmm5 -= xmm3 - psubw xmm1, xmm2 - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm1 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm1, xmm1 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm1 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 ; - jnz .half_horiz_variance16x_h_1 ; - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -bilinear_filters_sse2: - dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 - dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8 - dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 - dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24 - dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 - dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40 - dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 - dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56 - dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72 - dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 - dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88 - dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 - dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104 - dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 - dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120 diff --git a/vp9/common/x86/vp9_subpixel_variance_sse2.c b/vp9/common/x86/vp9_subpixel_variance_sse2.c deleted file mode 100644 index c20b9fbe9..000000000 --- a/vp9/common/x86/vp9_subpixel_variance_sse2.c +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#define HALFNDX 8 - -void vp9_half_horiz_variance16x_h_sse2(const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared); - -void vp9_half_vert_variance16x_h_sse2(const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared); - -void vp9_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared); - -void vp9_filter_block2d_bil_var_sse2(const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int xoffset, - int yoffset, - int *sum, - unsigned int *sumsquared); diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm new file mode 100644 index 000000000..8a2a471f5 --- /dev/null +++ b/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm @@ -0,0 +1,645 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define xmm_filter_shift 7 + +;void vp9_filter_block2d_bil_var_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int xoffset, +; int yoffset, +; int *sum, +; unsigned int *sumsquared;; +; +;) +global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE +sym(vp9_filter_block2d_bil_var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + push rbx + ; end prolog + + pxor xmm6, xmm6 ; + pxor xmm7, xmm7 ; + + lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding + movdqa xmm4, XMMWORD PTR [rsi] + + lea rcx, [GLOBAL(bilinear_filters_sse2)] + movsxd rax, dword ptr arg(5) ; xoffset + + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je filter_block2d_bil_var_sse2_sp_only + + shl rax, 5 ; point to filter coeff with xoffset + lea rax, [rax + rcx] ; HFilter + + movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; skip second_pass filter if yoffset=0 + je filter_block2d_bil_var_sse2_fp_only + + shl rdx, 5 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + + pxor xmm0, xmm0 ; + movq xmm1, QWORD PTR [rsi] ; + movq xmm3, QWORD PTR [rsi+1] ; + + punpcklbw xmm1, xmm0 ; + pmullw xmm1, [rax] ; + punpcklbw xmm3, xmm0 + pmullw xmm3, [rax+16] ; + + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + movdqa xmm5, xmm1 + + movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line + lea rsi, [rsi + rbx] +%if ABI_IS_32BIT=0 + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + +filter_block2d_bil_var_sse2_loop: + movq xmm1, QWORD PTR [rsi] ; + movq xmm3, QWORD PTR [rsi+1] ; + + punpcklbw xmm1, xmm0 ; + pmullw xmm1, [rax] ; + punpcklbw xmm3, xmm0 ; + pmullw xmm3, [rax+16] ; + + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movdqa xmm3, xmm5 ; + movdqa xmm5, xmm1 ; + + pmullw xmm3, [rdx] ; + pmullw xmm1, [rdx+16] ; + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movq xmm3, QWORD PTR [rdi] ; + punpcklbw xmm3, xmm0 ; + + psubw xmm1, xmm3 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + + lea rsi, [rsi + rbx] ;ref_pixels_per_line +%if ABI_IS_32BIT + add rdi, dword ptr arg(3) ;src_pixels_per_line +%else + lea rdi, [rdi + r9] +%endif + + sub rcx, 1 ; + jnz filter_block2d_bil_var_sse2_loop ; + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_sse2_sp_only: + movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0 + je filter_block2d_bil_var_sse2_full_pixel + + shl rdx, 5 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; + movq xmm1, QWORD PTR [rsi] ; + punpcklbw xmm1, xmm0 ; + + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + lea rsi, [rsi + rax] + +filter_block2d_bil_sp_only_loop: + movq xmm3, QWORD PTR [rsi] ; + punpcklbw xmm3, xmm0 ; + movdqa xmm5, xmm3 + + pmullw xmm1, [rdx] ; + pmullw xmm3, [rdx+16] ; + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movq xmm3, QWORD PTR [rdi] ; + punpcklbw xmm3, xmm0 ; + + psubw xmm1, xmm3 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + + movdqa xmm1, xmm5 ; + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz filter_block2d_bil_sp_only_loop ; + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_sse2_full_pixel: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + pxor xmm0, xmm0 ; + +filter_block2d_bil_full_pixel_loop: + movq xmm1, QWORD PTR [rsi] ; + punpcklbw xmm1, xmm0 ; + + movq xmm2, QWORD PTR [rdi] ; + punpcklbw xmm2, xmm0 ; + + psubw xmm1, xmm2 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz filter_block2d_bil_full_pixel_loop ; + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_sse2_fp_only: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + +filter_block2d_bil_fp_only_loop: + movq xmm1, QWORD PTR [rsi] ; + movq xmm3, QWORD PTR [rsi+1] ; + + punpcklbw xmm1, xmm0 ; + pmullw xmm1, [rax] ; + punpcklbw xmm3, xmm0 ; + pmullw xmm3, [rax+16] ; + + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movq xmm3, QWORD PTR [rdi] ; + punpcklbw xmm3, xmm0 ; + + psubw xmm1, xmm3 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + lea rsi, [rsi + rdx] + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz filter_block2d_bil_fp_only_loop ; + + jmp filter_block2d_bil_variance + +filter_block2d_bil_variance: + movdq2q mm6, xmm6 ; + movdq2q mm7, xmm7 ; + + psrldq xmm6, 8 + psrldq xmm7, 8 + + movdq2q mm2, xmm6 + movdq2q mm3, xmm7 + + paddw mm6, mm2 + paddd mm7, mm3 + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rsi, arg(7) ; sum + mov rdi, arg(8) ; sumsquared + + movd [rsi], mm2 ; xsum + movd [rdi], mm4 ; xxsum + + ; begin epilog + pop rbx + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + + +;void vp9_half_horiz_vert_variance16x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE +sym(vp9_half_horiz_vert_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rdx, dword ptr arg(3) ;src_pixels_per_line + + pxor xmm0, xmm0 ; + + movdqu xmm5, XMMWORD PTR [rsi] + movdqu xmm3, XMMWORD PTR [rsi+1] + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 + + lea rsi, [rsi + rax] + +.half_horiz_vert_variance16x_h_1: + movdqu xmm1, XMMWORD PTR [rsi] ; + movdqu xmm2, XMMWORD PTR [rsi+1] ; + pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 + + pavgb xmm5, xmm1 ; xmm = vertical average of the above + + movdqa xmm4, xmm5 + punpcklbw xmm5, xmm0 ; xmm5 = words of above + punpckhbw xmm4, xmm0 + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + psubw xmm5, xmm3 ; xmm5 -= xmm3 + + movq xmm3, QWORD PTR [rdi+8] + punpcklbw xmm3, xmm0 + psubw xmm4, xmm3 + + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm4 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm4, xmm4 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm4 + + movdqa xmm5, xmm1 ; save xmm1 for use on the next row + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 ; + jnz .half_horiz_vert_variance16x_h_1 ; + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_half_vert_variance16x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE +sym(vp9_half_vert_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr + + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rdx, dword ptr arg(3) ;src_pixels_per_line + + movdqu xmm5, XMMWORD PTR [rsi] + lea rsi, [rsi + rax ] + pxor xmm0, xmm0 + +.half_vert_variance16x_h_1: + movdqu xmm3, XMMWORD PTR [rsi] + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + movdqa xmm4, xmm5 + punpcklbw xmm5, xmm0 + punpckhbw xmm4, xmm0 + + movq xmm2, QWORD PTR [rdi] + punpcklbw xmm2, xmm0 + psubw xmm5, xmm2 + movq xmm2, QWORD PTR [rdi+8] + punpcklbw xmm2, xmm0 + psubw xmm4, xmm2 + + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm4 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm4, xmm4 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm4 + + movdqa xmm5, xmm3 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 + jnz .half_vert_variance16x_h_1 + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_half_horiz_variance16x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE +sym(vp9_half_horiz_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rdx, dword ptr arg(3) ;src_pixels_per_line + + pxor xmm0, xmm0 ; + +.half_horiz_variance16x_h_1: + movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15 + movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16 + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + movdqa xmm1, xmm5 + punpcklbw xmm5, xmm0 ; xmm5 = words of above + punpckhbw xmm1, xmm0 + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + movq xmm2, QWORD PTR [rdi+8] + punpcklbw xmm2, xmm0 + + psubw xmm5, xmm3 ; xmm5 -= xmm3 + psubw xmm1, xmm2 + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm1 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm1, xmm1 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm1 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 ; + jnz .half_horiz_variance16x_h_1 ; + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; +align 16 +xmm_bi_rd: + times 8 dw 64 +align 16 +bilinear_filters_sse2: + dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 + dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8 + dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 + dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24 + dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 + dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40 + dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 + dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56 + dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72 + dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 + dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88 + dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 + dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104 + dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 + dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120 diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index b6d50f8ef..1a1fba9a0 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -87,8 +87,6 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_wrapper_sse2.c -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpel_variance_impl_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_variance_sse2.c VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm ifeq ($(CONFIG_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 86fd08850..4bed6c0d7 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -86,6 +86,7 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm -- cgit v1.2.3