From 244e2e14510d3dcd058157ec1abe89963cfa7e89 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Thu, 3 Mar 2011 19:02:45 -0500 Subject: Write SSSE3 sub-pixel filter function 1. Process 16 pixels at one time instead of 8. 2. Add check for both xoffset =0 and yoffset=0, which happens during motion search. This change gave encoder 1%~3% performance gain. Change-Id: Idaa39506b48f4f8b2fbbeb45aae8226fa32afb3e --- vp8/encoder/x86/variance_impl_ssse3.asm | 348 ++++++++++++++++++++++++++++++++ vp8/encoder/x86/variance_ssse3.c | 140 +++++++++++++ vp8/encoder/x86/variance_x86.h | 4 + vp8/encoder/x86/x86_csystemdependent.c | 2 + 4 files changed, 494 insertions(+) create mode 100644 vp8/encoder/x86/variance_impl_ssse3.asm create mode 100644 vp8/encoder/x86/variance_ssse3.c (limited to 'vp8/encoder/x86') diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm new file mode 100644 index 000000000..b1976328d --- /dev/null +++ b/vp8/encoder/x86/variance_impl_ssse3.asm @@ -0,0 +1,348 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define xmm_filter_shift 7 + + +;void vp8_filter_block2d_bil_var_ssse3 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int xoffset, +; int yoffset, +; int *sum, +; unsigned int *sumsquared +; +;) +;Note: The filter coefficient at offset=0 is 128. Since the second register +;for Pmaddubsw is signed bytes, we must calculate zero offset separately.
+global sym(vp8_filter_block2d_bil_var_ssse3)
+sym(vp8_filter_block2d_bil_var_ssse3):          ; 16-pixel-wide bilinear filter of ref, then sum / sum-of-squares of (filtered ref - src)
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9                      ; the routine takes nine arguments, read below as arg(0)..arg(8)
+    SAVE_XMM
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+        pxor            xmm6, xmm6              ; xmm6 = running sum of differences (8 x 16-bit lanes)
+        pxor            xmm7, xmm7              ; xmm7 = running sum of squared differences (4 x 32-bit lanes)
+
+        lea             rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]   ; rcx = base of the filter table
+        movsxd          rax, dword ptr arg(5)   ; xoffset
+
+        cmp             rax, 0                  ; skip first_pass filter if xoffset=0
+        je              filter_block2d_bil_var_ssse3_sp_only
+
+        shl             rax, 4                  ; point to filter coeff with xoffset (16 bytes per table entry)
+        lea             rax, [rax + rcx]        ; HFilter
+
+        movsxd          rdx, dword ptr arg(6)   ; yoffset
+
+        cmp             rdx, 0                  ; skip second_pass filter if yoffset=0
+        je              filter_block2d_bil_var_ssse3_fp_only
+
+        shl             rdx, 4                  ; 16 bytes per table entry
+        lea             rdx, [rdx + rcx]        ; VFilter
+
+        mov             rsi, arg(0)             ;ref_ptr
+        mov             rdi, arg(2)             ;src_ptr
+        movsxd          rcx, dword ptr arg(4)   ;Height (rcx is the row counter from here on; table base no longer needed)
+
+        movdqu          xmm0, XMMWORD PTR [rsi]     ; first ref row, pixels 0..15
+        movdqu          xmm1, XMMWORD PTR [rsi+1]   ; same row, pixels 1..16 (reads 17 bytes of the row in total)
+        movdqa          xmm2, xmm0
+
+        punpcklbw       xmm0, xmm1              ; interleave (p[i], p[i+1]) byte pairs, pixels 0..7
+        punpckhbw       xmm2, xmm1              ; same, pixels 8..15
+        pmaddubsw       xmm0, [rax]             ; multiply each pair by the two horizontal taps and add
+        pmaddubsw       xmm2, [rax]
+
+        paddw           xmm0, [GLOBAL(xmm_bi_rd)]   ; + 64 rounding
+        paddw           xmm2, [GLOBAL(xmm_bi_rd)]
+        psraw           xmm0, xmm_filter_shift      ; >> 7
+        psraw           xmm2, xmm_filter_shift
+
+        packuswb        xmm0, xmm2              ; xmm0 = horizontally filtered first row as 16 bytes
+
+        movsxd          rbx, dword ptr arg(1)   ;ref_pixels_per_line  NOTE(review): overwrites rbx after GET_GOT rbx while GLOBAL() is still used below - confirm GLOBAL() does not rely on rbx in PIC builds
+        lea             rsi, [rsi + rbx]        ; advance ref to the second row
+%if ABI_IS_32BIT=0
+        movsxd          r9, dword ptr arg(3)    ;src_pixels_per_line (64-bit only; the 32-bit build adds arg(3) directly in the loop)
+%endif
+
+filter_block2d_bil_var_ssse3_loop:              ; one iteration per output row: H-filter next ref row, V-filter against previous, accumulate
+        movdqu          xmm1, XMMWORD PTR [rsi]     ; next ref row, pixels 0..15
+        movdqu          xmm2, XMMWORD PTR [rsi+1]   ; next ref row, pixels 1..16
+        movdqa          xmm3, xmm1
+
+        punpcklbw       xmm1, xmm2              ; (p[i], p[i+1]) pairs, pixels 0..7
+        punpckhbw       xmm3, xmm2              ; pixels 8..15
+        pmaddubsw       xmm1, [rax]             ; horizontal taps
+        pmaddubsw       xmm3, [rax]
+
+        paddw           xmm1, [GLOBAL(xmm_bi_rd)]
+        paddw           xmm3, [GLOBAL(xmm_bi_rd)]
+        psraw           xmm1, xmm_filter_shift
+        psraw           xmm3, xmm_filter_shift
+        packuswb        xmm1, xmm3              ; xmm1 = horizontally filtered current row as bytes
+
+        movdqa          xmm2, xmm0              ; xmm2 = previous filtered row
+        movdqa          xmm0, xmm1              ; keep current filtered row for the next iteration
+        movdqa          xmm3, xmm2
+
+        punpcklbw       xmm2, xmm1              ; interleave (previous, current) byte pairs, pixels 0..7
+        punpckhbw       xmm3, xmm1              ; pixels 8..15
+        pmaddubsw       xmm2, [rdx]             ; vertical taps
+        pmaddubsw       xmm3, [rdx]
+
+        paddw           xmm2, [GLOBAL(xmm_bi_rd)]
+        paddw           xmm3, [GLOBAL(xmm_bi_rd)]
+        psraw           xmm2, xmm_filter_shift  ; result stays as 16-bit words for the subtraction below
+        psraw           xmm3, xmm_filter_shift
+
+        movq            xmm1, QWORD PTR [rdi]   ; src pixels 0..7
+        pxor            xmm4, xmm4
+        punpcklbw       xmm1, xmm4              ; widen to words
+        movq            xmm5, QWORD PTR [rdi+8] ; src pixels 8..15
+        punpcklbw       xmm5, xmm4
+
+        psubw           xmm2, xmm1              ; diff = filtered ref - src
+        psubw           xmm3, xmm5
+        paddw           xmm6, xmm2              ; accumulate sum
+        paddw           xmm6, xmm3
+        pmaddwd         xmm2, xmm2              ; diff*diff, adjacent products added into dwords
+        pmaddwd         xmm3, xmm3
+        paddd           xmm7, xmm2              ; accumulate sum of squares
+        paddd           xmm7, xmm3
+
+        lea             rsi, [rsi + rbx]        ;ref_pixels_per_line
+%if ABI_IS_32BIT
+        add             rdi, dword ptr arg(3)   ;src_pixels_per_line
+%else
+        lea             rdi, [rdi + r9]
+%endif
+
+        sub             rcx, 1
+        jnz             filter_block2d_bil_var_ssse3_loop
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_ssse3_sp_only:           ; xoffset == 0: vertical (second-pass) filter only
+        movsxd          rdx, dword ptr arg(6)   ; yoffset
+
+        cmp             rdx, 0                  ; Both xoffset =0 and yoffset=0
+        je              filter_block2d_bil_var_ssse3_full_pixel
+
+        shl             rdx, 4                  ; 16 bytes per table entry
+        lea             rdx, [rdx + rcx]        ; VFilter
+
+        mov             rsi, arg(0)             ;ref_ptr
+        mov             rdi, arg(2)             ;src_ptr
+        movsxd          rcx, dword ptr arg(4)   ;Height
+        movsxd          rax, dword ptr arg(1)   ;ref_pixels_per_line
+
+        movdqu          xmm1, XMMWORD PTR [rsi] ; first ref row
+        movdqa          xmm0, xmm1              ; NOTE(review): xmm0 is overwritten at the top of the loop before being read
+
+        movsxd          rbx, dword ptr arg(3)   ;src_pixels_per_line  NOTE(review): rbx reused after GET_GOT rbx - see GLOBAL() uses in the loop
+        lea             rsi, [rsi + rax]        ; advance ref to the second row
+
+filter_block2d_bil_sp_only_loop:
+        movdqu          xmm3, XMMWORD PTR [rsi] ; next ref row
+        movdqa          xmm2, xmm1
+        movdqa          xmm0, xmm3              ; save next row; it becomes the previous row after this iteration
+
+        punpcklbw       xmm1, xmm3              ; interleave (previous, next) byte pairs, pixels 0..7
+        punpckhbw       xmm2, xmm3              ; pixels 8..15
+        pmaddubsw       xmm1, [rdx]             ; vertical taps
+        pmaddubsw       xmm2, [rdx]
+
+        paddw           xmm1, [GLOBAL(xmm_bi_rd)]
+        paddw           xmm2, [GLOBAL(xmm_bi_rd)]
+        psraw           xmm1, xmm_filter_shift
+        psraw           xmm2, xmm_filter_shift
+
+        movq            xmm3, QWORD PTR [rdi]   ; src pixels 0..7
+        pxor            xmm4, xmm4
+        punpcklbw       xmm3, xmm4
+        movq            xmm5, QWORD PTR [rdi+8] ; src pixels 8..15
+        punpcklbw       xmm5, xmm4
+
+        psubw           xmm1, xmm3              ; diff = filtered ref - src
+        psubw           xmm2, xmm5
+        paddw           xmm6, xmm1              ; accumulate sum
+        paddw           xmm6, xmm2
+        pmaddwd         xmm1, xmm1
+        pmaddwd         xmm2, xmm2
+        paddd           xmm7, xmm1              ; accumulate sum of squares
+        paddd           xmm7, xmm2
+
+        movdqa          xmm1, xmm0              ; previous row = row just read
+        lea             rsi, [rsi + rax]        ;ref_pixels_per_line
+        lea             rdi, [rdi + rbx]        ;src_pixels_per_line
+
+        sub             rcx, 1
+        jnz             filter_block2d_bil_sp_only_loop
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_ssse3_full_pixel:        ; xoffset == 0 and yoffset == 0: no filtering, plain ref - src
+        mov             rsi, arg(0)             ;ref_ptr
+        mov             rdi, arg(2)             ;src_ptr
+        movsxd          rcx, dword ptr arg(4)   ;Height
+        movsxd          rax, dword ptr arg(1)   ;ref_pixels_per_line
+        movsxd          rbx, dword ptr arg(3)   ;src_pixels_per_line
+        pxor            xmm0, xmm0              ; zero register used to widen bytes to words
+
+filter_block2d_bil_full_pixel_loop:
+        movq            xmm1, QWORD PTR [rsi]   ; ref pixels 0..7
+        punpcklbw       xmm1, xmm0
+        movq            xmm2, QWORD PTR [rsi+8] ; ref pixels 8..15
+        punpcklbw       xmm2, xmm0
+
+        movq            xmm3, QWORD PTR [rdi]   ; src pixels 0..7
+        punpcklbw       xmm3, xmm0
+        movq            xmm4, QWORD PTR [rdi+8] ; src pixels 8..15
+        punpcklbw       xmm4, xmm0
+
+        psubw           xmm1, xmm3              ; diff = ref - src
+        psubw           xmm2, xmm4
+        paddw           xmm6, xmm1              ; accumulate sum
+        paddw           xmm6, xmm2
+        pmaddwd         xmm1, xmm1
+        pmaddwd         xmm2, xmm2
+        paddd           xmm7, xmm1              ; accumulate sum of squares
+        paddd           xmm7, xmm2
+
+        lea             rsi, [rsi + rax]        ;ref_pixels_per_line
+        lea             rdi, [rdi + rbx]        ;src_pixels_per_line
+        sub             rcx, 1
+        jnz             filter_block2d_bil_full_pixel_loop
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_ssse3_fp_only:           ; yoffset == 0: horizontal (first-pass) filter only; rax still holds HFilter
+        mov             rsi, arg(0)             ;ref_ptr
+        mov             rdi, arg(2)             ;src_ptr
+        movsxd          rcx, dword ptr arg(4)   ;Height
+        movsxd          rdx, dword ptr arg(1)   ;ref_pixels_per_line
+
+        pxor            xmm0, xmm0              ; NOTE(review): xmm0 is not read anywhere in this path
+        movsxd          rbx, dword ptr arg(3)   ;src_pixels_per_line  NOTE(review): rbx reused after GET_GOT rbx - see GLOBAL() uses in the loop
+
+filter_block2d_bil_fp_only_loop:
+        movdqu          xmm1, XMMWORD PTR [rsi]     ; ref row, pixels 0..15
+        movdqu          xmm2, XMMWORD PTR [rsi+1]   ; ref row, pixels 1..16
+        movdqa          xmm3, xmm1
+
+        punpcklbw       xmm1, xmm2              ; (p[i], p[i+1]) pairs, pixels 0..7
+        punpckhbw       xmm3, xmm2              ; pixels 8..15
+        pmaddubsw       xmm1, [rax]             ; horizontal taps
+        pmaddubsw       xmm3, [rax]
+
+        paddw           xmm1, [GLOBAL(xmm_bi_rd)]
+        paddw           xmm3, [GLOBAL(xmm_bi_rd)]
+        psraw           xmm1, xmm_filter_shift
+        psraw           xmm3, xmm_filter_shift
+
+        movq            xmm2, XMMWORD PTR [rdi] ; src pixels 0..7  NOTE(review): XMMWORD PTR on a movq; the other loops use QWORD PTR for this load - confirm
+        pxor            xmm4, xmm4
+        punpcklbw       xmm2, xmm4
+        movq            xmm5, QWORD PTR [rdi+8] ; src pixels 8..15
+        punpcklbw       xmm5, xmm4
+
+        psubw           xmm1, xmm2              ; diff = filtered ref - src
+        psubw           xmm3, xmm5
+        paddw           xmm6, xmm1              ; accumulate sum
+        paddw           xmm6, xmm3
+        pmaddwd         xmm1, xmm1
+        pmaddwd         xmm3, xmm3
+        paddd           xmm7, xmm1              ; accumulate sum of squares
+        paddd           xmm7, xmm3
+
+        lea             rsi, [rsi + rdx]        ;ref_pixels_per_line
+        lea             rdi, [rdi + rbx]        ;src_pixels_per_line
+
+        sub             rcx, 1
+        jnz             filter_block2d_bil_fp_only_loop
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_variance:                    ; common tail: reduce xmm6 / xmm7 to scalars and store them
+        pxor            xmm0, xmm0
+        pxor            xmm1, xmm1
+        pxor            xmm5, xmm5
+
+        punpcklwd       xmm0, xmm6              ; place each 16-bit partial sum in the upper half of a dword...
+        punpckhwd       xmm1, xmm6
+        psrad           xmm0, 16                ; ...then shift right arithmetically to sign-extend it
+        psrad           xmm1, 16
+        paddd           xmm0, xmm1              ; four 32-bit partial sums
+        movdqa          xmm1, xmm0
+
+        movdqa          xmm6, xmm7
+        punpckldq       xmm6, xmm5              ; spread the squared-sum dwords into qword slots (upper halves zero)
+        punpckhdq       xmm7, xmm5
+        paddd           xmm6, xmm7
+
+        punpckldq       xmm0, xmm5              ; same spreading for the sum
+        punpckhdq       xmm1, xmm5
+        paddd           xmm0, xmm1
+
+        movdqa          xmm7, xmm6
+        movdqa          xmm1, xmm0
+
+        psrldq          xmm7, 8                 ; bring the upper qword down
+        psrldq          xmm1, 8
+
+        paddd           xmm6, xmm7              ; low dword of xmm6 = total sum of squares
+        paddd           xmm0, xmm1              ; low dword of xmm0 = total sum
+
+        mov             rsi, arg(7)             ;[Sum]
+        mov             rdi, arg(8)             ;[SSE]
+
+        movd            [rsi], xmm0
+        movd            [rdi], xmm6
+
+    ; begin epilog
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+xmm_bi_rd:                                      ; rounding constant added before the >> xmm_filter_shift
+    times 8 dw 64
+align 16
+vp8_bilinear_filters_ssse3:                     ; eight 16-byte entries of (tap0, tap1) byte pairs, indexed by offset
+    times 8 db 128, 0                           ; offset 0: never fed to pmaddubsw, zero offsets branch to the unfiltered paths
+    times 8 db 112, 16
+    times 8 db 96,  32
+    times 8 db 80,  48
+    times 8 db 64,  64
+    times 8 db 48,  80
+    times 8 db 32,  96
+    times 8 db 16,  112
diff --git a/vp8/encoder/x86/variance_ssse3.c b/vp8/encoder/x86/variance_ssse3.c
new file mode 100644
index 000000000..750ae8b86
--- /dev/null
+++ b/vp8/encoder/x86/variance_ssse3.c
@@ -0,0 +1,140 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/encoder/variance.h"
+#include "vp8/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern unsigned int vp8_get16x16var_sse2
+(
+    const unsigned char *src_ptr,
+    int source_stride,
+    const unsigned char *ref_ptr,
+    int recon_stride,
+    unsigned int *SSE,
+    int *Sum
+);
+extern void vp8_half_horiz_vert_variance16x_h_sse2
+(
+    const unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
+extern void vp8_half_horiz_variance16x_h_sse2
+(
+    const unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
+extern void vp8_half_vert_variance16x_h_sse2
+(
+    const unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
+extern void vp8_filter_block2d_bil_var_ssse3
+(
+    const unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int xoffset,
+    int yoffset,
+    int *sum,
+    unsigned int *sumsquared
+);
+/* 16x16 sub-pixel variance: stores the SSE in *sse, returns SSE - sum*sum/256. */
+unsigned int vp8_sub_pixel_variance16x16_ssse3
+(
+    const unsigned char  *src_ptr,      /* block that gets filtered (handed to the helpers as ref_ptr) */
+    int  src_pixels_per_line,           /* stride of src_ptr */
+    int  xoffset,                       /* horizontal sub-pixel offset; 4 selects the half-pixel helpers */
+    int  yoffset,                       /* vertical sub-pixel offset; 4 selects the half-pixel helpers */
+    const unsigned char *dst_ptr,       /* block compared against (handed to the helpers as src_ptr) */
+    int dst_pixels_per_line,            /* stride of dst_ptr */
+    unsigned int *sse                   /* out: sum of squared differences */
+)
+{
+    int xsum0, xsum1;
+    unsigned int xxsum0, xxsum1;
+
+    // note we could avoid these if statements if the calling function
+    // just called the appropriate functions inside.
+    if (xoffset == 4 && yoffset == 0)
+    {
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum0, &xxsum0);
+
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 16,
+            &xsum1, &xxsum1);
+
+        xsum0 += xsum1;
+        xxsum0 += xxsum1;
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum0, &xxsum0);
+
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 16,
+            &xsum1, &xxsum1);
+
+        xsum0 += xsum1;
+        xxsum0 += xxsum1;
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum0, &xxsum0);
+
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 16,
+            &xsum1, &xxsum1);
+
+        xsum0 += xsum1;
+        xxsum0 += xxsum1;
+    }
+    else
+    {
+        vp8_filter_block2d_bil_var_ssse3(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            xoffset, yoffset,
+            &xsum0, &xxsum0);
+    }
+    /* |sum| can reach 255*256, so sum*sum overflows int: multiply as unsigned. */
+    *sse = xxsum0;
+    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
+}
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h index 6bea15ebc..1e2fb3490 100644 --- a/vp8/encoder/x86/variance_x86.h +++ b/vp8/encoder/x86/variance_x86.h @@ -286,6 +286,7 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3); #if HAVE_SSSE3 extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3); extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3); +extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_variance_sad16x16x3 @@ -294,6 +295,9 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3); #undef vp8_variance_sad16x8x3 #define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3 +#undef 
vp8_variance_subpixvar16x16 +#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_ssse3 + #endif #endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index 61c603229..c7639a7e4 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -334,6 +334,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3; cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3; + cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3; + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3; } -- cgit v1.2.3 From 8432a1729f1ed7ce015e436d67976cbd90eb6f47 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Tue, 8 Mar 2011 15:22:07 -0500 Subject: Add zero offset checking in SSE2 sub-pixel filter function Skip filter at zero offset. Change-Id: I95fc7e211869bc0ab5bcfb7ab2e3259d1c0ccf38 --- vp8/encoder/x86/variance_impl_sse2.asm | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'vp8/encoder/x86') diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm index 6cdc47bc9..5d1a17d44 100644 --- a/vp8/encoder/x86/variance_impl_sse2.asm +++ b/vp8/encoder/x86/variance_impl_sse2.asm @@ -627,6 +627,10 @@ filter_block2d_bil_var_sse2_loop: filter_block2d_bil_var_sse2_sp_only: movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0 + je filter_block2d_bil_var_sse2_full_pixel + shl rdx, 5 lea rdx, [rdx + rcx] ; VFilter @@ -671,6 +675,35 @@ filter_block2d_bil_sp_only_loop: jmp filter_block2d_bil_variance +filter_block2d_bil_var_sse2_full_pixel: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + pxor xmm0, xmm0 ; + +filter_block2d_bil_full_pixel_loop: + movq xmm1, QWORD PTR [rsi] ; + punpcklbw 
xmm1, xmm0 ; + + movq xmm2, QWORD PTR [rdi] ; + punpcklbw xmm2, xmm0 ; + + psubw xmm1, xmm2 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz filter_block2d_bil_full_pixel_loop ; + + jmp filter_block2d_bil_variance + filter_block2d_bil_var_sse2_fp_only: mov rsi, arg(0) ;ref_ptr mov rdi, arg(2) ;src_ptr -- cgit v1.2.3