author    Yunqing Wang <yunqingwang@google.com>  2011-01-20 13:01:30 -0500
committer Yunqing Wang <yunqingwang@google.com>  2011-01-21 13:59:27 -0500
commit    0822a62f4051289fb3853c997b797ae3b6a006f5 (patch)
tree      236eb81f4d270266214fa86c9b1a8f29ab3d0930 /vp8
parent    06e7320c3e909c33e248b9910dc182b13451d1c8 (diff)
Modify sub-pixel filters to eliminate unnecessary calculations
In the sub-pixel variance calculation, xoffset and yoffset mostly take a few specific values. The sub-pixel filter functions were modified to branch on those values: a filter pass whose offset is 0 is skipped entirely, and the half-pel case (offset 4) is routed to the cheaper dedicated averaging routines. This improves performance.

Change-Id: I83083570af8b00ff65093467914fbb97a4e9ea21
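For context, the bilinear sub-pixel filter this patch specializes is a two-tap filter whose taps for an eighth-pel offset o are (128 - 16*o, 16*o). A minimal scalar sketch (names here are illustrative, not from the patch) shows why offsets 0 and 4 are special:

    /* Scalar model of the two-tap bilinear filter the SSE2 code vectorizes.
     * Taps sum to 128, so the result is rounded (+64) and shifted right
     * by 7, matching xmm_bi_rd and xmm_filter_shift in the assembly. */
    static unsigned char bilinear_tap(unsigned char a, unsigned char b,
                                      int offset)      /* offset in 0..7 */
    {
        int f0 = 128 - (offset << 4);    /* first tap  */
        int f1 = offset << 4;            /* second tap */
        return (unsigned char)((a * f0 + b * f1 + 64) >> 7);
    }
    /* offset == 0: f1 == 0 and the pass is an identity copy, so it can be
     * skipped; offset == 4: f0 == f1 == 64, a plain two-pixel rounded
     * average, which the dedicated half-pel variance routines compute
     * more cheaply than a general filter pass. */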
Diffstat (limited to 'vp8')
-rw-r--r--  vp8/encoder/x86/variance_impl_sse2.asm  181
-rw-r--r--  vp8/encoder/x86/variance_sse2.c          155
2 files changed, 254 insertions(+), 82 deletions(-)
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index cefa0a956..7178e7e31 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -493,8 +493,8 @@ sym(vp8_get8x8var_sse2):
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
-; unsigned short *HFilter,
-; unsigned short *VFilter,
+; int xoffset,
+; int yoffset,
; int *sum,
; unsigned int *sumsquared;;
;
@@ -504,68 +504,80 @@ sym(vp8_filter_block2d_bil_var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
- sub rsp, 16
+ push rbx
; end prolog
pxor xmm6, xmm6 ;
pxor xmm7, xmm7 ;
- mov rax, arg(5) ;HFilter ;
- mov rdx, arg(6) ;VFilter ;
- mov rsi, arg(0) ;ref_ptr ;
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
+ lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
+ movdqa xmm4, XMMWORD PTR [rsi]
+ lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)]
+ movsxd rax, dword ptr arg(5) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je filter_block2d_bil_var_sse2_sp_only
+
+ shl rax, 5 ; point to filter coeff with xoffset
+ lea rax, [rax + rcx] ; HFilter
+
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip second_pass filter if yoffset=0
+ je filter_block2d_bil_var_sse2_fp_only
+
+ shl rdx, 5
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
pxor xmm0, xmm0 ;
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm3, QWORD PTR [rsi+1] ;
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
-
- pmullw xmm1, [rax] ;
+ pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0
- ;
pmullw xmm3, [rax+16] ;
- paddw xmm1, xmm3 ;
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)] ;
- psraw xmm1, xmm_filter_shift ;
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
movdqa xmm5, xmm1
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
- add rsi, r8
+
+ movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
+ lea rsi, [rsi + rbx]
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
-filter_block2d_bil_var_sse2_loop:
+filter_block2d_bil_var_sse2_loop:
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
-
punpcklbw xmm3, xmm0 ;
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
- paddw xmm1, [GLOBAL(xmm_bi_rd)] ;
-
+ paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
- movdqa xmm3, xmm5 ;
+ movdqa xmm3, xmm5 ;
movdqa xmm5, xmm1 ;
- pmullw xmm3, [rdx] ;
+ pmullw xmm3, [rdx] ;
pmullw xmm1, [rdx+16] ;
paddw xmm1, xmm3 ;
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)] ;
+ paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
@@ -577,20 +589,103 @@ filter_block2d_bil_var_sse2_loop:
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
+ lea rsi, [rsi + rbx] ;ref_pixels_per_line
%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
- add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
- add rsi, r8
- add rdi, r9
+ lea rdi, [rdi + r9]
%endif
sub rcx, 1 ;
jnz filter_block2d_bil_var_sse2_loop ;
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_sp_only:
+ movsxd rdx, dword ptr arg(6) ; yoffset
+ shl rdx, 5
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+ movq xmm1, QWORD PTR [rsi] ;
+ punpcklbw xmm1, xmm0 ;
+
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+ lea rsi, [rsi + rax]
+
+filter_block2d_bil_sp_only_loop:
+ movq xmm3, QWORD PTR [rsi] ;
+ punpcklbw xmm3, xmm0 ;
+ movdqa xmm5, xmm3
+
+ pmullw xmm1, [rdx] ;
+ pmullw xmm3, [rdx+16] ;
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movq xmm3, QWORD PTR [rdi] ;
+ punpcklbw xmm3, xmm0 ;
+
+ psubw xmm1, xmm3 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+ movdqa xmm1, xmm5 ;
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_sp_only_loop ;
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_fp_only:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+
+filter_block2d_bil_fp_only_loop:
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm3, QWORD PTR [rsi+1] ;
+
+ punpcklbw xmm1, xmm0 ;
+ pmullw xmm1, [rax] ;
+ punpcklbw xmm3, xmm0 ;
+ pmullw xmm3, [rax+16] ;
+
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movq xmm3, QWORD PTR [rdi] ;
+ punpcklbw xmm3, xmm0 ;
+
+ psubw xmm1, xmm3 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+ lea rsi, [rsi + rdx]
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_fp_only_loop ;
+
+ jmp filter_block2d_bil_variance
+filter_block2d_bil_variance:
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
@@ -627,12 +722,12 @@ filter_block2d_bil_var_sse2_loop:
movd [rsi], mm2 ; xsum
movd [rdi], mm4 ; xxsum
-
; begin epilog
- add rsp, 16
+ pop rbx
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -974,3 +1069,13 @@ SECTION_RODATA
align 16
xmm_bi_rd:
times 8 dw 64
+align 16
+vp8_bilinear_filters_sse2:
+ dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
+ dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
+ dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
+ dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
+ dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
+ dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
+ dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
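Each row of this table holds the coefficients for one eighth-pel offset: eight copies of the first tap followed by eight copies of the second, i.e. 16 words = 32 bytes per row. That layout is what the `shl rax, 5` / `lea rax, [rax + rcx]` sequence above indexes. A hedged C view of the same address computation (the helper name is hypothetical, not from the patch):

    /* Row xoffset starts at (char *)vp8_bilinear_filters_sse2
     * + (xoffset << 5); the first 8 words are the broadcast first tap,
     * the next 8 words the broadcast second tap. */
    extern const short vp8_bilinear_filters_sse2[8][16];

    static const short *filter_row(int offset)    /* 1..7 in practice;  */
    {                                             /* offset 0 branches  */
        return vp8_bilinear_filters_sse2[offset]; /* away before lookup */
    }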
diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c
index 006e0a24a..6f79f0d23 100644
--- a/vp8/encoder/x86/variance_sse2.c
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -76,8 +76,8 @@ void vp8_filter_block2d_bil_var_sse2
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
- const short *HFilter,
- const short *VFilter,
+ int xoffset,
+ int yoffset,
int *sum,
unsigned int *sumsquared
);
@@ -222,21 +222,6 @@ unsigned int vp8_variance8x16_wmt
}
-///////////////////////////////////////////////////////////////////////////
-// the mmx function that does the bilinear filtering and var calculation //
-// int one pass //
-///////////////////////////////////////////////////////////////////////////
-DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) =
-{
- { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
- { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
- { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
- { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
- { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
- { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
- { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
- { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
-};
unsigned int vp8_sub_pixel_variance4x4_wmt
(
const unsigned char *src_ptr,
@@ -272,15 +257,38 @@ unsigned int vp8_sub_pixel_variance8x8_wmt
unsigned int *sse
)
{
-
int xsum;
unsigned int xxsum;
- vp8_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
- &xsum, &xxsum
- );
+
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum, &xxsum);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum, &xxsum);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum, &xxsum);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum, &xxsum);
+ }
*sse = xxsum;
return (xxsum - ((xsum * xsum) >> 6));
@@ -344,7 +352,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
vp8_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
- vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ xoffset, yoffset,
&xsum0, &xxsum0
);
@@ -352,7 +360,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
vp8_filter_block2d_bil_var_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
- vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ xoffset, yoffset,
&xsum1, &xxsum1
);
}
@@ -392,21 +400,56 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
- vp8_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
- &xsum0, &xxsum0
- );
- vp8_filter_block2d_bil_var_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
- &xsum1, &xxsum1
- );
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ &xsum1, &xxsum1);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ &xsum1, &xxsum1);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ &xsum1, &xxsum1);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum1, &xxsum1);
+ }
xsum0 += xsum1;
xxsum0 += xxsum1;
@@ -428,12 +471,36 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
{
int xsum;
unsigned int xxsum;
- vp8_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
- &xsum, &xxsum
- );
+
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum, &xxsum);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum, &xxsum);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum, &xxsum);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum, &xxsum);
+ }
*sse = xxsum;
return (xxsum - ((xsum * xsum) >> 7));
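The closing returns use the standard variance identity, var = SSE - sum^2 / N, with the pixel count N folded into the shift: >> 6 for the 64-pixel 8x8 block and >> 7 for the 128-pixel 8x16 block. A small self-contained check (illustrative code, not part of the patch):

    /* var = sum of squared differences minus (sum of differences)^2 / N,
     * where N = 1 << log2_pixels (6 for 8x8, 7 for 8x16 above). */
    #include <assert.h>

    static unsigned int block_variance(int xsum, unsigned int xxsum,
                                       int log2_pixels)
    {
        return xxsum - (unsigned int)((xsum * xsum) >> log2_pixels);
    }

    int main(void)
    {
        /* a uniform difference of 2 across 64 pixels has zero variance */
        assert(block_variance(2 * 64, 4 * 64, 6) == 0);
        return 0;
    }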