summary | refs | log | tree | commit | diff
path: root/vp8/encoder/x86/variance_impl_sse2.asm
diff options
context:
space:
mode:
author John Koleszar <jkoleszar@google.com> 2011-01-22 00:05:13 -0500
committer John Koleszar <jkoleszar@google.com> 2011-01-22 00:05:13 -0500
commit d4797aa8fd200413b61af63006fac847b3e0ca02 (patch)
tree d0abcc9923df6930c19a94b1eef4f08d44845c46 /vp8/encoder/x86/variance_impl_sse2.asm
parent cbe07a749120a48e827c881cbdf9a649a32e25b8 (diff)
parent d3e9409bb07e6411ff867935883bd5d56d2f9041 (diff)
download libvpx-d4797aa8fd200413b61af63006fac847b3e0ca02.tar
libvpx-d4797aa8fd200413b61af63006fac847b3e0ca02.tar.gz
libvpx-d4797aa8fd200413b61af63006fac847b3e0ca02.tar.bz2
libvpx-d4797aa8fd200413b61af63006fac847b3e0ca02.zip
Merge remote branch 'origin/master' into experimental
Change-Id: I2c3326f7e4d9e901f098e499973586e973e1b8fb
Diffstat (limited to 'vp8/encoder/x86/variance_impl_sse2.asm')
-rw-r--r-- vp8/encoder/x86/variance_impl_sse2.asm 181
1 files changed, 143 insertions, 38 deletions
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index cefa0a956..7178e7e31 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -493,8 +493,8 @@ sym(vp8_get8x8var_sse2):
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
-; unsigned short *HFilter,
-; unsigned short *VFilter,
+; int xoffset,
+; int yoffset,
; int *sum,
; unsigned int *sumsquared;;
;
@@ -504,68 +504,80 @@ sym(vp8_filter_block2d_bil_var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
- sub rsp, 16
+ push rbx
; end prolog
pxor xmm6, xmm6 ;
pxor xmm7, xmm7 ;
- mov rax, arg(5) ;HFilter ;
- mov rdx, arg(6) ;VFilter ;
- mov rsi, arg(0) ;ref_ptr ;
+ lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
+ movdqa xmm4, XMMWORD PTR [rsi]
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
+ lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)]
+ movsxd rax, dword ptr arg(5) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je filter_block2d_bil_var_sse2_sp_only
+
+ shl rax, 5 ; point to filter coeff with xoffset
+ lea rax, [rax + rcx] ; HFilter
+
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip second_pass filter if yoffset=0
+ je filter_block2d_bil_var_sse2_fp_only
+
+ shl rdx, 5
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
pxor xmm0, xmm0 ;
- movq xmm1, QWORD PTR [rsi] ;
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm3, QWORD PTR [rsi+1] ;
- movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
-
- pmullw xmm1, [rax] ;
+ pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0
- ;
pmullw xmm3, [rax+16] ;
- paddw xmm1, xmm3 ;
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)] ;
- psraw xmm1, xmm_filter_shift ;
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
movdqa xmm5, xmm1
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
- add rsi, r8
+
+ movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
+ lea rsi, [rsi + rbx]
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
-filter_block2d_bil_var_sse2_loop:
+filter_block2d_bil_var_sse2_loop:
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
-
punpcklbw xmm3, xmm0 ;
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
- paddw xmm1, [GLOBAL(xmm_bi_rd)] ;
-
+ paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
- movdqa xmm3, xmm5 ;
+ movdqa xmm3, xmm5 ;
movdqa xmm5, xmm1 ;
- pmullw xmm3, [rdx] ;
+ pmullw xmm3, [rdx] ;
pmullw xmm1, [rdx+16] ;
paddw xmm1, xmm3 ;
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)] ;
+ paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
@@ -577,20 +589,103 @@ filter_block2d_bil_var_sse2_loop:
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
+ lea rsi, [rsi + rbx] ;ref_pixels_per_line
%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
- add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
- add rsi, r8
- add rdi, r9
+ lea rdi, [rdi + r9]
%endif
sub rcx, 1 ;
jnz filter_block2d_bil_var_sse2_loop ;
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_sp_only: ; reached when xoffset == 0: vertical (second-pass) filter only; on entry rcx = filter table base, xmm4 = rounding, xmm6/xmm7 = 0
+ movsxd rdx, dword ptr arg(6) ; yoffset
+ shl rdx, 5 ; yoffset * 32 bytes (one table row is 16 words)
+ lea rdx, [rdx + rcx] ; VFilter = table base + row offset
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height (row counter; rcx no longer holds the table base)
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
+ movq xmm1, QWORD PTR [rsi] ; xmm1 = first ref row (8 pixels)
+ punpcklbw xmm1, xmm0 ; widen to 8 words
+
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line (rbx was pushed in the prolog)
+ lea rsi, [rsi + rax] ; rsi -> second ref row
+
+filter_block2d_bil_sp_only_loop: ; invariant: xmm1 = current ref row as words, rsi -> next ref row
+ movq xmm3, QWORD PTR [rsi] ; xmm3 = next ref row (8 pixels)
+ punpcklbw xmm3, xmm0 ; widen to 8 words
+ movdqa xmm5, xmm3 ; keep the unfiltered next row for the following iteration
+
+ pmullw xmm1, [rdx] ; current row * first vertical tap
+ pmullw xmm3, [rdx+16] ; next row * second vertical tap
+ paddw xmm1, xmm3 ; weighted sum of the two rows
+ paddw xmm1, xmm4 ; add rounding constant loaded from xmm_bi_rd
+ psraw xmm1, xmm_filter_shift ; scale back down; xmm_filter_shift is defined outside this view
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = src row (8 pixels)
+ punpcklbw xmm3, xmm0 ; widen to 8 words
+
+ psubw xmm1, xmm3 ; diff = filtered ref - src
+ paddw xmm6, xmm1 ; xmm6 += diff (8 word lanes)
+
+ pmaddwd xmm1, xmm1 ; square each diff and add adjacent pairs -> 4 dwords
+ paddd xmm7, xmm1 ; xmm7 += squared diffs
+
+ movdqa xmm1, xmm5 ; next row becomes the current row
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ; one fewer row remaining
+ jnz filter_block2d_bil_sp_only_loop ; loop until Height rows are processed
+
+ jmp filter_block2d_bil_variance ; continue at the tail shared by all three filter paths
+
+filter_block2d_bil_var_sse2_fp_only: ; reached when xoffset != 0 and yoffset == 0: horizontal (first-pass) filter only; on entry rax = HFilter, xmm4 = rounding, xmm6/xmm7 = 0
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height (row counter; rcx no longer holds the table base)
+ movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line (replaces yoffset, which is 0 on this path)
+
+ pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line (rbx was pushed in the prolog)
+
+filter_block2d_bil_fp_only_loop: ; one iteration per row; rsi -> ref row, rdi -> src row
+ movq xmm1, QWORD PTR [rsi] ; ref bytes 0..7 of this row
+ movq xmm3, QWORD PTR [rsi+1] ; ref bytes 1..8 of this row (right-hand neighbours)
+
+ punpcklbw xmm1, xmm0 ; widen to 8 words
+ pmullw xmm1, [rax] ; pixels * first horizontal tap
+ punpcklbw xmm3, xmm0 ; widen to 8 words
+ pmullw xmm3, [rax+16] ; right neighbours * second horizontal tap
+
+ paddw xmm1, xmm3 ; weighted sum of the two taps
+ paddw xmm1, xmm4 ; add rounding constant loaded from xmm_bi_rd
+ psraw xmm1, xmm_filter_shift ; scale back down; xmm_filter_shift is defined outside this view
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = src row (8 pixels)
+ punpcklbw xmm3, xmm0 ; widen to 8 words
+
+ psubw xmm1, xmm3 ; diff = filtered ref - src
+ paddw xmm6, xmm1 ; xmm6 += diff (8 word lanes)
+
+ pmaddwd xmm1, xmm1 ; square each diff and add adjacent pairs -> 4 dwords
+ paddd xmm7, xmm1 ; xmm7 += squared diffs
+ lea rsi, [rsi + rdx] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ; one fewer row remaining
+ jnz filter_block2d_bil_fp_only_loop ; loop until Height rows are processed
+
+ jmp filter_block2d_bil_variance ; NOTE(review): the target label is the very next line, so this jump looks redundant
+filter_block2d_bil_variance:
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
@@ -627,12 +722,12 @@ filter_block2d_bil_var_sse2_loop:
movd [rsi], mm2 ; xsum
movd [rdi], mm4 ; xxsum
-
; begin epilog
- add rsp, 16
+ pop rbx
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -974,3 +1069,13 @@ SECTION_RODATA
align 16
xmm_bi_rd:
times 8 dw 64
+align 16 ; rows are used directly as pmullw memory operands ([rax], [rax+16], [rdx], [rdx+16])
+vp8_bilinear_filters_sse2: ; 8 rows of 32 bytes, indexed by offset << 5; each row = first tap x8 words, then second tap x8 words; taps sum to 128
+ dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 ; offset 0: taps 128, 0
+ dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 ; offset 1: taps 112, 16
+ dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 ; offset 2: taps 96, 32
+ dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 ; offset 3: taps 80, 48
+ dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 ; offset 4: taps 64, 64
+ dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 ; offset 5: taps 48, 80
+ dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 ; offset 6: taps 32, 96
+ dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 ; offset 7: taps 16, 112