diff options
author | John Koleszar <jkoleszar@google.com> | 2011-01-22 00:05:13 -0500 |
---|---|---|
committer | John Koleszar <jkoleszar@google.com> | 2011-01-22 00:05:13 -0500 |
commit | d4797aa8fd200413b61af63006fac847b3e0ca02 (patch) | |
tree | d0abcc9923df6930c19a94b1eef4f08d44845c46 /vp8/encoder/x86/variance_impl_sse2.asm | |
parent | cbe07a749120a48e827c881cbdf9a649a32e25b8 (diff) | |
parent | d3e9409bb07e6411ff867935883bd5d56d2f9041 (diff) | |
download | libvpx-d4797aa8fd200413b61af63006fac847b3e0ca02.tar libvpx-d4797aa8fd200413b61af63006fac847b3e0ca02.tar.gz libvpx-d4797aa8fd200413b61af63006fac847b3e0ca02.tar.bz2 libvpx-d4797aa8fd200413b61af63006fac847b3e0ca02.zip |
Merge remote branch 'origin/master' into experimental
Change-Id: I2c3326f7e4d9e901f098e499973586e973e1b8fb
Diffstat (limited to 'vp8/encoder/x86/variance_impl_sse2.asm')
-rw-r--r-- | vp8/encoder/x86/variance_impl_sse2.asm | 181 |
1 files changed, 143 insertions, 38 deletions
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm index cefa0a956..7178e7e31 100644 --- a/vp8/encoder/x86/variance_impl_sse2.asm +++ b/vp8/encoder/x86/variance_impl_sse2.asm @@ -493,8 +493,8 @@ sym(vp8_get8x8var_sse2): ; unsigned char *src_ptr, ; int src_pixels_per_line, ; unsigned int Height, -; unsigned short *HFilter, -; unsigned short *VFilter, +; int xoffset, +; int yoffset, ; int *sum, ; unsigned int *sumsquared;; ; @@ -504,68 +504,80 @@ sym(vp8_filter_block2d_bil_var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 + SAVE_XMM GET_GOT rbx push rsi push rdi - sub rsp, 16 + push rbx ; end prolog pxor xmm6, xmm6 ; pxor xmm7, xmm7 ; - mov rax, arg(5) ;HFilter ; - mov rdx, arg(6) ;VFilter ; - mov rsi, arg(0) ;ref_ptr ; + lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding + movdqa xmm4, XMMWORD PTR [rsi] - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; + lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)] + movsxd rax, dword ptr arg(5) ; xoffset + + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je filter_block2d_bil_var_sse2_sp_only + + shl rax, 5 ; point to filter coeff with xoffset + lea rax, [rax + rcx] ; HFilter + + movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; skip second_pass filter if yoffset=0 + je filter_block2d_bil_var_sse2_fp_only + + shl rdx, 5 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height pxor xmm0, xmm0 ; - movq xmm1, QWORD PTR [rsi] ; + movq xmm1, QWORD PTR [rsi] ; + movq xmm3, QWORD PTR [rsi+1] ; - movq xmm3, QWORD PTR [rsi+1] ; punpcklbw xmm1, xmm0 ; - - pmullw xmm1, [rax] ; + pmullw xmm1, [rax] ; punpcklbw xmm3, xmm0 - ; pmullw xmm3, [rax+16] ; - paddw xmm1, xmm3 ; - - paddw xmm1, [GLOBAL(xmm_bi_rd)] ; - psraw xmm1, xmm_filter_shift ; + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; movdqa xmm5, xmm1 -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - add rsi, r8 + + movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line + lea rsi, [rsi + rbx] +%if ABI_IS_32BIT=0 + movsxd r9, dword ptr arg(3) ;src_pixels_per_line %endif -filter_block2d_bil_var_sse2_loop: +filter_block2d_bil_var_sse2_loop: movq xmm1, QWORD PTR [rsi] ; movq xmm3, QWORD PTR [rsi+1] ; punpcklbw xmm1, xmm0 ; pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 ; pmullw xmm3, [rax+16] ; paddw xmm1, xmm3 ; - paddw xmm1, [GLOBAL(xmm_bi_rd)] ; - + paddw xmm1, xmm4 ; psraw xmm1, xmm_filter_shift ; - movdqa xmm3, xmm5 ; + movdqa xmm3, xmm5 ; movdqa xmm5, xmm1 ; - pmullw xmm3, [rdx] ; + pmullw xmm3, [rdx] ; pmullw xmm1, [rdx+16] ; paddw xmm1, xmm3 ; - - paddw xmm1, [GLOBAL(xmm_bi_rd)] ; + paddw xmm1, xmm4 ; psraw xmm1, xmm_filter_shift ; movq xmm3, QWORD PTR [rdi] ; @@ -577,20 +589,103 @@ filter_block2d_bil_var_sse2_loop: pmaddwd xmm1, xmm1 ; paddd xmm7, xmm1 ; + lea rsi, [rsi + rbx] ;ref_pixels_per_line %if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; - add rdi, dword ptr arg(3) ;src_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line %else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; - add rsi, r8 - add rdi, r9 + lea rdi, [rdi + r9] %endif sub rcx, 1 ; jnz filter_block2d_bil_var_sse2_loop ; + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_sse2_sp_only: + movsxd rdx, dword ptr arg(6) ; yoffset + shl rdx, 5 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; + movq xmm1, QWORD PTR [rsi] ; + punpcklbw xmm1, xmm0 ; + + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + lea rsi, [rsi + rax] + +filter_block2d_bil_sp_only_loop: + movq xmm3, QWORD PTR [rsi] ; + punpcklbw xmm3, xmm0 ; + movdqa xmm5, xmm3 + + pmullw xmm1, [rdx] ; + pmullw xmm3, [rdx+16] ; + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movq xmm3, QWORD PTR [rdi] ; + punpcklbw xmm3, xmm0 ; + + psubw xmm1, xmm3 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + + movdqa xmm1, xmm5 ; + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz filter_block2d_bil_sp_only_loop ; + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_sse2_fp_only: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + +filter_block2d_bil_fp_only_loop: + movq xmm1, QWORD PTR [rsi] ; + movq xmm3, QWORD PTR [rsi+1] ; + + punpcklbw xmm1, xmm0 ; + pmullw xmm1, [rax] ; + punpcklbw xmm3, xmm0 ; + pmullw xmm3, [rax+16] ; + + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movq xmm3, QWORD PTR [rdi] ; + punpcklbw xmm3, xmm0 ; + + psubw xmm1, xmm3 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + lea rsi, [rsi + rdx] + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz filter_block2d_bil_fp_only_loop ; + + jmp filter_block2d_bil_variance +filter_block2d_bil_variance: movdq2q mm6, xmm6 ; movdq2q mm7, xmm7 ; @@ -627,12 +722,12 @@ filter_block2d_bil_var_sse2_loop: movd [rsi], mm2 ; xsum movd [rdi], mm4 ; xxsum - ; begin epilog - add rsp, 16 + pop rbx pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -974,3 +1069,13 @@ SECTION_RODATA align 16 xmm_bi_rd: times 8 dw 64 +align 16 +vp8_bilinear_filters_sse2: + dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 + dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 + dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 + dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 + dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 + dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 + dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 |