diff options
author | Johann <johannkoenig@google.com> | 2015-06-05 09:54:19 -0700 |
---|---|---|
committer | Johann <johannkoenig@google.com> | 2015-07-07 15:51:04 -0700 |
commit | 6a82f0d7fb9ee908c389e8d55444bbaed3d54e9c (patch) | |
tree | 5b346f932d7256defc451958f474a33cd8b51205 /vpx_dsp/x86/variance_impl_mmx.asm | |
parent | 155b9416b36d9708b18f22ef2bc396fba264f513 (diff) | |
download | libvpx-6a82f0d7fb9ee908c389e8d55444bbaed3d54e9c.tar libvpx-6a82f0d7fb9ee908c389e8d55444bbaed3d54e9c.tar.gz libvpx-6a82f0d7fb9ee908c389e8d55444bbaed3d54e9c.tar.bz2 libvpx-6a82f0d7fb9ee908c389e8d55444bbaed3d54e9c.zip |
Move sub pixel variance to vpx_dsp
Change-Id: I66bf6720c396c89aa2d1fd26d5d52bf5d5e3dff1
Diffstat (limited to 'vpx_dsp/x86/variance_impl_mmx.asm')
-rw-r--r-- | vpx_dsp/x86/variance_impl_mmx.asm | 342 |
1 files changed, 331 insertions, 11 deletions
diff --git a/vpx_dsp/x86/variance_impl_mmx.asm b/vpx_dsp/x86/variance_impl_mmx.asm index a8d7d99db..b8ba79b65 100644 --- a/vpx_dsp/x86/variance_impl_mmx.asm +++ b/vpx_dsp/x86/variance_impl_mmx.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +%define mmx_filter_shift 7 + ;unsigned int vpx_get_mb_ss_mmx( short *src_ptr ) global sym(vpx_get_mb_ss_mmx) PRIVATE sym(vpx_get_mb_ss_mmx): @@ -52,7 +54,6 @@ sym(vpx_get_mb_ss_mmx): movsxd rcx, dword ptr [rsp+4] add rax, rcx - ; begin epilog add rsp, 8 pop rdi @@ -62,7 +63,6 @@ sym(vpx_get_mb_ss_mmx): pop rbp ret - ;void vpx_get8x8var_mmx ;( ; unsigned char *src_ptr, @@ -83,7 +83,6 @@ sym(vpx_get8x8var_mmx): sub rsp, 16 ; end prolog - pxor mm5, mm5 ; Blank mmx6 pxor mm6, mm6 ; Blank mmx7 pxor mm7, mm7 ; Blank mmx7 @@ -117,7 +116,6 @@ sym(vpx_get8x8var_mmx): paddd mm7, mm0 ; accumulate in mm7 paddd mm7, mm2 ; accumulate in mm7 - ; Row 2 movq mm0, [rax] ; Copy eight bytes to mm0 movq mm2, mm0 ; Take copies @@ -298,7 +296,6 @@ sym(vpx_get8x8var_mmx): mov dword ptr [rdi], edx xor rax, rax ; return 0 - ; begin epilog add rsp, 16 pop rbx @@ -308,8 +305,6 @@ sym(vpx_get8x8var_mmx): pop rbp ret - - ;void ;vpx_get4x4var_mmx ;( @@ -331,7 +326,6 @@ sym(vpx_get4x4var_mmx): sub rsp, 16 ; end prolog - pxor mm5, mm5 ; Blank mmx6 pxor mm6, mm6 ; Blank mmx7 pxor mm7, mm7 ; Blank mmx7 @@ -354,7 +348,6 @@ sym(vpx_get4x4var_mmx): movd mm1, [rbx] ; Copy four bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 - ; Row 2 movd mm0, [rax] ; Copy four bytes to mm0 punpcklbw mm0, mm6 ; unpack to higher prrcision @@ -393,7 +386,6 @@ sym(vpx_get4x4var_mmx): pmaddwd mm0, mm0 ; square and accumulate paddd mm7, mm0 ; accumulate in mm7 - ; Now accumulate the final results. 
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory @@ -413,7 +405,6 @@ sym(vpx_get4x4var_mmx): mov dword ptr [rdi], edx xor rax, rax ; return 0 - ; begin epilog add rsp, 16 pop rbx @@ -422,3 +413,332 @@ sym(vpx_get4x4var_mmx): UNSHADOW_ARGS pop rbp ret + +;void vpx_filter_block2d_bil4x4_var_mmx +;( +; unsigned char *ref_ptr, ; rows that get filtered, loaded through rsi +; int ref_pixels_per_line, ; added to rsi after every row +; unsigned char *src_ptr, ; rows subtracted from the filtered result, loaded through rdi +; int src_pixels_per_line, ; added to rdi after every row +; unsigned short *HFilter, ; words at [HFilter] scale each pixel, words at [HFilter+8] scale the pixel one byte to its right +; unsigned short *VFilter, ; words at [VFilter] scale the previous filtered row, words at [VFilter+8] scale the current one +; int *sum, ; receives the 32-bit total of the differences +; unsigned int *sumsquared ; receives the 32-bit total of the squared differences +;) +global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE +sym(vpx_filter_block2d_bil4x4_var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + pxor mm6, mm6 ; mm6 = four 16-bit running sums of the differences + pxor mm7, mm7 ; mm7 = two 32-bit running sums of the squared differences + + mov rax, arg(4) ;HFilter ; + mov rdx, arg(5) ;VFilter ; + + mov rsi, arg(0) ;ref_ptr ; + mov rdi, arg(2) ;src_ptr ; + + mov rcx, 4 ; rcx = rows left to process + pxor mm0, mm0 ; mm0 = 0, supplies the high bytes when widening pixels to words + + movd mm1, [rsi] ; first row: four ref pixels + movd mm3, [rsi+1] ; first row: the four pixels one byte to the right + + punpcklbw mm1, mm0 ; bytes to words + pmullw mm1, [rax] ; scale by the first group of HFilter words + + punpcklbw mm3, mm0 ; bytes to words + pmullw mm3, [rax+8] ; scale by the second group of HFilter words + + paddw mm1, mm3 ; sum of the two horizontal products + paddw mm1, [GLOBAL(mmx_bi_rd)] ; add the rounding constant (64 in every word) + + psraw mm1, mmx_filter_shift ; shift right by mmx_filter_shift (7) + movq mm5, mm1 ; mm5 = horizontally filtered first row + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; + add rsi, r8 +%endif + +.filter_block2d_bil4x4_var_mmx_loop: + + movd mm1, [rsi] ; next ref row: four pixels + movd mm3, [rsi+1] ; and the four pixels one byte to the right + + punpcklbw mm1, mm0 ; + pmullw mm1, [rax] ; + + punpcklbw mm3, mm0 ; + pmullw mm3, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; mm1 = horizontally filtered current row + movq mm3, mm5 ; mm3 = horizontally filtered previous row + + movq mm5, mm1 ; keep the current row for the next iteration + pmullw mm3, [rdx] ; previous row times the first group of VFilter words + + pmullw mm1, [rdx+8] ; current row times the second group of VFilter words + paddw mm1, mm3 ; sum of the two vertical products + + paddw mm1, [GLOBAL(mmx_bi_rd)] ; round + psraw mm1, mmx_filter_shift ; mm1 = filtered ref pixels as words + + movd mm3, [rdi] ; four src pixels + punpcklbw mm3, mm0 ; bytes to words + + psubw mm1, mm3 ; difference = filtered ref minus src + paddw mm6, mm1 ; accumulate the differences + + pmaddwd mm1, mm1 ; square the differences, adding neighbouring pairs into dwords + paddd mm7, mm1 ; accumulate the squared differences + +%if 
ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; one row done + jnz .filter_block2d_bil4x4_var_mmx_loop ; repeat until rcx reaches zero + + pxor mm3, mm3 ; mm2 and mm3 start at zero so the unpacks below leave zeros in the low half of each dword + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; low two word sums, each placed in the top half of a dword + punpckhwd mm3, mm6 ; high two word sums, each placed in the top half of a dword + + paddd mm2, mm3 ; two partial totals + movq mm6, mm2 ; + + psrlq mm6, 32 ; bring the upper partial total down to the low dword + paddd mm2, mm6 ; low dword = total of all four word sums, still in the top 16 bits + + psrad mm2, 16 ; arithmetic shift gives the sign-extended 32-bit sum + movq mm4, mm7 ; + + psrlq mm4, 32 ; upper squared-sum dword + paddd mm4, mm7 ; low dword = total of both squared-sum dwords + + mov rdi, arg(6) ;sum + mov rsi, arg(7) ;sumsquared + + movd dword ptr [rdi], mm2 ; store *sum + movd dword ptr [rsi], mm4 ; store *sumsquared + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vpx_filter_block2d_bil_var_mmx +;( +; unsigned char *ref_ptr, ; rows that get filtered, eight pixels per row, loaded through rsi +; int ref_pixels_per_line, ; added to rsi after every row +; unsigned char *src_ptr, ; rows subtracted from the filtered result, loaded through rdi +; int src_pixels_per_line, ; added to rdi after every row +; unsigned int Height, ; loaded into rcx as the row counter of the loop +; unsigned short *HFilter, ; words at [HFilter] scale each pixel, words at [HFilter+8] scale the pixel one byte to its right +; unsigned short *VFilter, ; words at [VFilter] scale the previous filtered row, words at [VFilter+8] scale the current one +; int *sum, ; receives the 32-bit total of the differences +; unsigned int *sumsquared ; receives the 32-bit total of the squared differences +;) +global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE +sym(vpx_filter_block2d_bil_var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + pxor mm6, mm6 ; mm6 = four 16-bit running sums of the differences + pxor mm7, mm7 ; mm7 = two 32-bit running sums of the squared differences + mov rax, arg(5) ;HFilter ; + + mov rdx, arg(6) ;VFilter ; + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + + pxor mm0, mm0 ; mm0 = 0, supplies the high bytes when widening pixels to words + movq mm1, [rsi] ; first row: eight ref pixels + + movq mm3, [rsi+1] ; first row: the eight pixels one byte to the right + movq mm2, mm1 ; copy, used for the high four pixels + + movq mm4, mm3 ; copy, used for the high four pixels + punpcklbw mm1, mm0 ; low four pixels as words + + punpckhbw mm2, mm0 ; high four pixels as words + pmullw mm1, [rax] ; + + pmullw mm2, [rax] ; + punpcklbw mm3, mm0 ; + + punpckhbw mm4, mm0 ; + pmullw mm3, [rax+8] ; + + pmullw mm4, [rax+8] ; + paddw mm1, mm3 ; low half: sum of the two horizontal products + + paddw mm2, mm4 ; high half: sum of the two horizontal products + paddw mm1, [GLOBAL(mmx_bi_rd)] ; round + + psraw mm1, mmx_filter_shift ; + paddw mm2, [GLOBAL(mmx_bi_rd)] ; round + + psraw mm2, mmx_filter_shift ; + movq mm5, mm1 + + packuswb mm5, mm2 ; mm5 = filtered first row packed to eight bytes with unsigned saturation +%if ABI_IS_32BIT + add 
rsi, dword ptr arg(1) ;ref_pixels_per_line +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + add rsi, r8 +%endif + +.filter_block2d_bil_var_mmx_loop: + + movq mm1, [rsi] ; next ref row: eight pixels + movq mm3, [rsi+1] ; and the eight pixels one byte to the right + + movq mm2, mm1 ; + movq mm4, mm3 ; + + punpcklbw mm1, mm0 ; + punpckhbw mm2, mm0 ; + + pmullw mm1, [rax] ; + pmullw mm2, [rax] ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + pmullw mm3, [rax+8] ; + pmullw mm4, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm2, mm4 ; + + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + psraw mm1, mmx_filter_shift ; mm1 = horizontally filtered current row, low four pixels + + paddw mm2, [GLOBAL(mmx_bi_rd)] ; + psraw mm2, mmx_filter_shift ; mm2 = horizontally filtered current row, high four pixels + + movq mm3, mm5 ; previous filtered row, still packed as bytes + movq mm4, mm5 ; + + punpcklbw mm3, mm0 ; previous row, low four pixels as words + punpckhbw mm4, mm0 ; previous row, high four pixels as words + + movq mm5, mm1 ; + packuswb mm5, mm2 ; mm5 = current filtered row packed to bytes for the next iteration + + pmullw mm3, [rdx] ; previous row times the first group of VFilter words + pmullw mm4, [rdx] ; + + pmullw mm1, [rdx+8] ; current row times the second group of VFilter words + pmullw mm2, [rdx+8] ; + + paddw mm1, mm3 ; sum of the two vertical products + paddw mm2, mm4 ; + + paddw mm1, [GLOBAL(mmx_bi_rd)] ; round + paddw mm2, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; mm1 and mm2 = filtered ref pixels as words + psraw mm2, mmx_filter_shift ; + + movq mm3, [rdi] ; eight src pixels + movq mm4, mm3 ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + psubw mm1, mm3 ; difference = filtered ref minus src, low four pixels + psubw mm2, mm4 ; same for the high four pixels + + paddw mm6, mm1 ; accumulate the differences + pmaddwd mm1, mm1 ; square the differences, adding neighbouring pairs into dwords + + paddw mm6, mm2 ; + pmaddwd mm2, mm2 ; + + paddd mm7, mm1 ; accumulate the squared differences + paddd mm7, mm2 ; + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; + movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; one row done + jnz .filter_block2d_bil_var_mmx_loop ; repeat until rcx reaches zero + + pxor mm3, mm3 ; mm2 and mm3 start at zero so the unpacks below leave zeros in the low half of each dword + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; low two word sums, each placed in the top half of a dword + punpckhwd mm3, mm6 ; high two word sums, each placed in the top half of a dword + + paddd mm2, mm3 ; two partial totals + movq mm6, mm2 ; + + psrlq mm6, 32 ; bring the upper partial total down to the low dword + paddd mm2, mm6 ; low dword = total of all four word sums, still in the top 16 bits + + psrad mm2, 16 ; arithmetic shift gives the sign-extended 32-bit sum + movq mm4, mm7 ; + + psrlq mm4, 32 ; upper squared-sum dword + paddd mm4, mm7 ; low dword = total of both squared-sum dwords + + mov rdi, arg(7) ;sum + mov rsi, arg(8) ;sumsquared + + movd dword ptr [rdi], mm2 ; store *sum + movd dword ptr [rsi], mm4 ; store *sumsquared + + ; begin epilog + add rsp, 16 + pop rdi + pop 
rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +;short mmx_bi_rd[4] = { 64, 64, 64, 64}; rounding constant added before every psraw by mmx_filter_shift +align 16 +mmx_bi_rd: + times 4 dw 64 |