diff options
author | Johann <johann.koenig@duck.com> | 2018-10-25 13:37:50 -0700 |
---|---|---|
committer | Johann <johann.koenig@duck.com> | 2018-10-29 18:53:32 -0700 |
commit | c176e6490403076105faa2a07f275d31ec61d2a3 (patch) | |
tree | 977a36d51dcc5dd024d7fb3626bdef9d47c03eb6 /vpx_dsp/x86/deblock_sse2.asm | |
parent | fa0076282e62f649483bde868602aab86448a661 (diff) | |
download | libvpx-c176e6490403076105faa2a07f275d31ec61d2a3.tar libvpx-c176e6490403076105faa2a07f275d31ec61d2a3.tar.gz libvpx-c176e6490403076105faa2a07f275d31ec61d2a3.tar.bz2 libvpx-c176e6490403076105faa2a07f275d31ec61d2a3.zip |
vpx postproc: rewrite in intrinsics
About 10% faster on 64-bit but about 10% slower on 32-bit.
Removes the assembly usage of vpx_rv.
Change-Id: I214698fb5677f615dee0a8f5f5bb8f64daf2565e
Diffstat (limited to 'vpx_dsp/x86/deblock_sse2.asm')
-rw-r--r-- | vpx_dsp/x86/deblock_sse2.asm | 231 |
1 files changed, 0 insertions, 231 deletions
diff --git a/vpx_dsp/x86/deblock_sse2.asm b/vpx_dsp/x86/deblock_sse2.asm index 97cb43b67..9d8e5e3e0 100644 --- a/vpx_dsp/x86/deblock_sse2.asm +++ b/vpx_dsp/x86/deblock_sse2.asm @@ -232,237 +232,6 @@ sym(vpx_post_proc_down_and_across_mb_row_sse2): ret %undef flimit -;void vpx_mbpost_proc_down_sse2(unsigned char *dst, -; int pitch, int rows, int cols,int flimit) -extern sym(vpx_rv) -global sym(vpx_mbpost_proc_down_sse2) PRIVATE -sym(vpx_mbpost_proc_down_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 128+16 - - ; unsigned char d[16][8] at [rsp] - ; create flimit2 at [rsp+128] - mov eax, dword ptr arg(4) ;flimit - mov [rsp+128], eax - mov [rsp+128+4], eax - mov [rsp+128+8], eax - mov [rsp+128+12], eax -%define flimit4 [rsp+128] - -%if ABI_IS_32BIT=0 - lea r8, [GLOBAL(sym(vpx_rv))] -%endif - - ;rows +=8; - add dword arg(2), 8 - - ;for(c=0; c<cols; c+=8) -.loop_col: - mov rsi, arg(0) ; s - pxor xmm0, xmm0 ; - - movsxd rax, dword ptr arg(1) ;pitch ; - - ; this copies the last row down into the border 8 rows - mov rdi, rsi - mov rdx, arg(2) - sub rdx, 9 - imul rdx, rax - lea rdi, [rdi+rdx] - movq xmm1, QWORD ptr[rdi] ; first row - mov rcx, 8 -.init_borderd: ; initialize borders - lea rdi, [rdi + rax] - movq [rdi], xmm1 - - dec rcx - jne .init_borderd - - neg rax ; rax = -pitch - - ; this copies the first row up into the border 8 rows - mov rdi, rsi - movq xmm1, QWORD ptr[rdi] ; first row - mov rcx, 8 -.init_border: ; initialize borders - lea rdi, [rdi + rax] - movq [rdi], xmm1 - - dec rcx - jne .init_border - - - - lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8] - neg rax - - pxor xmm5, xmm5 - pxor xmm6, xmm6 ; - - pxor xmm7, xmm7 ; - mov rdi, rsi - - mov rcx, 15 ; - -.loop_initvar: - movq xmm1, QWORD PTR [rdi]; - punpcklbw xmm1, xmm0 ; - - paddw xmm5, xmm1 ; - pmullw xmm1, xmm1 ; - - movdqa xmm2, xmm1 ; - punpcklwd xmm1, xmm0 ; - - punpckhwd xmm2, xmm0 ; - paddd xmm6, 
xmm1 ; - - paddd xmm7, xmm2 ; - lea rdi, [rdi+rax] ; - - dec rcx - jne .loop_initvar - ;save the var and sum - xor rdx, rdx -.loop_row: - movq xmm1, QWORD PTR [rsi] ; [s-pitch*8] - movq xmm2, QWORD PTR [rdi] ; [s+pitch*7] - - punpcklbw xmm1, xmm0 - punpcklbw xmm2, xmm0 - - paddw xmm5, xmm2 - psubw xmm5, xmm1 - - pmullw xmm2, xmm2 - movdqa xmm4, xmm2 - - punpcklwd xmm2, xmm0 - punpckhwd xmm4, xmm0 - - paddd xmm6, xmm2 - paddd xmm7, xmm4 - - pmullw xmm1, xmm1 - movdqa xmm2, xmm1 - - punpcklwd xmm1, xmm0 - psubd xmm6, xmm1 - - punpckhwd xmm2, xmm0 - psubd xmm7, xmm2 - - - movdqa xmm3, xmm6 - pslld xmm3, 4 - - psubd xmm3, xmm6 - movdqa xmm1, xmm5 - - movdqa xmm4, xmm5 - pmullw xmm1, xmm1 - - pmulhw xmm4, xmm4 - movdqa xmm2, xmm1 - - punpcklwd xmm1, xmm4 - punpckhwd xmm2, xmm4 - - movdqa xmm4, xmm7 - pslld xmm4, 4 - - psubd xmm4, xmm7 - - psubd xmm3, xmm1 - psubd xmm4, xmm2 - - psubd xmm3, flimit4 - psubd xmm4, flimit4 - - psrad xmm3, 31 - psrad xmm4, 31 - - packssdw xmm3, xmm4 - packsswb xmm3, xmm0 - - movq xmm1, QWORD PTR [rsi+rax*8] - - movq xmm2, xmm1 - punpcklbw xmm1, xmm0 - - paddw xmm1, xmm5 - mov rcx, rdx - - and rcx, 127 -%if ABI_IS_32BIT=1 && CONFIG_PIC=1 - push rax - lea rax, [GLOBAL(sym(vpx_rv))] - movdqu xmm4, [rax + rcx*2] ;vpx_rv[rcx*2] - pop rax -%elif ABI_IS_32BIT=0 - movdqu xmm4, [r8 + rcx*2] ;vpx_rv[rcx*2] -%else - movdqu xmm4, [sym(vpx_rv) + rcx*2] -%endif - - paddw xmm1, xmm4 - ;paddw xmm1, eight8s - psraw xmm1, 4 - - packuswb xmm1, xmm0 - pand xmm1, xmm3 - - pandn xmm3, xmm2 - por xmm1, xmm3 - - and rcx, 15 - movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8] - - cmp edx, 8 - jl .skip_assignment - - mov rcx, rdx - sub rcx, 8 - and rcx, 15 - movq mm0, [rsp + rcx*8] ;d[rcx*8] - movq [rsi], mm0 - -.skip_assignment: - lea rsi, [rsi+rax] - - lea rdi, [rdi+rax] - add rdx, 1 - - cmp edx, dword arg(2) ;rows - jl .loop_row - - add dword arg(0), 8 ; s += 8 - sub dword arg(3), 8 ; cols -= 8 - cmp dword arg(3), 0 - jg .loop_col - - add rsp, 128+16 - pop rsp - - ; 
begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -%undef flimit4 - ;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, ; int pitch, int rows, int cols,int flimit) |