summaryrefslogtreecommitdiff
path: root/vpx_dsp/x86/deblock_sse2.asm
diff options
context:
space:
mode:
authorJohann <johann.koenig@duck.com>2018-10-25 13:37:50 -0700
committerJohann <johann.koenig@duck.com>2018-10-29 18:53:32 -0700
commitc176e6490403076105faa2a07f275d31ec61d2a3 (patch)
tree977a36d51dcc5dd024d7fb3626bdef9d47c03eb6 /vpx_dsp/x86/deblock_sse2.asm
parentfa0076282e62f649483bde868602aab86448a661 (diff)
downloadlibvpx-c176e6490403076105faa2a07f275d31ec61d2a3.tar
libvpx-c176e6490403076105faa2a07f275d31ec61d2a3.tar.gz
libvpx-c176e6490403076105faa2a07f275d31ec61d2a3.tar.bz2
libvpx-c176e6490403076105faa2a07f275d31ec61d2a3.zip
vpx postproc: rewrite in intrinsics
About ~10% faster on 64bit but ~10% slower on 32 Removes the assembly usage of vpx_rv. Change-Id: I214698fb5677f615dee0a8f5f5bb8f64daf2565e
Diffstat (limited to 'vpx_dsp/x86/deblock_sse2.asm')
-rw-r--r--vpx_dsp/x86/deblock_sse2.asm231
1 files changed, 0 insertions, 231 deletions
diff --git a/vpx_dsp/x86/deblock_sse2.asm b/vpx_dsp/x86/deblock_sse2.asm
index 97cb43b67..9d8e5e3e0 100644
--- a/vpx_dsp/x86/deblock_sse2.asm
+++ b/vpx_dsp/x86/deblock_sse2.asm
@@ -232,237 +232,6 @@ sym(vpx_post_proc_down_and_across_mb_row_sse2):
ret
%undef flimit
-;void vpx_mbpost_proc_down_sse2(unsigned char *dst,
-; int pitch, int rows, int cols,int flimit)
-extern sym(vpx_rv)
-global sym(vpx_mbpost_proc_down_sse2) PRIVATE
-sym(vpx_mbpost_proc_down_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 128+16
-
- ; unsigned char d[16][8] at [rsp]
- ; create flimit2 at [rsp+128]
- mov eax, dword ptr arg(4) ;flimit
- mov [rsp+128], eax
- mov [rsp+128+4], eax
- mov [rsp+128+8], eax
- mov [rsp+128+12], eax
-%define flimit4 [rsp+128]
-
-%if ABI_IS_32BIT=0
- lea r8, [GLOBAL(sym(vpx_rv))]
-%endif
-
- ;rows +=8;
- add dword arg(2), 8
-
- ;for(c=0; c<cols; c+=8)
-.loop_col:
- mov rsi, arg(0) ; s
- pxor xmm0, xmm0 ;
-
- movsxd rax, dword ptr arg(1) ;pitch ;
-
- ; this copies the last row down into the border 8 rows
- mov rdi, rsi
- mov rdx, arg(2)
- sub rdx, 9
- imul rdx, rax
- lea rdi, [rdi+rdx]
- movq xmm1, QWORD ptr[rdi] ; first row
- mov rcx, 8
-.init_borderd: ; initialize borders
- lea rdi, [rdi + rax]
- movq [rdi], xmm1
-
- dec rcx
- jne .init_borderd
-
- neg rax ; rax = -pitch
-
- ; this copies the first row up into the border 8 rows
- mov rdi, rsi
- movq xmm1, QWORD ptr[rdi] ; first row
- mov rcx, 8
-.init_border: ; initialize borders
- lea rdi, [rdi + rax]
- movq [rdi], xmm1
-
- dec rcx
- jne .init_border
-
-
-
- lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8]
- neg rax
-
- pxor xmm5, xmm5
- pxor xmm6, xmm6 ;
-
- pxor xmm7, xmm7 ;
- mov rdi, rsi
-
- mov rcx, 15 ;
-
-.loop_initvar:
- movq xmm1, QWORD PTR [rdi];
- punpcklbw xmm1, xmm0 ;
-
- paddw xmm5, xmm1 ;
- pmullw xmm1, xmm1 ;
-
- movdqa xmm2, xmm1 ;
- punpcklwd xmm1, xmm0 ;
-
- punpckhwd xmm2, xmm0 ;
- paddd xmm6, xmm1 ;
-
- paddd xmm7, xmm2 ;
- lea rdi, [rdi+rax] ;
-
- dec rcx
- jne .loop_initvar
- ;save the var and sum
- xor rdx, rdx
-.loop_row:
- movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
- movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
-
- punpcklbw xmm1, xmm0
- punpcklbw xmm2, xmm0
-
- paddw xmm5, xmm2
- psubw xmm5, xmm1
-
- pmullw xmm2, xmm2
- movdqa xmm4, xmm2
-
- punpcklwd xmm2, xmm0
- punpckhwd xmm4, xmm0
-
- paddd xmm6, xmm2
- paddd xmm7, xmm4
-
- pmullw xmm1, xmm1
- movdqa xmm2, xmm1
-
- punpcklwd xmm1, xmm0
- psubd xmm6, xmm1
-
- punpckhwd xmm2, xmm0
- psubd xmm7, xmm2
-
-
- movdqa xmm3, xmm6
- pslld xmm3, 4
-
- psubd xmm3, xmm6
- movdqa xmm1, xmm5
-
- movdqa xmm4, xmm5
- pmullw xmm1, xmm1
-
- pmulhw xmm4, xmm4
- movdqa xmm2, xmm1
-
- punpcklwd xmm1, xmm4
- punpckhwd xmm2, xmm4
-
- movdqa xmm4, xmm7
- pslld xmm4, 4
-
- psubd xmm4, xmm7
-
- psubd xmm3, xmm1
- psubd xmm4, xmm2
-
- psubd xmm3, flimit4
- psubd xmm4, flimit4
-
- psrad xmm3, 31
- psrad xmm4, 31
-
- packssdw xmm3, xmm4
- packsswb xmm3, xmm0
-
- movq xmm1, QWORD PTR [rsi+rax*8]
-
- movq xmm2, xmm1
- punpcklbw xmm1, xmm0
-
- paddw xmm1, xmm5
- mov rcx, rdx
-
- and rcx, 127
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- push rax
- lea rax, [GLOBAL(sym(vpx_rv))]
- movdqu xmm4, [rax + rcx*2] ;vpx_rv[rcx*2]
- pop rax
-%elif ABI_IS_32BIT=0
- movdqu xmm4, [r8 + rcx*2] ;vpx_rv[rcx*2]
-%else
- movdqu xmm4, [sym(vpx_rv) + rcx*2]
-%endif
-
- paddw xmm1, xmm4
- ;paddw xmm1, eight8s
- psraw xmm1, 4
-
- packuswb xmm1, xmm0
- pand xmm1, xmm3
-
- pandn xmm3, xmm2
- por xmm1, xmm3
-
- and rcx, 15
- movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
-
- cmp edx, 8
- jl .skip_assignment
-
- mov rcx, rdx
- sub rcx, 8
- and rcx, 15
- movq mm0, [rsp + rcx*8] ;d[rcx*8]
- movq [rsi], mm0
-
-.skip_assignment:
- lea rsi, [rsi+rax]
-
- lea rdi, [rdi+rax]
- add rdx, 1
-
- cmp edx, dword arg(2) ;rows
- jl .loop_row
-
- add dword arg(0), 8 ; s += 8
- sub dword arg(3), 8 ; cols -= 8
- cmp dword arg(3), 0
- jg .loop_col
-
- add rsp, 128+16
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef flimit4
-
;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
; int pitch, int rows, int cols,int flimit)