diff options
author | Yunqing Wang <yunqingwang@google.com> | 2012-09-28 10:13:07 -0700 |
---|---|---|
committer | Yunqing Wang <yunqingwang@google.com> | 2012-10-08 12:06:44 -0700 |
commit | 4c53bacce4a97d98a4e73262bb3517d38ddd3514 (patch) | |
tree | 15f52f2961d64c89a246862bf59f149eee68e7dd /vp8/common/x86/postproc_mmx.asm | |
parent | 9704cdec9fb777833599312fc84e3f1311d25eed (diff) | |
download | libvpx-4c53bacce4a97d98a4e73262bb3517d38ddd3514.tar libvpx-4c53bacce4a97d98a4e73262bb3517d38ddd3514.tar.gz libvpx-4c53bacce4a97d98a4e73262bb3517d38ddd3514.tar.bz2 libvpx-4c53bacce4a97d98a4e73262bb3517d38ddd3514.zip |
post-proc: deblock filter optimization
1. Algorithm modification:
Instead of using the same filter threshold for a whole frame, we now
allow the thresholds to be adjusted for each macroblock. In the current
implementation, to avoid excessive blur on the background as reported
in issue 480 (http://code.google.com/p/webm/issues/detail?id=480), we
reduce the thresholds for skipped macroblocks.
2. SSE2 optimization:
As started in issue 479 (http://code.google.com/p/webm/issues/detail?id=479),
the filter calculation was adjusted for better performance. The C
code was also modified accordingly. This made the deblock filter
2x faster, and the decoder was 1.2x faster overall.
Next, the demacroblock filter will be modified similarly.
Change-Id: I05e54c3f580ccd427487d085096b3174f2ab7e86
Diffstat (limited to 'vp8/common/x86/postproc_mmx.asm')
-rw-r--r-- | vp8/common/x86/postproc_mmx.asm | 265 |
1 files changed, 0 insertions, 265 deletions
diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm index 534f2967a..966c586e4 100644 --- a/vp8/common/x86/postproc_mmx.asm +++ b/vp8/common/x86/postproc_mmx.asm @@ -14,271 +14,6 @@ %define VP8_FILTER_WEIGHT 128 %define VP8_FILTER_SHIFT 7 -;void vp8_post_proc_down_and_across_mmx -;( -; unsigned char *src_ptr, -; unsigned char *dst_ptr, -; int src_pixels_per_line, -; int dst_pixels_per_line, -; int rows, -; int cols, -; int flimit -;) -global sym(vp8_post_proc_down_and_across_mmx) PRIVATE -sym(vp8_post_proc_down_and_across_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -%if ABI_IS_32BIT=1 && CONFIG_PIC=1 - ; move the global rd onto the stack, since we don't have enough registers - ; to do PIC addressing - movq mm0, [GLOBAL(rd)] - sub rsp, 8 - movq [rsp], mm0 -%define RD [rsp] -%else -%define RD [GLOBAL(rd)] -%endif - - push rbx - lea rbx, [GLOBAL(Blur)] - movd mm2, dword ptr arg(6) ;flimit - punpcklwd mm2, mm2 - punpckldq mm2, mm2 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;dst_ptr - - movsxd rcx, DWORD PTR arg(4) ;rows - movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch? 
- pxor mm0, mm0 ; mm0 = 00000000 - -.nextrow: - - xor rdx, rdx ; clear out rdx for use as loop counter -.nextcol: - - pxor mm7, mm7 ; mm7 = 00000000 - movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps - movq mm3, [rsi] ; mm4 = r0 p0..p7 - punpcklbw mm3, mm0 ; mm3 = p0..p3 - movq mm1, mm3 ; mm1 = p0..p3 - pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers - - movq mm6, [rbx + 48] ; mm6 = kernel 3 taps - movq mm5, [rsi + rax] ; mm4 = r1 p0..p7 - punpcklbw mm5, mm0 ; mm5 = r1 p0..p3 - pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers - paddusw mm3, mm6 ; mm3 += mm6 - - ; thresholding - movq mm7, mm1 ; mm7 = r0 p0..p3 - psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3 - psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3 - paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3) - pcmpgtw mm7, mm2 - - movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers - movq mm5, [rsi + 2*rax] ; mm4 = r2 p0..p7 - punpcklbw mm5, mm0 ; mm5 = r2 p0..p3 - pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = r0 p0..p3 - psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3 - psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r2 p0..p3 - paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - - neg rax - movq mm6, [rbx ] ; kernel 0 taps - movq mm5, [rsi+2*rax] ; mm4 = r-2 p0..p7 - punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3 - pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = r0 p0..p3 - psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3 - psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3 - paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - movq mm6, [rbx + 16] ; kernel 1 taps - movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7 - punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3 - pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. 
- paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = r0 p0..p3 - psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3 - psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3 - paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - - paddusw mm3, RD ; mm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 - - pand mm1, mm7 ; mm1 select vals > thresh from source - pandn mm7, mm3 ; mm7 select vals < thresh from blurred result - paddusw mm1, mm7 ; combination - - packuswb mm1, mm0 ; pack to bytes - - movd [rdi], mm1 ; - neg rax ; pitch is positive - - - add rsi, 4 - add rdi, 4 - add rdx, 4 - - cmp edx, dword ptr arg(5) ;cols - jl .nextcol - ; done with the all cols, start the across filtering in place - sub rsi, rdx - sub rdi, rdx - - ; dup the first byte into the left border 8 times - movq mm1, [rdi] - punpcklbw mm1, mm1 - punpcklwd mm1, mm1 - punpckldq mm1, mm1 - - mov rdx, -8 - movq [rdi+rdx], mm1 - - ; dup the last byte into the right border - movsxd rdx, dword arg(5) - movq mm1, [rdi + rdx + -1] - punpcklbw mm1, mm1 - punpcklwd mm1, mm1 - punpckldq mm1, mm1 - movq [rdi+rdx], mm1 - - - push rax - xor rdx, rdx - mov rax, [rdi-4]; - -.acrossnextcol: - pxor mm7, mm7 ; mm7 = 00000000 - movq mm6, [rbx + 32 ] ; - movq mm4, [rdi+rdx] ; mm4 = p0..p7 - movq mm3, mm4 ; mm3 = p0..p7 - punpcklbw mm3, mm0 ; mm3 = p0..p3 - movq mm1, mm3 ; mm1 = p0..p3 - pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers - - movq mm6, [rbx + 48] - psrlq mm4, 8 ; mm4 = p1..p7 - movq mm5, mm4 ; mm5 = p1..p7 - punpcklbw mm5, mm0 ; mm5 = p1..p4 - pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers - paddusw mm3, mm6 ; mm3 += mm6 - - ; thresholding - movq mm7, mm1 ; mm7 = p0..p3 - psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4) - pcmpgtw mm7, mm2 - - movq mm6, [rbx + 64 ] - psrlq mm4, 8 ; mm4 = p2..p7 - movq mm5, mm4 ; mm5 = p2..p7 - punpcklbw mm5, mm0 ; 
mm5 = p2..p5 - pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = p0..p3 - psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - - movq mm6, [rbx ] - movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5 - movq mm5, mm4 ; mm5 = p-2..p5 - punpcklbw mm5, mm0 ; mm5 = p-2..p1 - pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = p0..p3 - psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - movq mm6, [rbx + 16] - psrlq mm4, 8 ; mm4 = p-1..p5 - punpcklbw mm4, mm0 ; mm4 = p-1..p2 - pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = p0..p3 - psubusw mm6, mm4 ; mm6 = p0..p3 - p1..p4 - psubusw mm4, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - paddusw mm3, RD ; mm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 - - pand mm1, mm7 ; mm1 select vals > thresh from source - pandn mm7, mm3 ; mm7 select vals < thresh from blurred result - paddusw mm1, mm7 ; combination - - packuswb mm1, mm0 ; pack to bytes - mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes - movd eax, mm1 - - add rdx, 4 - cmp edx, dword ptr arg(5) ;cols - jl .acrossnextcol; - - mov DWORD PTR [rdi+rdx-4], eax - pop rax - - ; done with this rwo - add rsi,rax ; next line - movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch? - add rdi,rax ; next destination - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch? 
- - dec rcx ; decrement count - jnz .nextrow ; next row - pop rbx - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret -%undef RD - - ;void vp8_mbpost_proc_down_mmx(unsigned char *dst, ; int pitch, int rows, int cols,int flimit) extern sym(vp8_rv) |