From 4c53bacce4a97d98a4e73262bb3517d38ddd3514 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Fri, 28 Sep 2012 10:13:07 -0700 Subject: post-proc: deblock filter optimization 1. Algorithm modification: Instead of having same filter threshold for a whole frame, now we allow the thresholds to be adjusted for each macroblock. In current implementation, to avoid excessive blur on background as reported in issue480(http://code.google.com/p/webm/issues/detail?id=480), we reduce the thresholds for skipped macroblocks. 2. SSE2 optimization: As started in issue479(http://code.google.com/p/webm/issues/detail?id=479), the filter calculation was adjusted for better performance. The c code was also modified accordingly. This made the deblock filter 2x faster, and the decoder was 1.2x faster overall. Next, the demacroblock filter will be modified similarly. Change-Id: I05e54c3f580ccd427487d085096b3174f2ab7e86 --- vp8/common/postproc.c | 209 ++++++++++++++---------- vp8/common/postproc.h | 3 +- vp8/common/ppc/systemdependent.c | 14 +- vp8/common/rtcd_defs.sh | 5 +- vp8/common/x86/postproc_mmx.asm | 265 ------------------------------- vp8/common/x86/postproc_sse2.asm | 334 +++++++++++++++++---------------------- vp8/encoder/onyx_if.c | 2 +- 7 files changed, 282 insertions(+), 550 deletions(-) (limited to 'vp8') diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c index 444898784..752292eff 100644 --- a/vp8/common/postproc.c +++ b/vp8/common/postproc.c @@ -127,25 +127,24 @@ extern void vp8_blit_text(const char *msg, unsigned char *address, const int pit extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch); /*********************************************************************************************************** */ -void vp8_post_proc_down_and_across_c +void vp8_post_proc_down_and_across_mb_row_c ( unsigned char *src_ptr, unsigned char *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, - int rows, int cols, - int flimit + 
unsigned char *f, + int size ) { unsigned char *p_src, *p_dst; int row; int col; - int i; - int v; - unsigned char d[8]; + unsigned char v; + unsigned char d[4]; - for (row = 0; row < rows; row++) + for (row = 0; row < size; row++) { /* post_proc_down for one row */ p_src = src_ptr; @@ -153,20 +152,23 @@ void vp8_post_proc_down_and_across_c for (col = 0; col < cols; col++) { + unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line]; + unsigned char p_above1 = p_src[col - src_pixels_per_line]; + unsigned char p_below1 = p_src[col + src_pixels_per_line]; + unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line]; - int kernel = 4; - int v = p_src[col]; + v = p_src[col]; - for (i = -2; i <= 2; i++) + if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) + && (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) { - if (abs(v - p_src[col+i*src_pixels_per_line]) > flimit) - goto down_skip_convolve; - - kernel += kernel5[2+i] * p_src[col+i*src_pixels_per_line]; + unsigned char k1, k2, k3; + k1 = (p_above2 + p_above1 + 1) >> 1; + k2 = (p_below2 + p_below1 + 1) >> 1; + k3 = (k1 + k2 + 1) >> 1; + v = (k3 + v + 1) >> 1; } - v = (kernel >> 3); - down_skip_convolve: p_dst[col] = v; } @@ -174,40 +176,34 @@ void vp8_post_proc_down_and_across_c p_src = dst_ptr; p_dst = dst_ptr; - for (i = -8; i<0; i++) - p_src[i]=p_src[0]; - - for (i = cols; i flimit) - goto across_skip_convolve; - - kernel += kernel5[2+i] * p_src[col+i]; + unsigned char k1, k2, k3; + k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1; + k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1; + k3 = (k1 + k2 + 1) >> 1; + v = (k3 + v + 1) >> 1; } - d[col&7] = (kernel >> 3); - across_skip_convolve: + d[col & 3] = v; if (col >= 2) - p_dst[col-2] = d[(col-2)&7]; + p_dst[col - 2] = d[(col - 2) & 3]; } /* handle the last two pixels */ - p_dst[col-2] = d[(col-2)&7]; - p_dst[col-1] = d[(col-1)&7]; + p_dst[col - 2] = d[(col - 2) & 3]; + p_dst[col - 1] = d[(col - 1) & 3]; /* next row */ src_ptr += 
src_pixels_per_line; @@ -318,28 +314,17 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, i } } - -static void vp8_deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *post, - int q, - int low_var_thresh, - int flag) +static void vp8_de_mblock(YV12_BUFFER_CONFIG *post, + int q) { - double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; - int ppl = (int)(level + .5); - (void) low_var_thresh; - (void) flag; - - vp8_post_proc_down_and_across(source->y_buffer, post->y_buffer, source->y_stride, post->y_stride, source->y_height, source->y_width, ppl); - vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q)); - vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q)); - - vp8_post_proc_down_and_across(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl); - vp8_post_proc_down_and_across(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl); - + vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, + post->y_width, q2mbl(q)); + vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, + post->y_width, q2mbl(q)); } -void vp8_deblock(YV12_BUFFER_CONFIG *source, +void vp8_deblock(VP8_COMMON *cm, + YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int q, int low_var_thresh, @@ -347,12 +332,58 @@ void vp8_deblock(YV12_BUFFER_CONFIG *source, { double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; int ppl = (int)(level + .5); + + const MODE_INFO *mode_info_context = cm->mi; + int mbr, mbc; + + /* The pixel thresholds are adjusted according to if or not the macroblock + * is a skipped block. 
*/ + unsigned char *ylimits = (unsigned char *)vpx_memalign(16, 16 * cm->mb_cols); + unsigned char *uvlimits = (unsigned char *)vpx_memalign(16, 8 * cm->mb_cols); (void) low_var_thresh; (void) flag; - vp8_post_proc_down_and_across(source->y_buffer, post->y_buffer, source->y_stride, post->y_stride, source->y_height, source->y_width, ppl); - vp8_post_proc_down_and_across(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl); - vp8_post_proc_down_and_across(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl); + if (ppl > 0) + { + for (mbr = 0; mbr < cm->mb_rows; mbr++) + { + unsigned char *ylptr = ylimits; + unsigned char *uvlptr = uvlimits; + for (mbc = 0; mbc < cm->mb_cols; mbc++) + { + unsigned char mb_ppl; + + if (mode_info_context->mbmi.mb_skip_coeff) + mb_ppl = (unsigned char)ppl >> 1; + else + mb_ppl = (unsigned char)ppl; + + vpx_memset(ylptr, mb_ppl, 16); + vpx_memset(uvlptr, mb_ppl, 8); + + ylptr += 16; + uvlptr += 8; + mode_info_context++; + } + mode_info_context++; + + vp8_post_proc_down_and_across_mb_row( + source->y_buffer + 16 * mbr * source->y_stride, + post->y_buffer + 16 * mbr * post->y_stride, source->y_stride, + post->y_stride, source->y_width, ylimits, 16); + + vp8_post_proc_down_and_across_mb_row( + source->u_buffer + 8 * mbr * source->uv_stride, + post->u_buffer + 8 * mbr * post->uv_stride, source->uv_stride, + post->uv_stride, source->uv_width, uvlimits, 8); + vp8_post_proc_down_and_across_mb_row( + source->v_buffer + 8 * mbr * source->uv_stride, + post->v_buffer + 8 * mbr * post->uv_stride, source->uv_stride, + post->uv_stride, source->uv_width, uvlimits, 8); + } + } + vpx_free(ylimits); + vpx_free(uvlimits); } #if !(CONFIG_TEMPORAL_DENOISING) @@ -364,33 +395,35 @@ void vp8_de_noise(YV12_BUFFER_CONFIG *source, { double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; int ppl = (int)(level + .5); + int mb_rows = 
source->y_width >> 4; + int mb_cols = source->y_height >> 4; + unsigned char *limits = (unsigned char *)vpx_memalign(16, 16 * mb_cols); + int mbr, mbc; (void) post; (void) low_var_thresh; (void) flag; - vp8_post_proc_down_and_across( - source->y_buffer + 2 * source->y_stride + 2, - source->y_buffer + 2 * source->y_stride + 2, - source->y_stride, - source->y_stride, - source->y_height - 4, - source->y_width - 4, - ppl); - vp8_post_proc_down_and_across( - source->u_buffer + 2 * source->uv_stride + 2, - source->u_buffer + 2 * source->uv_stride + 2, - source->uv_stride, - source->uv_stride, - source->uv_height - 4, - source->uv_width - 4, ppl); - vp8_post_proc_down_and_across( - source->v_buffer + 2 * source->uv_stride + 2, - source->v_buffer + 2 * source->uv_stride + 2, - source->uv_stride, - source->uv_stride, - source->uv_height - 4, - source->uv_width - 4, ppl); + /* TODO: The original code don't filter the 2 outer rows and columns. */ + vpx_memset(limits, (unsigned char)ppl, 16 * mb_cols); + for (mbr = 0; mbr < mb_rows; mbr++) + { + vp8_post_proc_down_and_across_mb_row( + source->y_buffer + 16 * mbr * source->y_stride, + source->y_buffer + 16 * mbr * source->y_stride, + source->y_stride, source->y_stride, source->y_width, limits, 16); + + vp8_post_proc_down_and_across_mb_row( + source->u_buffer + 8 * mbr * source->uv_stride, + source->u_buffer + 8 * mbr * source->uv_stride, + source->uv_stride, source->uv_stride, source->uv_width, limits, 8); + vp8_post_proc_down_and_across_mb_row( + source->v_buffer + 8 * mbr * source->uv_stride, + source->v_buffer + 8 * mbr * source->uv_stride, + source->uv_stride, source->uv_stride, source->uv_width, limits, 8); + } + + vpx_free(limits); } #endif @@ -752,12 +785,14 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t vp8_yv12_copy_frame(&oci->post_proc_buffer, &oci->post_proc_buffer_int); if (flags & VP8D_DEMACROBLOCK) { - vp8_deblock_and_de_macro_block(&oci->post_proc_buffer_int, 
&oci->post_proc_buffer, + vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer, q + (deblock_level - 5) * 10, 1, 0); + vp8_de_mblock(&oci->post_proc_buffer, + q + (deblock_level - 5) * 10); } else if (flags & VP8D_DEBLOCK) { - vp8_deblock(&oci->post_proc_buffer_int, &oci->post_proc_buffer, + vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer, q, 1, 0); } } @@ -766,13 +801,15 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t } else if (flags & VP8D_DEMACROBLOCK) { - vp8_deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer, - q + (deblock_level - 5) * 10, 1, 0); + vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer, + q + (deblock_level - 5) * 10, 1, 0); + vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10); + oci->postproc_state.last_base_qindex = oci->base_qindex; } else if (flags & VP8D_DEBLOCK) { - vp8_deblock(oci->frame_to_show, &oci->post_proc_buffer, + vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer, q, 1, 0); oci->postproc_state.last_base_qindex = oci->base_qindex; } diff --git a/vp8/common/postproc.h b/vp8/common/postproc.h index 6ac788cbd..a156398d2 100644 --- a/vp8/common/postproc.h +++ b/vp8/common/postproc.h @@ -36,7 +36,8 @@ void vp8_de_noise(YV12_BUFFER_CONFIG *source, int low_var_thresh, int flag); -void vp8_deblock(YV12_BUFFER_CONFIG *source, +void vp8_deblock(struct VP8Common *oci, + YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int q, int low_var_thresh, diff --git a/vp8/common/ppc/systemdependent.c b/vp8/common/ppc/systemdependent.c index 7046a63e8..87f4cac72 100644 --- a/vp8/common/ppc/systemdependent.c +++ b/vp8/common/ppc/systemdependent.c @@ -19,14 +19,14 @@ void (*vp8_short_idct4x4)(short *input, short *output, int pitch); void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch); void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch); -extern void (*vp8_post_proc_down_and_across)( +extern void 
(*vp8_post_proc_down_and_across_mb_row)( unsigned char *src_ptr, unsigned char *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, - int rows, int cols, - int flimit + unsigned char *f, + int size ); extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit); @@ -34,15 +34,15 @@ extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit); extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit); -extern void vp8_post_proc_down_and_across_c +extern void vp8_post_proc_down_and_across_mb_row_c ( unsigned char *src_ptr, unsigned char *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, - int rows, int cols, - int flimit + unsigned char *f, + int size ); void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a); @@ -158,7 +158,7 @@ void vp8_machine_specific_config(void) vp8_lf_mbhsimple = loop_filter_mbhs_ppc; vp8_lf_bhsimple = loop_filter_bhs_ppc; - vp8_post_proc_down_and_across = vp8_post_proc_down_and_across_c; + vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c; vp8_mbpost_proc_down = vp8_mbpost_proc_down_c; vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c; vp8_plane_add_noise = vp8_plane_add_noise_c; diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh index f0bdf29be..0f950f8ab 100644 --- a/vp8/common/rtcd_defs.sh +++ b/vp8/common/rtcd_defs.sh @@ -162,9 +162,8 @@ if [ "$CONFIG_POSTPROC" = "yes" ]; then specialize vp8_mbpost_proc_across_ip sse2 vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm - prototype void vp8_post_proc_down_and_across "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int rows, int cols, int flimit" - specialize vp8_post_proc_down_and_across mmx sse2 - 
vp8_post_proc_down_and_across_sse2=vp8_post_proc_down_and_across_xmm + prototype void vp8_post_proc_down_and_across_mb_row "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size" + specialize vp8_post_proc_down_and_across_mb_row sse2 prototype void vp8_plane_add_noise "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch" specialize vp8_plane_add_noise mmx sse2 diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm index 534f2967a..966c586e4 100644 --- a/vp8/common/x86/postproc_mmx.asm +++ b/vp8/common/x86/postproc_mmx.asm @@ -14,271 +14,6 @@ %define VP8_FILTER_WEIGHT 128 %define VP8_FILTER_SHIFT 7 -;void vp8_post_proc_down_and_across_mmx -;( -; unsigned char *src_ptr, -; unsigned char *dst_ptr, -; int src_pixels_per_line, -; int dst_pixels_per_line, -; int rows, -; int cols, -; int flimit -;) -global sym(vp8_post_proc_down_and_across_mmx) PRIVATE -sym(vp8_post_proc_down_and_across_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -%if ABI_IS_32BIT=1 && CONFIG_PIC=1 - ; move the global rd onto the stack, since we don't have enough registers - ; to do PIC addressing - movq mm0, [GLOBAL(rd)] - sub rsp, 8 - movq [rsp], mm0 -%define RD [rsp] -%else -%define RD [GLOBAL(rd)] -%endif - - push rbx - lea rbx, [GLOBAL(Blur)] - movd mm2, dword ptr arg(6) ;flimit - punpcklwd mm2, mm2 - punpckldq mm2, mm2 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;dst_ptr - - movsxd rcx, DWORD PTR arg(4) ;rows - movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch? 
- pxor mm0, mm0 ; mm0 = 00000000 - -.nextrow: - - xor rdx, rdx ; clear out rdx for use as loop counter -.nextcol: - - pxor mm7, mm7 ; mm7 = 00000000 - movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps - movq mm3, [rsi] ; mm4 = r0 p0..p7 - punpcklbw mm3, mm0 ; mm3 = p0..p3 - movq mm1, mm3 ; mm1 = p0..p3 - pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers - - movq mm6, [rbx + 48] ; mm6 = kernel 3 taps - movq mm5, [rsi + rax] ; mm4 = r1 p0..p7 - punpcklbw mm5, mm0 ; mm5 = r1 p0..p3 - pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers - paddusw mm3, mm6 ; mm3 += mm6 - - ; thresholding - movq mm7, mm1 ; mm7 = r0 p0..p3 - psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3 - psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3 - paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3) - pcmpgtw mm7, mm2 - - movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers - movq mm5, [rsi + 2*rax] ; mm4 = r2 p0..p7 - punpcklbw mm5, mm0 ; mm5 = r2 p0..p3 - pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = r0 p0..p3 - psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3 - psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r2 p0..p3 - paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - - neg rax - movq mm6, [rbx ] ; kernel 0 taps - movq mm5, [rsi+2*rax] ; mm4 = r-2 p0..p7 - punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3 - pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = r0 p0..p3 - psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3 - psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3 - paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - movq mm6, [rbx + 16] ; kernel 1 taps - movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7 - punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3 - pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. 
- paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = r0 p0..p3 - psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3 - psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3 - paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - - paddusw mm3, RD ; mm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 - - pand mm1, mm7 ; mm1 select vals > thresh from source - pandn mm7, mm3 ; mm7 select vals < thresh from blurred result - paddusw mm1, mm7 ; combination - - packuswb mm1, mm0 ; pack to bytes - - movd [rdi], mm1 ; - neg rax ; pitch is positive - - - add rsi, 4 - add rdi, 4 - add rdx, 4 - - cmp edx, dword ptr arg(5) ;cols - jl .nextcol - ; done with the all cols, start the across filtering in place - sub rsi, rdx - sub rdi, rdx - - ; dup the first byte into the left border 8 times - movq mm1, [rdi] - punpcklbw mm1, mm1 - punpcklwd mm1, mm1 - punpckldq mm1, mm1 - - mov rdx, -8 - movq [rdi+rdx], mm1 - - ; dup the last byte into the right border - movsxd rdx, dword arg(5) - movq mm1, [rdi + rdx + -1] - punpcklbw mm1, mm1 - punpcklwd mm1, mm1 - punpckldq mm1, mm1 - movq [rdi+rdx], mm1 - - - push rax - xor rdx, rdx - mov rax, [rdi-4]; - -.acrossnextcol: - pxor mm7, mm7 ; mm7 = 00000000 - movq mm6, [rbx + 32 ] ; - movq mm4, [rdi+rdx] ; mm4 = p0..p7 - movq mm3, mm4 ; mm3 = p0..p7 - punpcklbw mm3, mm0 ; mm3 = p0..p3 - movq mm1, mm3 ; mm1 = p0..p3 - pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers - - movq mm6, [rbx + 48] - psrlq mm4, 8 ; mm4 = p1..p7 - movq mm5, mm4 ; mm5 = p1..p7 - punpcklbw mm5, mm0 ; mm5 = p1..p4 - pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers - paddusw mm3, mm6 ; mm3 += mm6 - - ; thresholding - movq mm7, mm1 ; mm7 = p0..p3 - psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4) - pcmpgtw mm7, mm2 - - movq mm6, [rbx + 64 ] - psrlq mm4, 8 ; mm4 = p2..p7 - movq mm5, mm4 ; mm5 = p2..p7 - punpcklbw mm5, mm0 ; 
mm5 = p2..p5 - pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = p0..p3 - psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - - movq mm6, [rbx ] - movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5 - movq mm5, mm4 ; mm5 = p-2..p5 - punpcklbw mm5, mm0 ; mm5 = p-2..p1 - pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = p0..p3 - psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - movq mm6, [rbx + 16] - psrlq mm4, 8 ; mm4 = p-1..p5 - punpcklbw mm4, mm0 ; mm4 = p-1..p2 - pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = p0..p3 - psubusw mm6, mm4 ; mm6 = p0..p3 - p1..p4 - psubusw mm4, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - paddusw mm3, RD ; mm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 - - pand mm1, mm7 ; mm1 select vals > thresh from source - pandn mm7, mm3 ; mm7 select vals < thresh from blurred result - paddusw mm1, mm7 ; combination - - packuswb mm1, mm0 ; pack to bytes - mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes - movd eax, mm1 - - add rdx, 4 - cmp edx, dword ptr arg(5) ;cols - jl .acrossnextcol; - - mov DWORD PTR [rdi+rdx-4], eax - pop rax - - ; done with this rwo - add rsi,rax ; next line - movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch? - add rdi,rax ; next destination - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch? 
- - dec rcx ; decrement count - jnz .nextrow ; next row - pop rbx - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret -%undef RD - - ;void vp8_mbpost_proc_down_mmx(unsigned char *dst, ; int pitch, int rows, int cols,int flimit) extern sym(vp8_rv) diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm index bf36b0d7f..25c32e148 100644 --- a/vp8/common/x86/postproc_sse2.asm +++ b/vp8/common/x86/postproc_sse2.asm @@ -11,146 +11,158 @@ %include "vpx_ports/x86_abi_support.asm" -;void vp8_post_proc_down_and_across_xmm +;macro in deblock functions +%macro FIRST_2_ROWS 0 + movdqa xmm4, xmm0 + movdqa xmm6, xmm0 + movdqa xmm5, xmm1 + pavgb xmm5, xmm3 + + ;calculate absolute value + psubusb xmm4, xmm1 + psubusb xmm1, xmm0 + psubusb xmm6, xmm3 + psubusb xmm3, xmm0 + paddusb xmm4, xmm1 + paddusb xmm6, xmm3 + + ;get threshold + movdqa xmm2, flimit + pxor xmm1, xmm1 + movdqa xmm7, xmm2 + + ;get mask + psubusb xmm2, xmm4 + psubusb xmm7, xmm6 + pcmpeqb xmm2, xmm1 + pcmpeqb xmm7, xmm1 + por xmm7, xmm2 +%endmacro + +%macro SECOND_2_ROWS 0 + movdqa xmm6, xmm0 + movdqa xmm4, xmm0 + movdqa xmm2, xmm1 + pavgb xmm1, xmm3 + + ;calculate absolute value + psubusb xmm6, xmm2 + psubusb xmm2, xmm0 + psubusb xmm4, xmm3 + psubusb xmm3, xmm0 + paddusb xmm6, xmm2 + paddusb xmm4, xmm3 + + pavgb xmm5, xmm1 + + ;get threshold + movdqa xmm2, flimit + pxor xmm1, xmm1 + movdqa xmm3, xmm2 + + ;get mask + psubusb xmm2, xmm6 + psubusb xmm3, xmm4 + pcmpeqb xmm2, xmm1 + pcmpeqb xmm3, xmm1 + + por xmm7, xmm2 + por xmm7, xmm3 + + pavgb xmm5, xmm0 + + ;decide if or not to use filtered value + pand xmm0, xmm7 + pandn xmm7, xmm5 + paddusb xmm0, xmm7 +%endmacro + +%macro UPDATE_FLIMIT 0 + movdqa xmm2, XMMWORD PTR [rbx] + movdqa [rsp], xmm2 + add rbx, 16 +%endmacro + +;void vp8_post_proc_down_and_across_mb_row_sse2 ;( ; unsigned char *src_ptr, ; unsigned char *dst_ptr, ; int src_pixels_per_line, ; int dst_pixels_per_line, -; int rows, ; int cols, -; int 
flimit +; unsigned char *flimits, +; int size ;) -global sym(vp8_post_proc_down_and_across_xmm) PRIVATE -sym(vp8_post_proc_down_and_across_xmm): +global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE +sym(vp8_post_proc_down_and_across_mb_row_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 SAVE_XMM 7 - GET_GOT rbx + push rbx push rsi push rdi ; end prolog - -%if ABI_IS_32BIT=1 && CONFIG_PIC=1 ALIGN_STACK 16, rax - ; move the global rd onto the stack, since we don't have enough registers - ; to do PIC addressing - movdqa xmm0, [GLOBAL(rd42)] sub rsp, 16 - movdqa [rsp], xmm0 -%define RD42 [rsp] -%else -%define RD42 [GLOBAL(rd42)] -%endif - - movd xmm2, dword ptr arg(6) ;flimit - punpcklwd xmm2, xmm2 - punpckldq xmm2, xmm2 - punpcklqdq xmm2, xmm2 + ; put flimit on stack + mov rbx, arg(5) ;flimits ptr + UPDATE_FLIMIT - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;dst_ptr +%define flimit [rsp] - movsxd rcx, DWORD PTR arg(4) ;rows - movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch? 
- pxor xmm0, xmm0 ; mm0 = 00000000 + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;dst_ptr + movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line + movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock .nextrow: - - xor rdx, rdx ; clear out rdx for use as loop counter + xor rdx, rdx ;col .nextcol: - movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7 - punpcklbw xmm3, xmm0 ; mm3 = p0..p3 - movdqa xmm1, xmm3 ; mm1 = p0..p3 - psllw xmm3, 2 ; - - movq xmm5, QWORD PTR [rsi + rax] ; mm4 = r1 p0..p7 - punpcklbw xmm5, xmm0 ; mm5 = r1 p0..p3 - paddusw xmm3, xmm5 ; mm3 += mm6 - - ; thresholding - movdqa xmm7, xmm1 ; mm7 = r0 p0..p3 - psubusw xmm7, xmm5 ; mm7 = r0 p0..p3 - r1 p0..p3 - psubusw xmm5, xmm1 ; mm5 = r1 p0..p3 - r0 p0..p3 - paddusw xmm7, xmm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3) - pcmpgtw xmm7, xmm2 - - movq xmm5, QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7 - punpcklbw xmm5, xmm0 ; mm5 = r2 p0..p3 - paddusw xmm3, xmm5 ; mm3 += mm5 - - ; thresholding - movdqa xmm6, xmm1 ; mm6 = r0 p0..p3 - psubusw xmm6, xmm5 ; mm6 = r0 p0..p3 - r2 p0..p3 - psubusw xmm5, xmm1 ; mm5 = r2 p0..p3 - r2 p0..p3 - paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; accumulate thresholds + ;load current and next 2 rows + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + rax] + movdqu xmm3, XMMWORD PTR [rsi + 2*rax] + FIRST_2_ROWS + ;load above 2 rows neg rax - movq xmm5, QWORD PTR [rsi+2*rax] ; mm4 = r-2 p0..p7 - punpcklbw xmm5, xmm0 ; mm5 = r-2 p0..p3 - paddusw xmm3, xmm5 ; mm3 += mm5 - - ; thresholding - movdqa xmm6, xmm1 ; mm6 = r0 p0..p3 - psubusw xmm6, xmm5 ; mm6 = p0..p3 - r-2 p0..p3 - psubusw xmm5, xmm1 ; mm5 = r-2 p0..p3 - p0..p3 - paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; accumulate thresholds - - movq xmm4, QWORD PTR [rsi+rax] ; mm4 = r-1 p0..p7 - punpcklbw xmm4, xmm0 ; mm4 = r-1 p0..p3 - paddusw xmm3, xmm4 ; mm3 += mm5 - - ; thresholding - movdqa xmm6, xmm1 ; mm6 = r0 p0..p3 - psubusw xmm6, 
xmm4 ; mm6 = p0..p3 - r-2 p0..p3 - psubusw xmm4, xmm1 ; mm5 = r-1 p0..p3 - p0..p3 - paddusw xmm6, xmm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; accumulate thresholds - - - paddusw xmm3, RD42 ; mm3 += round value - psraw xmm3, 3 ; mm3 /= 8 + movdqu xmm1, XMMWORD PTR [rsi + 2*rax] + movdqu xmm3, XMMWORD PTR [rsi + rax] - pand xmm1, xmm7 ; mm1 select vals > thresh from source - pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result - paddusw xmm1, xmm7 ; combination + SECOND_2_ROWS - packuswb xmm1, xmm0 ; pack to bytes - movq QWORD PTR [rdi], xmm1 ; + movdqu XMMWORD PTR [rdi], xmm0 - neg rax ; pitch is positive - add rsi, 8 - add rdi, 8 + neg rax ; positive stride + add rsi, 16 + add rdi, 16 - add rdx, 8 - cmp edx, dword arg(5) ;cols + UPDATE_FLIMIT + add rdx, 16 + cmp edx, dword arg(4) ;cols jl .nextcol ; done with the all cols, start the across filtering in place sub rsi, rdx sub rdi, rdx + mov rbx, arg(5) ; flimits + UPDATE_FLIMIT ; dup the first byte into the left border 8 times movq mm1, [rdi] punpcklbw mm1, mm1 punpcklwd mm1, mm1 punpckldq mm1, mm1 - mov rdx, -8 movq [rdi+rdx], mm1 ; dup the last byte into the right border - movsxd rdx, dword arg(5) + movsxd rdx, dword arg(4) movq mm1, [rdi + rdx + -1] punpcklbw mm1, mm1 punpcklwd mm1, mm1 @@ -158,113 +170,63 @@ sym(vp8_post_proc_down_and_across_xmm): movq [rdi+rdx], mm1 xor rdx, rdx - movq mm0, QWORD PTR [rdi-8]; + movq mm0, QWORD PTR [rdi-16]; + movq mm1, QWORD PTR [rdi-8]; .acrossnextcol: - movq xmm7, QWORD PTR [rdi +rdx -2] - movd xmm4, DWORD PTR [rdi +rdx +6] - - pslldq xmm4, 8 - por xmm4, xmm7 - - movdqa xmm3, xmm4 - psrldq xmm3, 2 - punpcklbw xmm3, xmm0 ; mm3 = p0..p3 - movdqa xmm1, xmm3 ; mm1 = p0..p3 - psllw xmm3, 2 - - - movdqa xmm5, xmm4 - psrldq xmm5, 3 - punpcklbw xmm5, xmm0 ; mm5 = p1..p4 - paddusw xmm3, xmm5 ; mm3 += mm6 - - ; thresholding - movdqa xmm7, xmm1 ; mm7 = p0..p3 - psubusw xmm7, xmm5 ; mm7 = p0..p3 - p1..p4 - psubusw xmm5, xmm1 ; mm5 = p1..p4 
- p0..p3 - paddusw xmm7, xmm5 ; mm7 = abs(p0..p3 - p1..p4) - pcmpgtw xmm7, xmm2 - - movdqa xmm5, xmm4 - psrldq xmm5, 4 - punpcklbw xmm5, xmm0 ; mm5 = p2..p5 - paddusw xmm3, xmm5 ; mm3 += mm5 - - ; thresholding - movdqa xmm6, xmm1 ; mm6 = p0..p3 - psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4 - psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 - paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; accumulate thresholds - - - movdqa xmm5, xmm4 ; mm5 = p-2..p5 - punpcklbw xmm5, xmm0 ; mm5 = p-2..p1 - paddusw xmm3, xmm5 ; mm3 += mm5 - - ; thresholding - movdqa xmm6, xmm1 ; mm6 = p0..p3 - psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4 - psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 - paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; accumulate thresholds - - psrldq xmm4, 1 ; mm4 = p-1..p5 - punpcklbw xmm4, xmm0 ; mm4 = p-1..p2 - paddusw xmm3, xmm4 ; mm3 += mm5 - - ; thresholding - movdqa xmm6, xmm1 ; mm6 = p0..p3 - psubusw xmm6, xmm4 ; mm6 = p0..p3 - p1..p4 - psubusw xmm4, xmm1 ; mm5 = p1..p4 - p0..p3 - paddusw xmm6, xmm4 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; accumulate thresholds - - paddusw xmm3, RD42 ; mm3 += round value - psraw xmm3, 3 ; mm3 /= 8 - - pand xmm1, xmm7 ; mm1 select vals > thresh from source - pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result - paddusw xmm1, xmm7 ; combination - - packuswb xmm1, xmm0 ; pack to bytes - movq QWORD PTR [rdi+rdx-8], mm0 ; store previous four bytes - movdq2q mm0, xmm1 - - add rdx, 8 - cmp edx, dword arg(5) ;cols + movdqu xmm0, XMMWORD PTR [rdi + rdx] + movdqu xmm1, XMMWORD PTR [rdi + rdx -2] + movdqu xmm3, XMMWORD PTR [rdi + rdx -1] + + FIRST_2_ROWS + + movdqu xmm1, XMMWORD PTR [rdi + rdx +1] + movdqu xmm3, XMMWORD PTR [rdi + rdx +2] + + SECOND_2_ROWS + + movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes + movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes + movdq2q mm0, xmm0 + psrldq xmm0, 8 + movdq2q mm1, xmm0 
+ + UPDATE_FLIMIT + + add rdx, 16 + cmp edx, dword arg(4) ;cols jl .acrossnextcol; - ; last 8 pixels - movq QWORD PTR [rdi+rdx-8], mm0 + ; last 16 pixels + movq QWORD PTR [rdi+rdx-16], mm0 + cmp edx, dword arg(4) + jne .throw_last_8 + movq QWORD PTR [rdi+rdx-8], mm1 +.throw_last_8: ; done with this rwo - add rsi,rax ; next line - mov eax, dword arg(3) ;dst_pixels_per_line ; destination pitch? - add rdi,rax ; next destination - mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch? + add rsi,rax ;next src line + mov eax, dword arg(3) ;dst_pixels_per_line + add rdi,rax ;next destination + mov eax, dword arg(2) ;src_pixels_per_line - dec rcx ; decrement count - jnz .nextrow ; next row + mov rbx, arg(5) ;flimits + UPDATE_FLIMIT -%if ABI_IS_32BIT=1 && CONFIG_PIC=1 - add rsp,16 + dec rcx ;decrement count + jnz .nextrow ;next row + + add rsp, 16 pop rsp -%endif ; begin epilog pop rdi pop rsi - RESTORE_GOT + pop rbx RESTORE_XMM UNSHADOW_ARGS pop rbp ret -%undef RD42 - +%undef flimit ;void vp8_mbpost_proc_down_xmm(unsigned char *dst, ; int pitch, int rows, int cols,int flimit) @@ -753,7 +715,5 @@ sym(vp8_plane_add_noise_wmt): SECTION_RODATA align 16 -rd42: - times 8 dw 0x04 four8s: times 4 dd 8 diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index b4e02e22c..c70e42c6f 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -5301,7 +5301,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l double frame_psnr2, frame_ssim2 = 0; double weight = 0; - vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0); + vp8_deblock(cm, cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0); vp8_clear_system_state(); ye = calc_plane_error(orig->y_buffer, orig->y_stride, -- cgit v1.2.3