summaryrefslogtreecommitdiff
path: root/vp8
diff options
context:
space:
mode:
authorYunqing Wang <yunqingwang@google.com>2012-09-28 10:13:07 -0700
committerYunqing Wang <yunqingwang@google.com>2012-10-08 12:06:44 -0700
commit4c53bacce4a97d98a4e73262bb3517d38ddd3514 (patch)
tree15f52f2961d64c89a246862bf59f149eee68e7dd /vp8
parent9704cdec9fb777833599312fc84e3f1311d25eed (diff)
downloadlibvpx-4c53bacce4a97d98a4e73262bb3517d38ddd3514.tar
libvpx-4c53bacce4a97d98a4e73262bb3517d38ddd3514.tar.gz
libvpx-4c53bacce4a97d98a4e73262bb3517d38ddd3514.tar.bz2
libvpx-4c53bacce4a97d98a4e73262bb3517d38ddd3514.zip
post-proc: deblock filter optimization
1. Algorithm modification: Instead of having same filter threshold for a whole frame, now we allow the thresholds to be adjusted for each macroblock. In current implementation, to avoid excessive blur on background as reported in issue480(http://code.google.com/p/webm/issues/detail?id=480), we reduce the thresholds for skipped macroblocks. 2. SSE2 optimization: As started in issue479(http://code.google.com/p/webm/issues/detail?id=479), the filter calculation was adjusted for better performance. The c code was also modified accordingly. This made the deblock filter 2x faster, and the decoder was 1.2x faster overall. Next, the demacroblock filter will be modified similarly. Change-Id: I05e54c3f580ccd427487d085096b3174f2ab7e86
Diffstat (limited to 'vp8')
-rw-r--r--vp8/common/postproc.c209
-rw-r--r--vp8/common/postproc.h3
-rw-r--r--vp8/common/ppc/systemdependent.c14
-rw-r--r--vp8/common/rtcd_defs.sh5
-rw-r--r--vp8/common/x86/postproc_mmx.asm265
-rw-r--r--vp8/common/x86/postproc_sse2.asm334
-rw-r--r--vp8/encoder/onyx_if.c2
7 files changed, 282 insertions, 550 deletions
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 444898784..752292eff 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -127,25 +127,24 @@ extern void vp8_blit_text(const char *msg, unsigned char *address, const int pit
extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch);
/***********************************************************************************************************
*/
-void vp8_post_proc_down_and_across_c
+void vp8_post_proc_down_and_across_mb_row_c
(
unsigned char *src_ptr,
unsigned char *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
- int rows,
int cols,
- int flimit
+ unsigned char *f,
+ int size
)
{
unsigned char *p_src, *p_dst;
int row;
int col;
- int i;
- int v;
- unsigned char d[8];
+ unsigned char v;
+ unsigned char d[4];
- for (row = 0; row < rows; row++)
+ for (row = 0; row < size; row++)
{
/* post_proc_down for one row */
p_src = src_ptr;
@@ -153,20 +152,23 @@ void vp8_post_proc_down_and_across_c
for (col = 0; col < cols; col++)
{
+ unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
+ unsigned char p_above1 = p_src[col - src_pixels_per_line];
+ unsigned char p_below1 = p_src[col + src_pixels_per_line];
+ unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
- int kernel = 4;
- int v = p_src[col];
+ v = p_src[col];
- for (i = -2; i <= 2; i++)
+ if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col])
+ && (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col]))
{
- if (abs(v - p_src[col+i*src_pixels_per_line]) > flimit)
- goto down_skip_convolve;
-
- kernel += kernel5[2+i] * p_src[col+i*src_pixels_per_line];
+ unsigned char k1, k2, k3;
+ k1 = (p_above2 + p_above1 + 1) >> 1;
+ k2 = (p_below2 + p_below1 + 1) >> 1;
+ k3 = (k1 + k2 + 1) >> 1;
+ v = (k3 + v + 1) >> 1;
}
- v = (kernel >> 3);
- down_skip_convolve:
p_dst[col] = v;
}
@@ -174,40 +176,34 @@ void vp8_post_proc_down_and_across_c
p_src = dst_ptr;
p_dst = dst_ptr;
- for (i = -8; i<0; i++)
- p_src[i]=p_src[0];
-
- for (i = cols; i<cols+8; i++)
- p_src[i]=p_src[cols-1];
-
- for (i = 0; i < 8; i++)
- d[i] = p_src[i];
+ p_src[-2] = p_src[-1] = p_src[0];
+ p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
for (col = 0; col < cols; col++)
{
- int kernel = 4;
v = p_src[col];
- d[col&7] = v;
-
- for (i = -2; i <= 2; i++)
+ if ((abs(v - p_src[col - 2]) < f[col])
+ && (abs(v - p_src[col - 1]) < f[col])
+ && (abs(v - p_src[col + 1]) < f[col])
+ && (abs(v - p_src[col + 2]) < f[col]))
{
- if (abs(v - p_src[col+i]) > flimit)
- goto across_skip_convolve;
-
- kernel += kernel5[2+i] * p_src[col+i];
+ unsigned char k1, k2, k3;
+ k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
+ k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
+ k3 = (k1 + k2 + 1) >> 1;
+ v = (k3 + v + 1) >> 1;
}
- d[col&7] = (kernel >> 3);
- across_skip_convolve:
+ d[col & 3] = v;
if (col >= 2)
- p_dst[col-2] = d[(col-2)&7];
+ p_dst[col - 2] = d[(col - 2) & 3];
}
/* handle the last two pixels */
- p_dst[col-2] = d[(col-2)&7];
- p_dst[col-1] = d[(col-1)&7];
+ p_dst[col - 2] = d[(col - 2) & 3];
+ p_dst[col - 1] = d[(col - 1) & 3];
/* next row */
src_ptr += src_pixels_per_line;
@@ -318,28 +314,17 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, i
}
}
-
-static void vp8_deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
- YV12_BUFFER_CONFIG *post,
- int q,
- int low_var_thresh,
- int flag)
+static void vp8_de_mblock(YV12_BUFFER_CONFIG *post,
+ int q)
{
- double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
- int ppl = (int)(level + .5);
- (void) low_var_thresh;
- (void) flag;
-
- vp8_post_proc_down_and_across(source->y_buffer, post->y_buffer, source->y_stride, post->y_stride, source->y_height, source->y_width, ppl);
- vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
- vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
-
- vp8_post_proc_down_and_across(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
- vp8_post_proc_down_and_across(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
-
+ vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
+ post->y_width, q2mbl(q));
+ vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
+ post->y_width, q2mbl(q));
}
-void vp8_deblock(YV12_BUFFER_CONFIG *source,
+void vp8_deblock(VP8_COMMON *cm,
+ YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *post,
int q,
int low_var_thresh,
@@ -347,12 +332,58 @@ void vp8_deblock(YV12_BUFFER_CONFIG *source,
{
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
int ppl = (int)(level + .5);
+
+ const MODE_INFO *mode_info_context = cm->mi;
+ int mbr, mbc;
+
+ /* The pixel thresholds are adjusted according to if or not the macroblock
+ * is a skipped block. */
+ unsigned char *ylimits = (unsigned char *)vpx_memalign(16, 16 * cm->mb_cols);
+ unsigned char *uvlimits = (unsigned char *)vpx_memalign(16, 8 * cm->mb_cols);
(void) low_var_thresh;
(void) flag;
- vp8_post_proc_down_and_across(source->y_buffer, post->y_buffer, source->y_stride, post->y_stride, source->y_height, source->y_width, ppl);
- vp8_post_proc_down_and_across(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
- vp8_post_proc_down_and_across(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
+ if (ppl > 0)
+ {
+ for (mbr = 0; mbr < cm->mb_rows; mbr++)
+ {
+ unsigned char *ylptr = ylimits;
+ unsigned char *uvlptr = uvlimits;
+ for (mbc = 0; mbc < cm->mb_cols; mbc++)
+ {
+ unsigned char mb_ppl;
+
+ if (mode_info_context->mbmi.mb_skip_coeff)
+ mb_ppl = (unsigned char)ppl >> 1;
+ else
+ mb_ppl = (unsigned char)ppl;
+
+ vpx_memset(ylptr, mb_ppl, 16);
+ vpx_memset(uvlptr, mb_ppl, 8);
+
+ ylptr += 16;
+ uvlptr += 8;
+ mode_info_context++;
+ }
+ mode_info_context++;
+
+ vp8_post_proc_down_and_across_mb_row(
+ source->y_buffer + 16 * mbr * source->y_stride,
+ post->y_buffer + 16 * mbr * post->y_stride, source->y_stride,
+ post->y_stride, source->y_width, ylimits, 16);
+
+ vp8_post_proc_down_and_across_mb_row(
+ source->u_buffer + 8 * mbr * source->uv_stride,
+ post->u_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
+ post->uv_stride, source->uv_width, uvlimits, 8);
+ vp8_post_proc_down_and_across_mb_row(
+ source->v_buffer + 8 * mbr * source->uv_stride,
+ post->v_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
+ post->uv_stride, source->uv_width, uvlimits, 8);
+ }
+ }
+ vpx_free(ylimits);
+ vpx_free(uvlimits);
}
#if !(CONFIG_TEMPORAL_DENOISING)
@@ -364,33 +395,35 @@ void vp8_de_noise(YV12_BUFFER_CONFIG *source,
{
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
int ppl = (int)(level + .5);
+ int mb_rows = source->y_width >> 4;
+ int mb_cols = source->y_height >> 4;
+ unsigned char *limits = (unsigned char *)vpx_memalign(16, 16 * mb_cols);
+ int mbr, mbc;
(void) post;
(void) low_var_thresh;
(void) flag;
- vp8_post_proc_down_and_across(
- source->y_buffer + 2 * source->y_stride + 2,
- source->y_buffer + 2 * source->y_stride + 2,
- source->y_stride,
- source->y_stride,
- source->y_height - 4,
- source->y_width - 4,
- ppl);
- vp8_post_proc_down_and_across(
- source->u_buffer + 2 * source->uv_stride + 2,
- source->u_buffer + 2 * source->uv_stride + 2,
- source->uv_stride,
- source->uv_stride,
- source->uv_height - 4,
- source->uv_width - 4, ppl);
- vp8_post_proc_down_and_across(
- source->v_buffer + 2 * source->uv_stride + 2,
- source->v_buffer + 2 * source->uv_stride + 2,
- source->uv_stride,
- source->uv_stride,
- source->uv_height - 4,
- source->uv_width - 4, ppl);
+ /* TODO: The original code don't filter the 2 outer rows and columns. */
+ vpx_memset(limits, (unsigned char)ppl, 16 * mb_cols);
+ for (mbr = 0; mbr < mb_rows; mbr++)
+ {
+ vp8_post_proc_down_and_across_mb_row(
+ source->y_buffer + 16 * mbr * source->y_stride,
+ source->y_buffer + 16 * mbr * source->y_stride,
+ source->y_stride, source->y_stride, source->y_width, limits, 16);
+
+ vp8_post_proc_down_and_across_mb_row(
+ source->u_buffer + 8 * mbr * source->uv_stride,
+ source->u_buffer + 8 * mbr * source->uv_stride,
+ source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
+ vp8_post_proc_down_and_across_mb_row(
+ source->v_buffer + 8 * mbr * source->uv_stride,
+ source->v_buffer + 8 * mbr * source->uv_stride,
+ source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
+ }
+
+ vpx_free(limits);
}
#endif
@@ -752,12 +785,14 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
vp8_yv12_copy_frame(&oci->post_proc_buffer, &oci->post_proc_buffer_int);
if (flags & VP8D_DEMACROBLOCK)
{
- vp8_deblock_and_de_macro_block(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
+ vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
q + (deblock_level - 5) * 10, 1, 0);
+ vp8_de_mblock(&oci->post_proc_buffer,
+ q + (deblock_level - 5) * 10);
}
else if (flags & VP8D_DEBLOCK)
{
- vp8_deblock(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
+ vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
q, 1, 0);
}
}
@@ -766,13 +801,15 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
}
else if (flags & VP8D_DEMACROBLOCK)
{
- vp8_deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
- q + (deblock_level - 5) * 10, 1, 0);
+ vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
+ q + (deblock_level - 5) * 10, 1, 0);
+ vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10);
+
oci->postproc_state.last_base_qindex = oci->base_qindex;
}
else if (flags & VP8D_DEBLOCK)
{
- vp8_deblock(oci->frame_to_show, &oci->post_proc_buffer,
+ vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
q, 1, 0);
oci->postproc_state.last_base_qindex = oci->base_qindex;
}
diff --git a/vp8/common/postproc.h b/vp8/common/postproc.h
index 6ac788cbd..a156398d2 100644
--- a/vp8/common/postproc.h
+++ b/vp8/common/postproc.h
@@ -36,7 +36,8 @@ void vp8_de_noise(YV12_BUFFER_CONFIG *source,
int low_var_thresh,
int flag);
-void vp8_deblock(YV12_BUFFER_CONFIG *source,
+void vp8_deblock(struct VP8Common *oci,
+ YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *post,
int q,
int low_var_thresh,
diff --git a/vp8/common/ppc/systemdependent.c b/vp8/common/ppc/systemdependent.c
index 7046a63e8..87f4cac72 100644
--- a/vp8/common/ppc/systemdependent.c
+++ b/vp8/common/ppc/systemdependent.c
@@ -19,14 +19,14 @@ void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
-extern void (*vp8_post_proc_down_and_across)(
+extern void (*vp8_post_proc_down_and_across_mb_row)(
unsigned char *src_ptr,
unsigned char *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
- int rows,
int cols,
- int flimit
+ unsigned char *f,
+ int size
);
extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
@@ -34,15 +34,15 @@ extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int
extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
-extern void vp8_post_proc_down_and_across_c
+extern void vp8_post_proc_down_and_across_mb_row_c
(
unsigned char *src_ptr,
unsigned char *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
- int rows,
int cols,
- int flimit
+ unsigned char *f,
+ int size
);
void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
@@ -158,7 +158,7 @@ void vp8_machine_specific_config(void)
vp8_lf_mbhsimple = loop_filter_mbhs_ppc;
vp8_lf_bhsimple = loop_filter_bhs_ppc;
- vp8_post_proc_down_and_across = vp8_post_proc_down_and_across_c;
+ vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
vp8_plane_add_noise = vp8_plane_add_noise_c;
diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh
index f0bdf29be..0f950f8ab 100644
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -162,9 +162,8 @@ if [ "$CONFIG_POSTPROC" = "yes" ]; then
specialize vp8_mbpost_proc_across_ip sse2
vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm
- prototype void vp8_post_proc_down_and_across "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int rows, int cols, int flimit"
- specialize vp8_post_proc_down_and_across mmx sse2
- vp8_post_proc_down_and_across_sse2=vp8_post_proc_down_and_across_xmm
+ prototype void vp8_post_proc_down_and_across_mb_row "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"
+ specialize vp8_post_proc_down_and_across_mb_row sse2
prototype void vp8_plane_add_noise "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch"
specialize vp8_plane_add_noise mmx sse2
diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm
index 534f2967a..966c586e4 100644
--- a/vp8/common/x86/postproc_mmx.asm
+++ b/vp8/common/x86/postproc_mmx.asm
@@ -14,271 +14,6 @@
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT 7
-;void vp8_post_proc_down_and_across_mmx
-;(
-; unsigned char *src_ptr,
-; unsigned char *dst_ptr,
-; int src_pixels_per_line,
-; int dst_pixels_per_line,
-; int rows,
-; int cols,
-; int flimit
-;)
-global sym(vp8_post_proc_down_and_across_mmx) PRIVATE
-sym(vp8_post_proc_down_and_across_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- ; move the global rd onto the stack, since we don't have enough registers
- ; to do PIC addressing
- movq mm0, [GLOBAL(rd)]
- sub rsp, 8
- movq [rsp], mm0
-%define RD [rsp]
-%else
-%define RD [GLOBAL(rd)]
-%endif
-
- push rbx
- lea rbx, [GLOBAL(Blur)]
- movd mm2, dword ptr arg(6) ;flimit
- punpcklwd mm2, mm2
- punpckldq mm2, mm2
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;dst_ptr
-
- movsxd rcx, DWORD PTR arg(4) ;rows
- movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
- pxor mm0, mm0 ; mm0 = 00000000
-
-.nextrow:
-
- xor rdx, rdx ; clear out rdx for use as loop counter
-.nextcol:
-
- pxor mm7, mm7 ; mm7 = 00000000
- movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps
- movq mm3, [rsi] ; mm4 = r0 p0..p7
- punpcklbw mm3, mm0 ; mm3 = p0..p3
- movq mm1, mm3 ; mm1 = p0..p3
- pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
-
- movq mm6, [rbx + 48] ; mm6 = kernel 3 taps
- movq mm5, [rsi + rax] ; mm4 = r1 p0..p7
- punpcklbw mm5, mm0 ; mm5 = r1 p0..p3
- pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers
- paddusw mm3, mm6 ; mm3 += mm6
-
- ; thresholding
- movq mm7, mm1 ; mm7 = r0 p0..p3
- psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3
- psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3
- paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
- pcmpgtw mm7, mm2
-
- movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers
- movq mm5, [rsi + 2*rax] ; mm4 = r2 p0..p7
- punpcklbw mm5, mm0 ; mm5 = r2 p0..p3
- pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = r0 p0..p3
- psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3
- psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r2 p0..p3
- paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
-
- neg rax
- movq mm6, [rbx ] ; kernel 0 taps
- movq mm5, [rsi+2*rax] ; mm4 = r-2 p0..p7
- punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3
- pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = r0 p0..p3
- psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3
- psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3
- paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
- movq mm6, [rbx + 16] ; kernel 1 taps
- movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7
- punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3
- pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers.
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = r0 p0..p3
- psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3
- psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3
- paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
-
- paddusw mm3, RD ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
-
- pand mm1, mm7 ; mm1 select vals > thresh from source
- pandn mm7, mm3 ; mm7 select vals < thresh from blurred result
- paddusw mm1, mm7 ; combination
-
- packuswb mm1, mm0 ; pack to bytes
-
- movd [rdi], mm1 ;
- neg rax ; pitch is positive
-
-
- add rsi, 4
- add rdi, 4
- add rdx, 4
-
- cmp edx, dword ptr arg(5) ;cols
- jl .nextcol
- ; done with the all cols, start the across filtering in place
- sub rsi, rdx
- sub rdi, rdx
-
- ; dup the first byte into the left border 8 times
- movq mm1, [rdi]
- punpcklbw mm1, mm1
- punpcklwd mm1, mm1
- punpckldq mm1, mm1
-
- mov rdx, -8
- movq [rdi+rdx], mm1
-
- ; dup the last byte into the right border
- movsxd rdx, dword arg(5)
- movq mm1, [rdi + rdx + -1]
- punpcklbw mm1, mm1
- punpcklwd mm1, mm1
- punpckldq mm1, mm1
- movq [rdi+rdx], mm1
-
-
- push rax
- xor rdx, rdx
- mov rax, [rdi-4];
-
-.acrossnextcol:
- pxor mm7, mm7 ; mm7 = 00000000
- movq mm6, [rbx + 32 ] ;
- movq mm4, [rdi+rdx] ; mm4 = p0..p7
- movq mm3, mm4 ; mm3 = p0..p7
- punpcklbw mm3, mm0 ; mm3 = p0..p3
- movq mm1, mm3 ; mm1 = p0..p3
- pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
-
- movq mm6, [rbx + 48]
- psrlq mm4, 8 ; mm4 = p1..p7
- movq mm5, mm4 ; mm5 = p1..p7
- punpcklbw mm5, mm0 ; mm5 = p1..p4
- pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers
- paddusw mm3, mm6 ; mm3 += mm6
-
- ; thresholding
- movq mm7, mm1 ; mm7 = p0..p3
- psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4
- psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
- paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4)
- pcmpgtw mm7, mm2
-
- movq mm6, [rbx + 64 ]
- psrlq mm4, 8 ; mm4 = p2..p7
- movq mm5, mm4 ; mm5 = p2..p7
- punpcklbw mm5, mm0 ; mm5 = p2..p5
- pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = p0..p3
- psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4
- psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
- paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
-
- movq mm6, [rbx ]
- movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5
- movq mm5, mm4 ; mm5 = p-2..p5
- punpcklbw mm5, mm0 ; mm5 = p-2..p1
- pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = p0..p3
- psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4
- psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
- paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
- movq mm6, [rbx + 16]
- psrlq mm4, 8 ; mm4 = p-1..p5
- punpcklbw mm4, mm0 ; mm4 = p-1..p2
- pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers.
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = p0..p3
- psubusw mm6, mm4 ; mm6 = p0..p3 - p1..p4
- psubusw mm4, mm1 ; mm5 = p1..p4 - p0..p3
- paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
- paddusw mm3, RD ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
-
- pand mm1, mm7 ; mm1 select vals > thresh from source
- pandn mm7, mm3 ; mm7 select vals < thresh from blurred result
- paddusw mm1, mm7 ; combination
-
- packuswb mm1, mm0 ; pack to bytes
- mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes
- movd eax, mm1
-
- add rdx, 4
- cmp edx, dword ptr arg(5) ;cols
- jl .acrossnextcol;
-
- mov DWORD PTR [rdi+rdx-4], eax
- pop rax
-
- ; done with this rwo
- add rsi,rax ; next line
- movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch?
- add rdi,rax ; next destination
- movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch?
-
- dec rcx ; decrement count
- jnz .nextrow ; next row
- pop rbx
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef RD
-
-
;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
; int pitch, int rows, int cols,int flimit)
extern sym(vp8_rv)
diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm
index bf36b0d7f..25c32e148 100644
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -11,146 +11,158 @@
%include "vpx_ports/x86_abi_support.asm"
-;void vp8_post_proc_down_and_across_xmm
+;macro in deblock functions
+%macro FIRST_2_ROWS 0
+ movdqa xmm4, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm5, xmm1
+ pavgb xmm5, xmm3
+
+ ;calculate absolute value
+ psubusb xmm4, xmm1
+ psubusb xmm1, xmm0
+ psubusb xmm6, xmm3
+ psubusb xmm3, xmm0
+ paddusb xmm4, xmm1
+ paddusb xmm6, xmm3
+
+ ;get threshold
+ movdqa xmm2, flimit
+ pxor xmm1, xmm1
+ movdqa xmm7, xmm2
+
+ ;get mask
+ psubusb xmm2, xmm4
+ psubusb xmm7, xmm6
+ pcmpeqb xmm2, xmm1
+ pcmpeqb xmm7, xmm1
+ por xmm7, xmm2
+%endmacro
+
+%macro SECOND_2_ROWS 0
+ movdqa xmm6, xmm0
+ movdqa xmm4, xmm0
+ movdqa xmm2, xmm1
+ pavgb xmm1, xmm3
+
+ ;calculate absolute value
+ psubusb xmm6, xmm2
+ psubusb xmm2, xmm0
+ psubusb xmm4, xmm3
+ psubusb xmm3, xmm0
+ paddusb xmm6, xmm2
+ paddusb xmm4, xmm3
+
+ pavgb xmm5, xmm1
+
+ ;get threshold
+ movdqa xmm2, flimit
+ pxor xmm1, xmm1
+ movdqa xmm3, xmm2
+
+ ;get mask
+ psubusb xmm2, xmm6
+ psubusb xmm3, xmm4
+ pcmpeqb xmm2, xmm1
+ pcmpeqb xmm3, xmm1
+
+ por xmm7, xmm2
+ por xmm7, xmm3
+
+ pavgb xmm5, xmm0
+
+ ;decide if or not to use filtered value
+ pand xmm0, xmm7
+ pandn xmm7, xmm5
+ paddusb xmm0, xmm7
+%endmacro
+
+%macro UPDATE_FLIMIT 0
+ movdqa xmm2, XMMWORD PTR [rbx]
+ movdqa [rsp], xmm2
+ add rbx, 16
+%endmacro
+
+;void vp8_post_proc_down_and_across_mb_row_sse2
;(
; unsigned char *src_ptr,
; unsigned char *dst_ptr,
; int src_pixels_per_line,
; int dst_pixels_per_line,
-; int rows,
; int cols,
-; int flimit
+; int *flimits,
+; int size
;)
-global sym(vp8_post_proc_down_and_across_xmm) PRIVATE
-sym(vp8_post_proc_down_and_across_xmm):
+global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
+sym(vp8_post_proc_down_and_across_mb_row_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
- GET_GOT rbx
+ push rbx
push rsi
push rdi
; end prolog
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
ALIGN_STACK 16, rax
- ; move the global rd onto the stack, since we don't have enough registers
- ; to do PIC addressing
- movdqa xmm0, [GLOBAL(rd42)]
sub rsp, 16
- movdqa [rsp], xmm0
-%define RD42 [rsp]
-%else
-%define RD42 [GLOBAL(rd42)]
-%endif
-
- movd xmm2, dword ptr arg(6) ;flimit
- punpcklwd xmm2, xmm2
- punpckldq xmm2, xmm2
- punpcklqdq xmm2, xmm2
+ ; put flimit on stack
+ mov rbx, arg(5) ;flimits ptr
+ UPDATE_FLIMIT
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;dst_ptr
+%define flimit [rsp]
- movsxd rcx, DWORD PTR arg(4) ;rows
- movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
- pxor xmm0, xmm0 ; mm0 = 00000000
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;dst_ptr
+ movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line
+ movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock
.nextrow:
-
- xor rdx, rdx ; clear out rdx for use as loop counter
+ xor rdx, rdx ;col
.nextcol:
- movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7
- punpcklbw xmm3, xmm0 ; mm3 = p0..p3
- movdqa xmm1, xmm3 ; mm1 = p0..p3
- psllw xmm3, 2 ;
-
- movq xmm5, QWORD PTR [rsi + rax] ; mm4 = r1 p0..p7
- punpcklbw xmm5, xmm0 ; mm5 = r1 p0..p3
- paddusw xmm3, xmm5 ; mm3 += mm6
-
- ; thresholding
- movdqa xmm7, xmm1 ; mm7 = r0 p0..p3
- psubusw xmm7, xmm5 ; mm7 = r0 p0..p3 - r1 p0..p3
- psubusw xmm5, xmm1 ; mm5 = r1 p0..p3 - r0 p0..p3
- paddusw xmm7, xmm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
- pcmpgtw xmm7, xmm2
-
- movq xmm5, QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7
- punpcklbw xmm5, xmm0 ; mm5 = r2 p0..p3
- paddusw xmm3, xmm5 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
- psubusw xmm6, xmm5 ; mm6 = r0 p0..p3 - r2 p0..p3
- psubusw xmm5, xmm1 ; mm5 = r2 p0..p3 - r2 p0..p3
- paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
+ ;load current and next 2 rows
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + rax]
+ movdqu xmm3, XMMWORD PTR [rsi + 2*rax]
+ FIRST_2_ROWS
+ ;load above 2 rows
neg rax
- movq xmm5, QWORD PTR [rsi+2*rax] ; mm4 = r-2 p0..p7
- punpcklbw xmm5, xmm0 ; mm5 = r-2 p0..p3
- paddusw xmm3, xmm5 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
- psubusw xmm6, xmm5 ; mm6 = p0..p3 - r-2 p0..p3
- psubusw xmm5, xmm1 ; mm5 = r-2 p0..p3 - p0..p3
- paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
- movq xmm4, QWORD PTR [rsi+rax] ; mm4 = r-1 p0..p7
- punpcklbw xmm4, xmm0 ; mm4 = r-1 p0..p3
- paddusw xmm3, xmm4 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
- psubusw xmm6, xmm4 ; mm6 = p0..p3 - r-2 p0..p3
- psubusw xmm4, xmm1 ; mm5 = r-1 p0..p3 - p0..p3
- paddusw xmm6, xmm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
-
- paddusw xmm3, RD42 ; mm3 += round value
- psraw xmm3, 3 ; mm3 /= 8
+ movdqu xmm1, XMMWORD PTR [rsi + 2*rax]
+ movdqu xmm3, XMMWORD PTR [rsi + rax]
- pand xmm1, xmm7 ; mm1 select vals > thresh from source
- pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result
- paddusw xmm1, xmm7 ; combination
+ SECOND_2_ROWS
- packuswb xmm1, xmm0 ; pack to bytes
- movq QWORD PTR [rdi], xmm1 ;
+ movdqu XMMWORD PTR [rdi], xmm0
- neg rax ; pitch is positive
- add rsi, 8
- add rdi, 8
+ neg rax ; positive stride
+ add rsi, 16
+ add rdi, 16
- add rdx, 8
- cmp edx, dword arg(5) ;cols
+ UPDATE_FLIMIT
+ add rdx, 16
+ cmp edx, dword arg(4) ;cols
jl .nextcol
; done with the all cols, start the across filtering in place
sub rsi, rdx
sub rdi, rdx
+ mov rbx, arg(5) ; flimits
+ UPDATE_FLIMIT
; dup the first byte into the left border 8 times
movq mm1, [rdi]
punpcklbw mm1, mm1
punpcklwd mm1, mm1
punpckldq mm1, mm1
-
mov rdx, -8
movq [rdi+rdx], mm1
; dup the last byte into the right border
- movsxd rdx, dword arg(5)
+ movsxd rdx, dword arg(4)
movq mm1, [rdi + rdx + -1]
punpcklbw mm1, mm1
punpcklwd mm1, mm1
@@ -158,113 +170,63 @@ sym(vp8_post_proc_down_and_across_xmm):
movq [rdi+rdx], mm1
xor rdx, rdx
- movq mm0, QWORD PTR [rdi-8];
+ movq mm0, QWORD PTR [rdi-16];
+ movq mm1, QWORD PTR [rdi-8];
.acrossnextcol:
- movq xmm7, QWORD PTR [rdi +rdx -2]
- movd xmm4, DWORD PTR [rdi +rdx +6]
-
- pslldq xmm4, 8
- por xmm4, xmm7
-
- movdqa xmm3, xmm4
- psrldq xmm3, 2
- punpcklbw xmm3, xmm0 ; mm3 = p0..p3
- movdqa xmm1, xmm3 ; mm1 = p0..p3
- psllw xmm3, 2
-
-
- movdqa xmm5, xmm4
- psrldq xmm5, 3
- punpcklbw xmm5, xmm0 ; mm5 = p1..p4
- paddusw xmm3, xmm5 ; mm3 += mm6
-
- ; thresholding
- movdqa xmm7, xmm1 ; mm7 = p0..p3
- psubusw xmm7, xmm5 ; mm7 = p0..p3 - p1..p4
- psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
- paddusw xmm7, xmm5 ; mm7 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm7, xmm2
-
- movdqa xmm5, xmm4
- psrldq xmm5, 4
- punpcklbw xmm5, xmm0 ; mm5 = p2..p5
- paddusw xmm3, xmm5 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = p0..p3
- psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4
- psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
- paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
-
- movdqa xmm5, xmm4 ; mm5 = p-2..p5
- punpcklbw xmm5, xmm0 ; mm5 = p-2..p1
- paddusw xmm3, xmm5 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = p0..p3
- psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4
- psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
- paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
- psrldq xmm4, 1 ; mm4 = p-1..p5
- punpcklbw xmm4, xmm0 ; mm4 = p-1..p2
- paddusw xmm3, xmm4 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = p0..p3
- psubusw xmm6, xmm4 ; mm6 = p0..p3 - p1..p4
- psubusw xmm4, xmm1 ; mm5 = p1..p4 - p0..p3
- paddusw xmm6, xmm4 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
- paddusw xmm3, RD42 ; mm3 += round value
- psraw xmm3, 3 ; mm3 /= 8
-
- pand xmm1, xmm7 ; mm1 select vals > thresh from source
- pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result
- paddusw xmm1, xmm7 ; combination
-
- packuswb xmm1, xmm0 ; pack to bytes
- movq QWORD PTR [rdi+rdx-8], mm0 ; store previous four bytes
- movdq2q mm0, xmm1
-
- add rdx, 8
- cmp edx, dword arg(5) ;cols
+ movdqu xmm0, XMMWORD PTR [rdi + rdx]
+ movdqu xmm1, XMMWORD PTR [rdi + rdx -2]
+ movdqu xmm3, XMMWORD PTR [rdi + rdx -1]
+
+ FIRST_2_ROWS
+
+ movdqu xmm1, XMMWORD PTR [rdi + rdx +1]
+ movdqu xmm3, XMMWORD PTR [rdi + rdx +2]
+
+ SECOND_2_ROWS
+
+ movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes
+ movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes
+ movdq2q mm0, xmm0
+ psrldq xmm0, 8
+ movdq2q mm1, xmm0
+
+ UPDATE_FLIMIT
+
+ add rdx, 16
+ cmp edx, dword arg(4) ;cols
jl .acrossnextcol;
- ; last 8 pixels
- movq QWORD PTR [rdi+rdx-8], mm0
+ ; last 16 pixels
+ movq QWORD PTR [rdi+rdx-16], mm0
+ cmp edx, dword arg(4)
+ jne .throw_last_8
+ movq QWORD PTR [rdi+rdx-8], mm1
+.throw_last_8:
; done with this rwo
- add rsi,rax ; next line
- mov eax, dword arg(3) ;dst_pixels_per_line ; destination pitch?
- add rdi,rax ; next destination
- mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch?
+ add rsi,rax ;next src line
+ mov eax, dword arg(3) ;dst_pixels_per_line
+ add rdi,rax ;next destination
+ mov eax, dword arg(2) ;src_pixels_per_line
- dec rcx ; decrement count
- jnz .nextrow ; next row
+ mov rbx, arg(5) ;flimits
+ UPDATE_FLIMIT
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- add rsp,16
+ dec rcx ;decrement count
+ jnz .nextrow ;next row
+
+ add rsp, 16
pop rsp
-%endif
; begin epilog
pop rdi
pop rsi
- RESTORE_GOT
+ pop rbx
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
-%undef RD42
-
+%undef flimit
;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
; int pitch, int rows, int cols,int flimit)
@@ -753,7 +715,5 @@ sym(vp8_plane_add_noise_wmt):
SECTION_RODATA
align 16
-rd42:
- times 8 dw 0x04
four8s:
times 4 dd 8
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index b4e02e22c..c70e42c6f 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -5301,7 +5301,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
double frame_psnr2, frame_ssim2 = 0;
double weight = 0;
- vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0);
+ vp8_deblock(cm, cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0);
vp8_clear_system_state();
ye = calc_plane_error(orig->y_buffer, orig->y_stride,