diff options
-rwxr-xr-x | configure | 9 | ||||
-rw-r--r-- | vp8/common/onyx.h | 2 | ||||
-rw-r--r-- | vp8/common/onyxd.h | 2 | ||||
-rw-r--r-- | vp8/common/postproc.c | 99 | ||||
-rw-r--r-- | vp8/common/postproc.h | 2 | ||||
-rw-r--r-- | vp8/common/ppflags.h | 19 | ||||
-rw-r--r-- | vp8/decoder/onyxd_if.c | 4 | ||||
-rw-r--r-- | vp8/encoder/encodeframe.c | 47 | ||||
-rw-r--r-- | vp8/encoder/encodemb.c | 4 | ||||
-rw-r--r-- | vp8/encoder/firstpass.c | 33 | ||||
-rw-r--r-- | vp8/encoder/onyx_if.c | 7 | ||||
-rw-r--r-- | vp8/encoder/onyx_int.h | 3 | ||||
-rw-r--r-- | vp8/encoder/quantize.c | 75 | ||||
-rw-r--r-- | vp8/encoder/x86/dct_mmx.asm | 590 | ||||
-rw-r--r-- | vp8/encoder/x86/dct_sse2.asm | 289 | ||||
-rw-r--r-- | vp8/encoder/x86/dct_x86.h | 12 | ||||
-rw-r--r-- | vp8/encoder/x86/x86_csystemdependent.c | 20 | ||||
-rw-r--r-- | vp8/vp8_cx_iface.c | 10 | ||||
-rw-r--r-- | vp8/vp8_dx_iface.c | 67 | ||||
-rw-r--r-- | vpx/vp8.h | 34 | ||||
-rw-r--r-- | vpxdec.c | 77 | ||||
-rw-r--r-- | vpxenc.c | 5 |
22 files changed, 754 insertions, 656 deletions
@@ -41,6 +41,7 @@ Advanced options: ${toggle_shared} shared library support ${toggle_small} favor smaller size over speed ${toggle_arm_asm_detok} assembly version of the detokenizer (ARM platforms only) + ${toggle_postproc_visualizer} macro block / block level visualizers Codecs: Codecs can be selectively enabled or disabled individually, or by family: @@ -250,6 +251,7 @@ CONFIG_LIST=" shared small arm_asm_detok + postproc_visualizer " CMDLINE_SELECT=" extra_warnings @@ -289,6 +291,7 @@ CMDLINE_SELECT=" shared small arm_asm_detok + postproc_visualizer " process_cmdline() { @@ -325,8 +328,6 @@ post_process_cmdline() { for c in ${CODECS}; do enabled ${c} && enable ${c##*_}s done - - } @@ -536,6 +537,10 @@ process_toolchain() { # Other toolchain specific defaults case $toolchain in x86*|ppc*|universal*) soft_enable postproc;; esac + + if enabled postproc_visualizer; then + enabled postproc || die "postproc_visualizer requires postproc to be enabled" + fi } diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index a006306db..3c199d1c2 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -204,7 +204,7 @@ extern "C" // and not just a copy of the pointer.. 
int vp8_receive_raw_frame(VP8_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time_stamp); int vp8_get_compressed_data(VP8_PTR comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush); - int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags); + int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags); int vp8_use_as_reference(VP8_PTR comp, int ref_frame_flags); int vp8_update_reference(VP8_PTR comp, int ref_frame_flags); diff --git a/vp8/common/onyxd.h b/vp8/common/onyxd.h index 00a97d97d..e53bc3138 100644 --- a/vp8/common/onyxd.h +++ b/vp8/common/onyxd.h @@ -51,7 +51,7 @@ extern "C" int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst); int vp8dx_receive_compressed_data(VP8D_PTR comp, unsigned long size, const unsigned char *dest, INT64 time_stamp); - int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level, int noise_level, int flags); + int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags); int vp8dx_get_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); int vp8dx_set_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c index e797e1036..15b1c2c89 100644 --- a/vp8/common/postproc.c +++ b/vp8/common/postproc.c @@ -26,7 +26,7 @@ ( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128) /* global constants */ - +#if CONFIG_POSTPROC_VISUALIZER static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = { { RGB_TO_YUV(0x98FB98) }, /* PaleGreen */ @@ -59,13 +59,14 @@ static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] = { RGB_TO_YUV(0xccff33) }, /* Yellow */ }; 
-static const unsigned char MV_REFERENCE_FRAME_colors[MB_MODE_COUNT][3] = +static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = { { RGB_TO_YUV(0x00ff00) }, /* Blue */ { RGB_TO_YUV(0x0000ff) }, /* Green */ { RGB_TO_YUV(0xffff00) }, /* Yellow */ { RGB_TO_YUV(0xff0000) }, /* Red */ }; +#endif static const short kernel5[] = { @@ -677,10 +678,13 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei #define RTCD_VTABLE(oci) NULL #endif -int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags) +int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags) { char message[512]; int q = oci->filter_level * 10 / 6; + int flags = ppflags->post_proc_flag; + int deblock_level = ppflags->deblocking_level; + int noise_level = ppflags->noise_level; if (!oci->frame_to_show) return -1; @@ -737,7 +741,8 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l oci->post_proc_buffer.y_stride); } - if (flags & VP8D_DEBUG_LEVEL1) +#if CONFIG_POSTPROC_VISUALIZER + if (flags & VP8D_DEBUG_TXT_FRAME_INFO) { sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d", (oci->frame_type == KEY_FRAME), @@ -749,7 +754,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride); } - if (flags & VP8D_DEBUG_LEVEL2) + if (flags & VP8D_DEBUG_TXT_MBLK_MODES) { int i, j; unsigned char *y_ptr; @@ -781,7 +786,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l } } - if (flags & VP8D_DEBUG_LEVEL3) + if (flags & VP8D_DEBUG_TXT_DC_DIFF) { int i, j; unsigned char *y_ptr; @@ -816,45 +821,14 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l } } - if (flags & VP8D_DEBUG_LEVEL4) + if (flags & VP8D_DEBUG_TXT_RATE_INFO) { sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, 
oci->framerate); vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride); -#if 0 - int i, j; - unsigned char *y_ptr; - YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; - int mb_rows = post->y_height >> 4; - int mb_cols = post->y_width >> 4; - int mb_index = 0; - MODE_INFO *mi = oci->mi; - - y_ptr = post->y_buffer + 4 * post->y_stride + 4; - - /* vp8_filter each macro block */ - for (i = 0; i < mb_rows; i++) - { - for (j = 0; j < mb_cols; j++) - { - char zz[4]; - - sprintf(zz, "%c", mi[mb_index].mbmi.dc_diff + '0'); - vp8_blit_text(zz, y_ptr, post->y_stride); - mb_index ++; - y_ptr += 16; - } - - mb_index ++; /* border */ - y_ptr += post->y_stride * 16 - post->y_width; - - } - -#endif - } /* Draw motion vectors */ - if (flags & VP8D_DEBUG_DRAW_MV) + if ((flags & VP8D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) { YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; int width = post->y_width; @@ -871,6 +845,12 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l { int x1, y1; + if (!(ppflags->display_mv_flag & (1<<mi->mbmi.mode))) + { + mi++; + continue; + } + if (mi->mbmi.mode == SPLITMV) { switch (mi->mbmi.partitioning) @@ -996,6 +976,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l else vp8_blit_line (lx0, x1, ly0, y1, y_buffer, y_stride); } + mi++; } mi++; @@ -1003,7 +984,8 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l } /* Color in block modes */ - if (flags & VP8D_DEBUG_CLR_BLK_MODES) + if ((flags & VP8D_DEBUG_CLR_BLK_MODES) + && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) { int y, x; YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; @@ -1021,7 +1003,8 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l { int Y = 0, U = 0, V = 0; - if (mi->mbmi.mode == B_PRED) + if (mi->mbmi.mode == B_PRED && + ((ppflags->display_mb_modes_flag & B_PRED) || ppflags->display_b_modes_flag)) { 
int by, bx; unsigned char *yl, *ul, *vl; @@ -1035,13 +1018,16 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l { for (bx = 0; bx < 16; bx += 4) { - Y = B_PREDICTION_MODE_colors[bmi->mode][0]; - U = B_PREDICTION_MODE_colors[bmi->mode][1]; - V = B_PREDICTION_MODE_colors[bmi->mode][2]; - - POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b) - (yl+bx, ul+(bx>>1), vl+(bx>>1), Y, U, V, 0xc000, y_stride); + if ((ppflags->display_b_modes_flag & (1<<mi->mbmi.mode)) + || (ppflags->display_mb_modes_flag & B_PRED)) + { + Y = B_PREDICTION_MODE_colors[bmi->mode][0]; + U = B_PREDICTION_MODE_colors[bmi->mode][1]; + V = B_PREDICTION_MODE_colors[bmi->mode][2]; + POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b) + (yl+bx, ul+(bx>>1), vl+(bx>>1), Y, U, V, 0xc000, y_stride); + } bmi++; } @@ -1050,7 +1036,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l vl += y_stride*1; } } - else + else if (ppflags->display_mb_modes_flag & (1<<mi->mbmi.mode)) { Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0]; U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1]; @@ -1059,6 +1045,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner) (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride); } + mi++; } y_ptr += y_stride*16; @@ -1070,7 +1057,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l } /* Color in frame reference blocks */ - if (flags & VP8D_DEBUG_CLR_FRM_REF_BLKS) + if ((flags & VP8D_DEBUG_CLR_FRM_REF_BLKS) && ppflags->display_ref_frame_flag) { int y, x; YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; @@ -1088,12 +1075,15 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l { int Y = 0, U = 0, V = 0; - Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0]; - U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1]; - V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2]; + if 
(ppflags->display_ref_frame_flag & (1<<mi->mbmi.ref_frame)) + { + Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0]; + U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1]; + V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2]; - POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer) - (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride); + POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer) + (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride); + } mi++; } @@ -1104,6 +1094,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l mi++; } } +#endif *dest = oci->post_proc_buffer; diff --git a/vp8/common/postproc.h b/vp8/common/postproc.h index 7485135bf..c641b9ca5 100644 --- a/vp8/common/postproc.h +++ b/vp8/common/postproc.h @@ -111,7 +111,7 @@ struct postproc_state #include "onyxc_int.h" #include "ppflags.h" int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest, - int deblock_level, int noise_level, int flags); + vp8_ppflags_t *flags); void vp8_de_noise(YV12_BUFFER_CONFIG *source, diff --git a/vp8/common/ppflags.h b/vp8/common/ppflags.h index b8d713cf0..65b0cab6a 100644 --- a/vp8/common/ppflags.h +++ b/vp8/common/ppflags.h @@ -17,13 +17,24 @@ enum VP8D_DEBLOCK = 1<<0, VP8D_DEMACROBLOCK = 1<<1, VP8D_ADDNOISE = 1<<2, - VP8D_DEBUG_LEVEL1 = 1<<3, - VP8D_DEBUG_LEVEL2 = 1<<4, - VP8D_DEBUG_LEVEL3 = 1<<5, - VP8D_DEBUG_LEVEL4 = 1<<6, + VP8D_DEBUG_TXT_FRAME_INFO = 1<<3, + VP8D_DEBUG_TXT_MBLK_MODES = 1<<4, + VP8D_DEBUG_TXT_DC_DIFF = 1<<5, + VP8D_DEBUG_TXT_RATE_INFO = 1<<6, VP8D_DEBUG_DRAW_MV = 1<<7, VP8D_DEBUG_CLR_BLK_MODES = 1<<8, VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9 }; +typedef struct +{ + int post_proc_flag; + int deblocking_level; + int noise_level; + int display_ref_frame_flag; + int display_mb_modes_flag; + int display_b_modes_flag; + int display_mv_flag; +} vp8_ppflags_t; + #endif diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c index 6eda45e4a..aa2709f5b 100644 --- a/vp8/decoder/onyxd_if.c 
+++ b/vp8/decoder/onyxd_if.c @@ -506,7 +506,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign pbi->common.error.setjmp = 0; return retcode; } -int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level, int noise_level, int flags) +int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags) { int ret = -1; VP8D_COMP *pbi = (VP8D_COMP *) ptr; @@ -524,7 +524,7 @@ int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, sd->clrtype = pbi->common.clr_type; #if CONFIG_POSTPROC - ret = vp8_post_proc_frame(&pbi->common, sd, deblock_level, noise_level, flags); + ret = vp8_post_proc_frame(&pbi->common, sd, flags); #else if (pbi->common.frame_to_show) diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index b67edd39f..2aac20b31 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -146,16 +146,25 @@ static const int qzbin_factors_y2[129] = #define EXACT_QUANT #ifdef EXACT_QUANT -static void vp8cx_invert_quant(short *quant, short *shift, short d) +static void vp8cx_invert_quant(int improved_quant, short *quant, + short *shift, short d) { - unsigned t; - int l; - t = d; - for(l = 0; t > 1; l++) - t>>=1; - t = 1 + (1<<(16+l))/d; - *quant = (short)(t - (1<<16)); - *shift = l; + if(improved_quant) + { + unsigned t; + int l; + t = d; + for(l = 0; t > 1; l++) + t>>=1; + t = 1 + (1<<(16+l))/d; + *quant = (short)(t - (1<<16)); + *shift = l; + } + else + { + *quant = (1 << 16) / d; + *shift = 0; + } } void vp8cx_init_quantizer(VP8_COMP *cpi) @@ -170,7 +179,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) { // dc values quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q); - vp8cx_invert_quant(cpi->Y1quant[Q] + 0, + vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 0, cpi->Y1quant_shift[Q] + 0, quant_val); cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * 
quant_val) + 64) >> 7; cpi->Y1round[Q][0] = (qrounding_factors[Q] * quant_val) >> 7; @@ -178,7 +187,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7; quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q); - vp8cx_invert_quant(cpi->Y2quant[Q] + 0, + vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 0, cpi->Y2quant_shift[Q] + 0, quant_val); cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7; cpi->Y2round[Q][0] = (qrounding_factors_y2[Q] * quant_val) >> 7; @@ -186,7 +195,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7; quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q); - vp8cx_invert_quant(cpi->UVquant[Q] + 0, + vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 0, cpi->UVquant_shift[Q] + 0, quant_val); cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;; cpi->UVround[Q][0] = (qrounding_factors[Q] * quant_val) >> 7; @@ -199,7 +208,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) int rc = vp8_default_zig_zag1d[i]; quant_val = vp8_ac_yquant(Q); - vp8cx_invert_quant(cpi->Y1quant[Q] + rc, + vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + rc, cpi->Y1quant_shift[Q] + rc, quant_val); cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7; @@ -207,7 +216,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7; quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q); - vp8cx_invert_quant(cpi->Y2quant[Q] + rc, + vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + rc, cpi->Y2quant_shift[Q] + rc, quant_val); cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7; cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7; @@ -215,7 +224,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 
7; quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q); - vp8cx_invert_quant(cpi->UVquant[Q] + rc, + vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + rc, cpi->UVquant_shift[Q] + rc, quant_val); cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7; @@ -405,14 +414,14 @@ void encode_mb_row(VP8_COMP *cpi, // Set up limit values for vertical motion vector components // to prevent them extending beyond the UMV borders x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16)); - x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16); // for each macroblock col in image for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { - // Distance of Mb to the left & right edges, specified in - // 1/8th pel units as they are always compared to values + // Distance of Mb to the left & right edges, specified in + // 1/8th pel units as they are always compared to values // that are in 1/8th pel units xd->mb_to_left_edge = -((mb_col * 16) << 3); xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3; @@ -420,7 +429,7 @@ void encode_mb_row(VP8_COMP *cpi, // Set up limit values for horizontal motion vector components // to prevent them extending beyond the UMV borders x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16)); - x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16); xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset; diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c index 043eac219..e9753ac48 100644 --- a/vp8/encoder/encodemb.c +++ b/vp8/encoder/encodemb.c @@ -309,8 +309,10 @@ void vp8_optimize_b(MACROBLOCK *mb, int ib, int type, eob = d->eob; /* Now set up a Viterbi trellis to evaluate alternative roundings. */ - /* TODO: These should vary with the block type, since the quantizer does. 
*/ rdmult = (mb->rdmult << 2)*err_mult; + if(mb->e_mbd.mode_info_context->mbmi.ref_frame==INTRA_FRAME) + rdmult = (rdmult * 9)>>4; + rddiv = mb->rddiv; best_mask[0] = best_mask[1] = 0; /* Initialize the sentinel node of the trellis. */ diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 8a94fa369..a7f5ce44c 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -1439,7 +1439,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) // Boost for arf frame Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100); - Boost += (cpi->baseline_gf_interval * 50); + Boost += (i * 50); allocation_chunks = (i * 100) + Boost; // Normalize Altboost and allocations chunck down to prevent overflow @@ -1738,16 +1738,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) vp8_avg_stats(&sectionstats); - if (sectionstats.pcnt_motion < .17) - cpi->section_is_low_motion = 1; - else - cpi->section_is_low_motion = 0; - - if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45) - cpi->section_is_fast_motion = 1; - else - cpi->section_is_fast_motion = 0; - cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error); Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error); @@ -1980,7 +1970,14 @@ void vp8_second_pass(VP8_COMP *cpi) cpi->ni_av_qi = cpi->worst_quality; } } - else + // The last few frames of a clip almost always have to few or too many + // bits and for the sake of over exact rate control we dont want to make + // radical adjustments to the allowed quantizer range just to use up a + // few surplus bits or get beneath the target rate. 
+ else if ( (cpi->common.current_video_frame < + (((unsigned int)cpi->total_stats->count * 255)>>8)) && + ((cpi->common.current_video_frame + cpi->baseline_gf_interval) < + (unsigned int)cpi->total_stats->count) ) { if (frames_left < 1) frames_left = 1; @@ -2344,17 +2341,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) vp8_avg_stats(&sectionstats); - if (sectionstats.pcnt_motion < .17) - cpi->section_is_low_motion = 1; - else - cpi->section_is_low_motion = 0; - - if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45) - cpi->section_is_fast_motion = 1; - else - cpi->section_is_fast_motion = 0; - - cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error); + cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error); Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error); // if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) ) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 93e13d139..00ecf97a6 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -563,6 +563,7 @@ void vp8_set_speed_features(VP8_COMP *cpi) int Speed = cpi->Speed; int i; VP8_COMMON *cm = &cpi->common; + int last_improved_quant = sf->improved_quant; // Initialise default mode frequency sampling variables for (i = 0; i < MAX_MODES; i ++) @@ -1262,6 +1263,8 @@ void vp8_set_speed_features(VP8_COMP *cpi) { cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb); } + if (cpi->sf.improved_quant != last_improved_quant) + vp8cx_init_quantizer(cpi); #if CONFIG_RUNTIME_CPU_DETECT cpi->mb.e_mbd.rtcd = &cpi->common.rtcd; @@ -5224,7 +5227,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon return 0; } -int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags) +int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, 
vp8_ppflags_t *flags) { VP8_COMP *cpi = (VP8_COMP *) comp; @@ -5234,7 +5237,7 @@ int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int debloc { int ret; #if CONFIG_POSTPROC - ret = vp8_post_proc_frame(&cpi->common, dest, deblock_level, noise_level, flags); + ret = vp8_post_proc_frame(&cpi->common, dest, flags); #else if (cpi->common.frame_to_show) diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index be5b00de8..a9eedf399 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -614,9 +614,6 @@ typedef struct unsigned int tempdata2; int base_skip_false_prob[128]; - unsigned int section_is_low_motion; - unsigned int section_benefits_from_aggresive_q; - unsigned int section_is_fast_motion; unsigned int section_intra_rating; double section_max_qfactor; diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c index c2c0351c0..a1be6614b 100644 --- a/vp8/encoder/quantize.c +++ b/vp8/encoder/quantize.c @@ -17,7 +17,8 @@ #include "predictdc.h" #define EXACT_QUANT -#ifdef EXACT_QUANT + +#ifdef EXACT_FASTQUANT void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) { int i, rc, eob; @@ -64,6 +65,45 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) d->eob = eob + 1; } +#else + +void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) +{ + int i, rc, eob; + int zbin; + int x, y, z, sz; + short *coeff_ptr = b->coeff; + short *round_ptr = b->round; + short *quant_ptr = b->quant; + short *qcoeff_ptr = d->qcoeff; + short *dqcoeff_ptr = d->dqcoeff; + short *dequant_ptr = d->dequant; + + eob = -1; + for (i = 0; i < 16; i++) + { + rc = vp8_default_zig_zag1d[i]; + z = coeff_ptr[rc]; + + sz = (z >> 31); // sign of z + x = (z ^ sz) - sz; // x = abs(z) + + y = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x) + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value + + if (y) + { + eob = i; // last nonzero coeffs + } + } + d->eob = eob + 1; +} + +#endif + 
+#ifdef EXACT_QUANT void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d) { int i, rc, eob; @@ -178,39 +218,6 @@ void vp8_strict_quantize_b(BLOCK *b, BLOCKD *d) } #else -void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) -{ - int i, rc, eob; - int zbin; - int x, y, z, sz; - short *coeff_ptr = b->coeff; - short *round_ptr = b->round; - short *quant_ptr = b->quant; - short *qcoeff_ptr = d->qcoeff; - short *dqcoeff_ptr = d->dqcoeff; - short *dequant_ptr = d->dequant; - - eob = -1; - for (i = 0; i < 16; i++) - { - rc = vp8_default_zig_zag1d[i]; - z = coeff_ptr[rc]; - - sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; // x = abs(z) - - y = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x) - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value - - if (y) - { - eob = i; // last nonzero coeffs - } - } - d->eob = eob + 1; -} void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d) { diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm index 5acaca875..f07b030bd 100644 --- a/vp8/encoder/x86/dct_mmx.asm +++ b/vp8/encoder/x86/dct_mmx.asm @@ -11,511 +11,231 @@ %include "vpx_ports/x86_abi_support.asm" -section .text - global sym(vp8_short_fdct4x4_mmx) - global sym(vp8_short_fdct8x4_wmt) - - -%define DCTCONSTANTSBITS (16) -%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1)) -%define x_c1 (60547) ; cos(pi /8) * (1<<15) -%define x_c2 (46341) ; cos(pi*2/8) * (1<<15) -%define x_c3 (25080) ; cos(pi*3/8) * (1<<15) - - ;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch) +global sym(vp8_short_fdct4x4_mmx) sym(vp8_short_fdct4x4_mmx): push rbp - mov rbp, rsp + mov rbp, rsp SHADOW_ARGS_TO_STACK 3 GET_GOT rbx - push rsi - push rdi + push rsi + push rdi ; end prolog - mov rsi, arg(0) ;input - mov rdi, arg(1) ;output - - lea rdx, [GLOBAL(dct_const_mmx)] - movsxd rax, dword ptr arg(2) ;pitch - - lea rcx, [rsi + rax*2] - ; read the input data - movq mm0, [rsi] - movq mm1, 
[rsi + rax ] - - movq mm2, [rcx] - movq mm3, [rcx + rax] - ; get the constants - ;shift to left by 1 for prescision - psllw mm0, 3 - psllw mm1, 3 - - psllw mm2, 3 - psllw mm3, 3 - - ; transpose for the second stage - movq mm4, mm0 ; 00 01 02 03 - movq mm5, mm2 ; 10 11 12 03 - - punpcklwd mm0, mm1 ; 00 10 01 11 - punpckhwd mm4, mm1 ; 02 12 03 13 - - punpcklwd mm2, mm3 ; 20 30 21 31 - punpckhwd mm5, mm3 ; 22 32 23 33 - - - movq mm1, mm0 ; 00 10 01 11 - punpckldq mm0, mm2 ; 00 10 20 30 - - punpckhdq mm1, mm2 ; 01 11 21 31 - - movq mm2, mm4 ; 02 12 03 13 - punpckldq mm2, mm5 ; 02 12 22 32 - - punpckhdq mm4, mm5 ; 03 13 23 33 - movq mm3, mm4 - - - ; first stage - movq mm5, mm0 - movq mm4, mm1 - - paddw mm0, mm3 ; a = 0 + 3 - paddw mm1, mm2 ; b = 1 + 2 - - psubw mm4, mm2 ; c = 1 - 2 - psubw mm5, mm3 ; d = 0 - 3 - - - ; output 0 and 2 - movq mm6, [rdx + 16] ; c2 - movq mm2, mm0 ; a - - paddw mm0, mm1 ; a + b - psubw mm2, mm1 ; a - b - - movq mm1, mm0 ; a + b - pmulhw mm0, mm6 ; 00 01 02 03 - - paddw mm0, mm1 ; output 00 01 02 03 - pmulhw mm6, mm2 ; 20 21 22 23 - - paddw mm2, mm6 ; output 20 21 22 23 - - ; output 1 and 3 - movq mm6, [rdx + 8] ; c1 - movq mm7, [rdx + 24] ; c3 - - movq mm1, mm4 ; c - movq mm3, mm5 ; d - - pmulhw mm1, mm7 ; c * c3 - pmulhw mm3, mm6 ; d * c1 - - paddw mm3, mm5 ; d * c1 rounded - paddw mm1, mm3 ; output 10 11 12 13 - - movq mm3, mm4 ; c - pmulhw mm5, mm7 ; d * c3 - - pmulhw mm4, mm6 ; c * c1 - paddw mm3, mm4 ; round c* c1 - - psubw mm5, mm3 ; output 30 31 32 33 - movq mm3, mm5 - - - ; done with vertical - ; transpose for the second stage - movq mm4, mm0 ; 00 01 02 03 - movq mm5, mm2 ; 10 11 12 03 - - punpcklwd mm0, mm1 ; 00 10 01 11 - punpckhwd mm4, mm1 ; 02 12 03 13 - - punpcklwd mm2, mm3 ; 20 30 21 31 - punpckhwd mm5, mm3 ; 22 32 23 33 - - - movq mm1, mm0 ; 00 10 01 11 - punpckldq mm0, mm2 ; 00 10 20 30 - - punpckhdq mm1, mm2 ; 01 11 21 31 - - movq mm2, mm4 ; 02 12 03 13 - punpckldq mm2, mm5 ; 02 12 22 32 - - punpckhdq mm4, mm5 ; 03 13 23 33 
- movq mm3, mm4 - - - ; first stage - movq mm5, mm0 - movq mm4, mm1 - paddw mm0, mm3 ; a = 0 + 3 - paddw mm1, mm2 ; b = 1 + 2 + mov rsi, arg(0) ; input + mov rdi, arg(1) ; output - psubw mm4, mm2 ; c = 1 - 2 - psubw mm5, mm3 ; d = 0 - 3 + movsxd rax, dword ptr arg(2) ;pitch - - ; output 0 and 2 - movq mm6, [rdx + 16] ; c2 - movq mm2, mm0 ; a - paddw mm0, mm1 ; a + b - - psubw mm2, mm1 ; a - b - - movq mm1, mm0 ; a + b - pmulhw mm0, mm6 ; 00 01 02 03 - - paddw mm0, mm1 ; output 00 01 02 03 - pmulhw mm6, mm2 ; 20 21 22 23 - - paddw mm2, mm6 ; output 20 21 22 23 - - - ; output 1 and 3 - movq mm6, [rdx + 8] ; c1 - movq mm7, [rdx + 24] ; c3 - - movq mm1, mm4 ; c - movq mm3, mm5 ; d - - pmulhw mm1, mm7 ; c * c3 - pmulhw mm3, mm6 ; d * c1 - - paddw mm3, mm5 ; d * c1 rounded - paddw mm1, mm3 ; output 10 11 12 13 - - movq mm3, mm4 ; c - pmulhw mm5, mm7 ; d * c3 - - pmulhw mm4, mm6 ; c * c1 - paddw mm3, mm4 ; round c* c1 - - psubw mm5, mm3 ; output 30 31 32 33 - movq mm3, mm5 - ; done with vertical - - pcmpeqw mm4, mm4 - pcmpeqw mm5, mm5 - psrlw mm4, 15 - psrlw mm5, 15 - - psllw mm4, 2 - psllw mm5, 2 - - paddw mm0, mm4 - paddw mm1, mm5 - paddw mm2, mm4 - paddw mm3, mm5 - - psraw mm0, 3 - psraw mm1, 3 - psraw mm2, 3 - psraw mm3, 3 - - movq [rdi ], mm0 - movq [rdi+ 8], mm1 - movq [rdi+16], mm2 - movq [rdi+24], mm3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch) -sym(vp8_short_fdct8x4_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - mov rsi, arg(0) ;input - mov rdi, arg(1) ;output - - lea rdx, [GLOBAL(dct_const_xmm)] - movsxd rax, dword ptr arg(2) ;pitch - - lea rcx, [rsi + rax*2] + lea rcx, [rsi + rax*2] ; read the input data - movdqa xmm0, [rsi] - movdqa xmm2, [rsi + rax] - - movdqa xmm4, [rcx] - movdqa xmm3, [rcx + rax] - ; get the constants - ;shift to left by 1 for prescision - psllw xmm0, 3 - psllw xmm2, 
3 - - psllw xmm4, 3 - psllw xmm3, 3 - - ; transpose for the second stage - movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07 - movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27 - - punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13 - punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17 + movq mm0, [rsi] + movq mm1, [rsi + rax] - punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33 - punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37 + movq mm2, [rcx] + movq mm4, [rcx + rax] - movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13 - punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31 + ; transpose for the first stage + movq mm3, mm0 ; 00 01 02 03 + movq mm5, mm2 ; 20 21 22 23 - punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33 + punpcklwd mm0, mm1 ; 00 10 01 11 + punpckhwd mm3, mm1 ; 02 12 03 13 + punpcklwd mm2, mm4 ; 20 30 21 31 + punpckhwd mm5, mm4 ; 22 32 23 33 - movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17 - punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35 + movq mm1, mm0 ; 00 10 01 11 + punpckldq mm0, mm2 ; 00 10 20 30 - punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37 - movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33 + punpckhdq mm1, mm2 ; 01 11 21 31 - punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37 - punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36 + movq mm2, mm3 ; 02 12 03 13 + punpckldq mm2, mm5 ; 02 12 22 32 - movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31 - punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34 + punpckhdq mm3, mm5 ; 03 13 23 33 - punpckhqdq xmm1, xmm4 ; 01 11 21 32 05 15 25 35 - - ; xmm0 0 - ; xmm1 1 - ; xmm2 2 - ; xmm3 3 + ; mm0 0 + ; mm1 1 + ; mm2 2 + ; mm3 3 ; first stage - movdqa xmm5, xmm0 - movdqa xmm4, xmm1 - - paddw xmm0, xmm3 ; a = 0 + 3 - paddw xmm1, xmm2 ; b = 1 + 2 - - psubw xmm4, xmm2 ; c = 1 - 2 - psubw xmm5, xmm3 ; d = 0 - 3 + movq mm5, mm0 + movq mm4, mm1 + paddw mm0, mm3 ; a1 = 0 + 3 + paddw mm1, mm2 ; b1 = 1 + 2 - ; output 0 and 2 - movdqa xmm6, [rdx + 32] ; c2 - movdqa xmm2, xmm0 ; a + psubw mm4, mm2 ; c1 = 1 - 2 + psubw mm5, mm3 ; d1 = 0 - 3 - paddw xmm0, xmm1 ; a + b - psubw xmm2, 
xmm1 ; a - b + psllw mm5, 3 + psllw mm4, 3 - movdqa xmm1, xmm0 ; a + b - pmulhw xmm0, xmm6 ; 00 01 02 03 + psllw mm0, 3 + psllw mm1, 3 - paddw xmm0, xmm1 ; output 00 01 02 03 - pmulhw xmm6, xmm2 ; 20 21 22 23 + ; output 0 and 2 + movq mm2, mm0 ; a1 - paddw xmm2, xmm6 ; output 20 21 22 23 + paddw mm0, mm1 ; op[0] = a1 + b1 + psubw mm2, mm1 ; op[2] = a1 - b1 ; output 1 and 3 - movdqa xmm6, [rdx + 16] ; c1 - movdqa xmm7, [rdx + 48] ; c3 - - movdqa xmm1, xmm4 ; c - movdqa xmm3, xmm5 ; d + ; interleave c1, d1 + movq mm1, mm5 ; d1 + punpcklwd mm1, mm4 ; c1 d1 + punpckhwd mm5, mm4 ; c1 d1 - pmulhw xmm1, xmm7 ; c * c3 - pmulhw xmm3, xmm6 ; d * c1 + movq mm3, mm1 + movq mm4, mm5 - paddw xmm3, xmm5 ; d * c1 rounded - paddw xmm1, xmm3 ; output 10 11 12 13 + pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - movdqa xmm3, xmm4 ; c - pmulhw xmm5, xmm7 ; d * c3 + pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - pmulhw xmm4, xmm6 ; c * c1 - paddw xmm3, xmm4 ; round c* c1 + paddd mm1, MMWORD PTR[GLOBAL(_14500)] + paddd mm4, MMWORD PTR[GLOBAL(_14500)] + paddd mm3, MMWORD PTR[GLOBAL(_7500)] + paddd mm5, MMWORD PTR[GLOBAL(_7500)] - psubw xmm5, xmm3 ; output 30 31 32 33 - movdqa xmm3, xmm5 + psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 + psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 + packssdw mm1, mm4 ; op[1] + packssdw mm3, mm5 ; op[3] ; done with vertical ; transpose for the second stage - movdqa xmm4, xmm2 ; 02 12 22 32 06 16 26 36 - movdqa xmm2, xmm1 ; 01 11 21 31 05 15 25 35 + movq mm4, mm0 ; 00 10 20 30 + movq mm5, mm2 ; 02 12 22 32 - movdqa xmm1, xmm0 ; 00 10 20 30 04 14 24 34 - movdqa xmm5, xmm4 ; 02 12 22 32 06 16 26 36 + punpcklwd mm0, mm1 ; 00 01 10 11 + punpckhwd mm4, mm1 ; 20 21 30 31 - 
punpcklwd xmm0, xmm2 ; 00 01 10 11 20 21 30 31 - punpckhwd xmm1, xmm2 ; 04 05 14 15 24 25 34 35 + punpcklwd mm2, mm3 ; 02 03 12 13 + punpckhwd mm5, mm3 ; 22 23 32 33 - punpcklwd xmm4, xmm3 ; 02 03 12 13 22 23 32 33 - punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37 + movq mm1, mm0 ; 00 01 10 11 + punpckldq mm0, mm2 ; 00 01 02 03 - movdqa xmm2, xmm0 ; 00 01 10 11 20 21 30 31 - punpckldq xmm0, xmm4 ; 00 01 02 03 10 11 12 13 + punpckhdq mm1, mm2 ; 01 22 12 13 - punpckhdq xmm2, xmm4 ; 20 21 22 23 30 31 32 33 + movq mm2, mm4 ; 20 31 30 31 + punpckldq mm2, mm5 ; 20 21 22 23 + punpckhdq mm4, mm5 ; 30 31 32 33 - movdqa xmm4, xmm1 ; 04 05 14 15 24 25 34 35 - punpckldq xmm4, xmm5 ; 04 05 06 07 14 15 16 17 + ; mm0 0 + ; mm1 1 + ; mm2 2 + ; mm3 4 - punpckhdq xmm1, xmm5 ; 24 25 26 27 34 35 36 37 - movdqa xmm3, xmm2 ; 20 21 22 23 30 31 32 33 + movq mm5, mm0 + movq mm3, mm1 - punpckhqdq xmm3, xmm1 ; 30 31 32 33 34 35 36 37 - punpcklqdq xmm2, xmm1 ; 20 21 22 23 24 25 26 27 + paddw mm0, mm4 ; a1 = 0 + 3 + paddw mm1, mm2 ; b1 = 1 + 2 - movdqa xmm1, xmm0 ; 00 01 02 03 10 11 12 13 - punpcklqdq xmm0, xmm4 ; 00 01 02 03 04 05 06 07 + psubw mm3, mm2 ; c1 = 1 - 2 + psubw mm5, mm4 ; d1 = 0 - 3 - punpckhqdq xmm1, xmm4 ; 10 11 12 13 14 15 16 17 + pxor mm6, mm6 ; zero out for compare - ; first stage - movdqa xmm5, xmm0 - movdqa xmm4, xmm1 - - paddw xmm0, xmm3 ; a = 0 + 3 - paddw xmm1, xmm2 ; b = 1 + 2 - - psubw xmm4, xmm2 ; c = 1 - 2 - psubw xmm5, xmm3 ; d = 0 - 3 + pcmpeqw mm6, mm5 ; d1 != 0 + pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper, + ; and keep bit 0 of lower ; output 0 and 2 - movdqa xmm6, [rdx + 32] ; c2 - movdqa xmm2, xmm0 ; a + movq mm2, mm0 ; a1 - paddw xmm0, xmm1 ; a + b - psubw xmm2, xmm1 ; a - b + paddw mm0, mm1 ; a1 + b1 + psubw mm2, mm1 ; a1 - b1 - movdqa xmm1, xmm0 ; a + b - pmulhw xmm0, xmm6 ; 00 01 02 03 + paddw mm0, MMWORD PTR[GLOBAL(_7w)] + paddw mm2, MMWORD PTR[GLOBAL(_7w)] - paddw xmm0, xmm1 ; output 00 01 02 03 - pmulhw xmm6, xmm2 ; 20 21 22 23 + psraw mm0, 4 
; op[0] = (a1 + b1 + 7)>>4 + psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4 - paddw xmm2, xmm6 ; output 20 21 22 23 + movq MMWORD PTR[rdi + 0 ], mm0 + movq MMWORD PTR[rdi + 16], mm2 ; output 1 and 3 - movdqa xmm6, [rdx + 16] ; c1 - movdqa xmm7, [rdx + 48] ; c3 + ; interleave c1, d1 + movq mm1, mm5 ; d1 + punpcklwd mm1, mm3 ; c1 d1 + punpckhwd mm5, mm3 ; c1 d1 - movdqa xmm1, xmm4 ; c - movdqa xmm3, xmm5 ; d + movq mm3, mm1 + movq mm4, mm5 - pmulhw xmm1, xmm7 ; c * c3 - pmulhw xmm3, xmm6 ; d * c1 + pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - paddw xmm3, xmm5 ; d * c1 rounded - paddw xmm1, xmm3 ; output 10 11 12 13 + pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - movdqa xmm3, xmm4 ; c - pmulhw xmm5, xmm7 ; d * c3 + paddd mm1, MMWORD PTR[GLOBAL(_12000)] + paddd mm4, MMWORD PTR[GLOBAL(_12000)] + paddd mm3, MMWORD PTR[GLOBAL(_51000)] + paddd mm5, MMWORD PTR[GLOBAL(_51000)] - pmulhw xmm4, xmm6 ; c * c1 - paddw xmm3, xmm4 ; round c* c1 + psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 + psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 + psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 + psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 - psubw xmm5, xmm3 ; output 30 31 32 33 - movdqa xmm3, xmm5 - ; done with vertical + packssdw mm1, mm4 ; op[4] + packssdw mm3, mm5 ; op[12] + + paddw mm1, mm6 ; op[4] += (d1!=0) + movq MMWORD PTR[rdi + 8 ], mm1 + movq MMWORD PTR[rdi + 24], mm3 - pcmpeqw xmm4, xmm4 - pcmpeqw xmm5, xmm5; - psrlw xmm4, 15 - psrlw xmm5, 15 - - psllw xmm4, 2 - psllw xmm5, 2 - - paddw xmm0, xmm4 - paddw xmm1, xmm5 - paddw xmm2, xmm4 - paddw xmm3, xmm5 - - psraw xmm0, 3 - psraw xmm1, 3 - psraw xmm2, 3 - psraw xmm3, 3 - - movq QWORD PTR[rdi ], xmm0 - movq QWORD PTR[rdi+ 8], xmm1 - movq QWORD PTR[rdi+16], xmm2 - movq QWORD PTR[rdi+24], xmm3 - - psrldq xmm0, 8 - psrldq xmm1, 8 - psrldq 
xmm2, 8 - psrldq xmm3, 8 - - movq QWORD PTR[rdi+32], xmm0 - movq QWORD PTR[rdi+40], xmm1 - movq QWORD PTR[rdi+48], xmm2 - movq QWORD PTR[rdi+56], xmm3 - ; begin epilog - pop rdi - pop rsi + ; begin epilog + pop rdi + pop rsi RESTORE_GOT UNSHADOW_ARGS pop rbp ret - SECTION_RODATA -;static const unsigned int dct1st_stage_rounding_mmx[2] = -align 16 -dct1st_stage_rounding_mmx: - times 2 dd 8192 - - -;static const unsigned int dct2nd_stage_rounding_mmx[2] = -align 16 -dct2nd_stage_rounding_mmx: - times 2 dd 32768 - - -;static const short dct_matrix[4][4]= -align 16 -dct_matrix: - times 4 dw 23170 - - dw 30274 - dw 12540 - dw -12540 - dw -30274 - - dw 23170 - times 2 dw -23170 - dw 23170 - - dw 12540 - dw -30274 - dw 30274 - dw -12540 - - -;static const unsigned short dct_const_mmx[4 * 4]= -align 16 -dct_const_mmx: - times 4 dw 0 - times 4 dw 60547 - times 4 dw 46341 - times 4 dw 25080 - - -;static const unsigned short dct_const_xmm[8 * 4]= -align 16 -dct_const_xmm: - times 8 dw 0 - times 8 dw 60547 - times 8 dw 46341 - times 8 dw 25080 +align 8 +_5352_2217: + dw 5352 + dw 2217 + dw 5352 + dw 2217 +align 8 +_2217_neg5352: + dw 2217 + dw -5352 + dw 2217 + dw -5352 +align 8 +_cmp_mask: + times 4 dw 1 +align 8 +_7w: + times 4 dw 7 +align 8 +_14500: + times 2 dd 14500 +align 8 +_7500: + times 2 dd 7500 +align 8 +_12000: + times 2 dd 12000 +align 8 +_51000: + times 2 dd 51000 diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm index 723a78d76..652dd9804 100644 --- a/vp8/encoder/x86/dct_sse2.asm +++ b/vp8/encoder/x86/dct_sse2.asm @@ -11,32 +11,68 @@ %include "vpx_ports/x86_abi_support.asm" -;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch) -global sym(vp8_short_fdct4x4_sse2) -sym(vp8_short_fdct4x4_sse2): +%macro STACK_FRAME_CREATE 0 +%if ABI_IS_32BIT + %define input rsi + %define output rdi + %define pitch rax push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 -;; SAVE_XMM GET_GOT rbx push rsi push rdi ; end prolog mov rsi, arg(0) - movsxd 
rax, DWORD PTR arg(2) - lea rdi, [rsi + rax*2] + mov rdi, arg(1) + + movsxd rax, dword ptr arg(2) + lea rcx, [rsi + rax*2] +%else + %ifidn __OUTPUT_FORMAT__,x64 + %define input rcx + %define output rdx + %define pitch r8 + %else + %define input rdi + %define output rsi + %define pitch rdx + %endif +%endif +%endmacro + +%macro STACK_FRAME_DESTROY 0 + %define input + %define output + %define pitch + +%if ABI_IS_32BIT + pop rdi + pop rsi + RESTORE_GOT + pop rbp +%else + %ifidn __OUTPUT_FORMAT__,x64 + %endif +%endif + ret +%endmacro + +;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch) +global sym(vp8_short_fdct4x4_sse2) +sym(vp8_short_fdct4x4_sse2): - movq xmm0, MMWORD PTR[rsi ] ;03 02 01 00 - movq xmm2, MMWORD PTR[rsi + rax] ;13 12 11 10 - movq xmm1, MMWORD PTR[rsi + rax*2] ;23 22 21 20 - movq xmm3, MMWORD PTR[rdi + rax] ;33 32 31 30 + STACK_FRAME_CREATE + + movq xmm0, MMWORD PTR[input ] ;03 02 01 00 + movq xmm2, MMWORD PTR[input+ pitch] ;13 12 11 10 + lea input, [input+2*pitch] + movq xmm1, MMWORD PTR[input ] ;23 22 21 20 + movq xmm3, MMWORD PTR[input+ pitch] ;33 32 31 30 punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00 punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20 - mov rdi, arg(1) - movdqa xmm2, xmm0 punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00 punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10 @@ -51,6 +87,7 @@ sym(vp8_short_fdct4x4_sse2): psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1 psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3 psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3 + movdqa xmm1, xmm0 pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1 pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1 @@ -121,17 +158,216 @@ sym(vp8_short_fdct4x4_sse2): punpcklqdq xmm0, xmm3 ;op[4] op[0] punpckhqdq xmm1, xmm3 ;op[12] op[8] - movdqa XMMWORD PTR[rdi + 0], xmm0 - movdqa XMMWORD PTR[rdi + 16], xmm1 + movdqa XMMWORD PTR[output + 0], xmm0 + movdqa XMMWORD PTR[output + 16], xmm1 - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT -;; RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret + 
STACK_FRAME_DESTROY + +;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch) +global sym(vp8_short_fdct8x4_sse2) +sym(vp8_short_fdct8x4_sse2): + + STACK_FRAME_CREATE + + ; read the input data + movdqa xmm0, [input ] + movdqa xmm2, [input+ pitch] + lea input, [input+2*pitch] + movdqa xmm4, [input ] + movdqa xmm3, [input+ pitch] + + ; transpose for the first stage + movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07 + movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27 + + punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13 + punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17 + + punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33 + punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37 + + movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13 + punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31 + + punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33 + + movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17 + punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35 + + punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37 + movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33 + + punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37 + punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36 + + movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31 + punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34 + + punpckhqdq xmm1, xmm4 ; 01 11 21 32 05 15 25 35 + + ; xmm0 0 + ; xmm1 1 + ; xmm2 2 + ; xmm3 3 + + ; first stage + movdqa xmm5, xmm0 + movdqa xmm4, xmm1 + + paddw xmm0, xmm3 ; a1 = 0 + 3 + paddw xmm1, xmm2 ; b1 = 1 + 2 + + psubw xmm4, xmm2 ; c1 = 1 - 2 + psubw xmm5, xmm3 ; d1 = 0 - 3 + + psllw xmm5, 3 + psllw xmm4, 3 + + psllw xmm0, 3 + psllw xmm1, 3 + + ; output 0 and 2 + movdqa xmm2, xmm0 ; a1 + + paddw xmm0, xmm1 ; op[0] = a1 + b1 + psubw xmm2, xmm1 ; op[2] = a1 - b1 + + ; output 1 and 3 + ; interleave c1, d1 + movdqa xmm1, xmm5 ; d1 + punpcklwd xmm1, xmm4 ; c1 d1 + punpckhwd xmm5, xmm4 ; c1 d1 + + movdqa xmm3, xmm1 + movdqa xmm4, xmm5 + + pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + 
d1*5352 + + pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + + paddd xmm1, XMMWORD PTR[GLOBAL(_14500)] + paddd xmm4, XMMWORD PTR[GLOBAL(_14500)] + paddd xmm3, XMMWORD PTR[GLOBAL(_7500)] + paddd xmm5, XMMWORD PTR[GLOBAL(_7500)] + + psrad xmm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad xmm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad xmm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 + psrad xmm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 + + packssdw xmm1, xmm4 ; op[1] + packssdw xmm3, xmm5 ; op[3] + + ; done with vertical + ; transpose for the second stage + movdqa xmm4, xmm0 ; 00 10 20 30 04 14 24 34 + movdqa xmm5, xmm2 ; 02 12 22 32 06 16 26 36 + + punpcklwd xmm0, xmm1 ; 00 01 10 11 20 21 30 31 + punpckhwd xmm4, xmm1 ; 04 05 14 15 24 25 34 35 + + punpcklwd xmm2, xmm3 ; 02 03 12 13 22 23 32 33 + punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37 + + movdqa xmm1, xmm0 ; 00 01 10 11 20 21 30 31 + punpckldq xmm0, xmm2 ; 00 01 02 03 10 11 12 13 + + punpckhdq xmm1, xmm2 ; 20 21 22 23 30 31 32 33 + + movdqa xmm2, xmm4 ; 04 05 14 15 24 25 34 35 + punpckldq xmm2, xmm5 ; 04 05 06 07 14 15 16 17 + + punpckhdq xmm4, xmm5 ; 24 25 26 27 34 35 36 37 + movdqa xmm3, xmm1 ; 20 21 22 23 30 31 32 33 + + punpckhqdq xmm3, xmm4 ; 30 31 32 33 34 35 36 37 + punpcklqdq xmm1, xmm4 ; 20 21 22 23 24 25 26 27 + + movdqa xmm4, xmm0 ; 00 01 02 03 10 11 12 13 + punpcklqdq xmm0, xmm2 ; 00 01 02 03 04 05 06 07 + + punpckhqdq xmm4, xmm2 ; 10 11 12 13 14 15 16 17 + + ; xmm0 0 + ; xmm1 4 + ; xmm2 1 + ; xmm3 3 + + movdqa xmm5, xmm0 + movdqa xmm2, xmm1 + + paddw xmm0, xmm3 ; a1 = 0 + 3 + paddw xmm1, xmm4 ; b1 = 1 + 2 + + psubw xmm4, xmm2 ; c1 = 1 - 2 + psubw xmm5, xmm3 ; d1 = 0 - 3 + + pxor xmm6, xmm6 ; zero out for compare + + pcmpeqw xmm6, xmm5 ; d1 != 0 + + pandn xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)] ; clear upper, + ; and keep bit 0 of lower + + ; output 0 and 2 + movdqa xmm2, xmm0 ; a1 + + paddw xmm0, 
xmm1 ; a1 + b1 + psubw xmm2, xmm1 ; a1 - b1 + + paddw xmm0, XMMWORD PTR[GLOBAL(_7w)] + paddw xmm2, XMMWORD PTR[GLOBAL(_7w)] + + psraw xmm0, 4 ; op[0] = (a1 + b1 + 7)>>4 + psraw xmm2, 4 ; op[8] = (a1 - b1 + 7)>>4 + + ; output 1 and 3 + ; interleave c1, d1 + movdqa xmm1, xmm5 ; d1 + punpcklwd xmm1, xmm4 ; c1 d1 + punpckhwd xmm5, xmm4 ; c1 d1 + + movdqa xmm3, xmm1 + movdqa xmm4, xmm5 + + pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + + pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + + paddd xmm1, XMMWORD PTR[GLOBAL(_12000)] + paddd xmm4, XMMWORD PTR[GLOBAL(_12000)] + paddd xmm3, XMMWORD PTR[GLOBAL(_51000)] + paddd xmm5, XMMWORD PTR[GLOBAL(_51000)] + + psrad xmm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 + psrad xmm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 + psrad xmm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 + psrad xmm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 + + packssdw xmm1, xmm4 ; op[4] + packssdw xmm3, xmm5 ; op[12] + + paddw xmm1, xmm6 ; op[4] += (d1!=0) + + movdqa xmm4, xmm0 + movdqa xmm5, xmm2 + + punpcklqdq xmm0, xmm1 + punpckhqdq xmm4, xmm1 + + punpcklqdq xmm2, xmm3 + punpckhqdq xmm5, xmm3 + + movdqa XMMWORD PTR[output + 0 ], xmm0 + movdqa XMMWORD PTR[output + 16], xmm2 + movdqa XMMWORD PTR[output + 32], xmm4 + movdqa XMMWORD PTR[output + 48], xmm5 + + STACK_FRAME_DESTROY SECTION_RODATA align 16 @@ -161,7 +397,9 @@ align 16 _cmp_mask: times 4 dw 1 times 4 dw 0 - +align 16 +_cmp_mask8x4: + times 8 dw 1 align 16 _mult_sub: dw 1 @@ -176,6 +414,9 @@ align 16 _7: times 4 dd 7 align 16 +_7w: + times 8 dw 7 +align 16 _14500: times 4 dd 14500 align 16 diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h index 05824c684..59a5cb1d7 100644 --- a/vp8/encoder/x86/dct_x86.h +++ b/vp8/encoder/x86/dct_x86.h @@ -24,33 +24,31 @@ extern prototype_fdct(vp8_short_fdct4x4_mmx); 
extern prototype_fdct(vp8_short_fdct8x4_mmx); #if !CONFIG_RUNTIME_CPU_DETECT -#if 0 + #undef vp8_fdct_short4x4 #define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx #undef vp8_fdct_short8x4 #define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx -#endif #endif + #endif #if HAVE_SSE2 -extern prototype_fdct(vp8_short_fdct8x4_wmt); +extern prototype_fdct(vp8_short_fdct8x4_sse2); extern prototype_fdct(vp8_short_walsh4x4_sse2); extern prototype_fdct(vp8_short_fdct4x4_sse2); #if !CONFIG_RUNTIME_CPU_DETECT -#if 1 -/* short SSE2 DCT currently disabled, does not match the MMX version */ + #undef vp8_fdct_short4x4 #define vp8_fdct_short4x4 vp8_short_fdct4x4_sse2 #undef vp8_fdct_short8x4 #define vp8_fdct_short8x4 vp8_short_fdct8x4_sse2 -#endif #undef vp8_fdct_fast4x4 #define vp8_fdct_fast4x4 vp8_short_fdct4x4_sse2 @@ -58,7 +56,7 @@ extern prototype_fdct(vp8_short_fdct4x4_sse2); #undef vp8_fdct_fast8x4 #define vp8_fdct_fast8x4 vp8_short_fdct8x4_sse2 -#undef vp8_fdct_walsh_short4x4 +#undef vp8_fdct_walsh_short4x4 #define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_sse2 #endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index fb1b37ccb..781079849 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -18,11 +18,10 @@ #if HAVE_MMX void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch) { - vp8_short_fdct4x4_c(input, output, pitch); - vp8_short_fdct4x4_c(input + 4, output + 16, pitch); + vp8_short_fdct4x4_mmx(input, output, pitch); + vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch); } - int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, short *qcoeff_ptr, short *dequant_ptr, short *scan_mask, short *round_ptr, @@ -82,12 +81,6 @@ void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) #endif #if HAVE_SSE2 -void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch) -{ - vp8_short_fdct4x4_sse2(input, output, pitch); - vp8_short_fdct4x4_sse2(input + 4, output + 16, 
pitch); -} - int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr, short *qcoeff_ptr, short *dequant_ptr, short *scan_mask, short *round_ptr, @@ -249,18 +242,11 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx; cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx; cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx; -#if 0 // new fdct + cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx; cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx; cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_mmx; cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_mmx; -#else - cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c; - cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c; - cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c; - cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c; - -#endif cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c; diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 8e50b7f1b..6a2872031 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -860,8 +860,16 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx) { YV12_BUFFER_CONFIG sd; + vp8_ppflags_t flags = {0}; - if (0 == vp8_get_preview_raw_frame(ctx->cpi, &sd, ctx->preview_ppcfg.deblocking_level, ctx->preview_ppcfg.noise_level, ctx->preview_ppcfg.post_proc_flag)) + if (ctx->preview_ppcfg.post_proc_flag) + { + flags.post_proc_flag = ctx->preview_ppcfg.post_proc_flag; + flags.deblocking_level = ctx->preview_ppcfg.deblocking_level; + flags.noise_level = ctx->preview_ppcfg.noise_level; + } + + if (0 == vp8_get_preview_raw_frame(ctx->cpi, &sd, &flags)) { /* diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 9964124d1..9dd492217 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -65,12 +65,19 @@ struct vpx_codec_alg_priv vpx_codec_priv_t base; vpx_codec_mmap_t mmaps[NELEMENTS(vp8_mem_req_segs)-1]; vpx_codec_dec_cfg_t cfg; - vp8_stream_info_t si; + vp8_stream_info_t si; int defer_alloc; int decoder_init; VP8D_PTR pbi; int postproc_cfg_set; vp8_postproc_cfg_t 
postproc_cfg; +#if CONFIG_POSTPROC_VISUALIZER + unsigned int dbg_postproc_flag; + int dbg_color_ref_frame_flag; + int dbg_color_mb_modes_flag; + int dbg_color_b_modes_flag; + int dbg_display_mv_flag; +#endif vpx_image_t img; int img_setup; int img_avail; @@ -416,15 +423,27 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, { YV12_BUFFER_CONFIG sd; INT64 time_stamp = 0, time_end_stamp = 0; - int ppflag = 0; - int ppdeblocking = 0; - int ppnoise = 0; + vp8_ppflags_t flags = {0}; if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) { - ppflag = ctx->postproc_cfg.post_proc_flag; - ppdeblocking = ctx->postproc_cfg.deblocking_level; - ppnoise = ctx->postproc_cfg.noise_level; + flags.post_proc_flag= ctx->postproc_cfg.post_proc_flag +#if CONFIG_POSTPROC_VISUALIZER + + | ((ctx->dbg_color_ref_frame_flag != 0) ? VP8D_DEBUG_CLR_FRM_REF_BLKS : 0) + | ((ctx->dbg_color_mb_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0) + | ((ctx->dbg_color_b_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0) + | ((ctx->dbg_display_mv_flag != 0) ? 
VP8D_DEBUG_DRAW_MV : 0) +#endif + ; + flags.deblocking_level = ctx->postproc_cfg.deblocking_level; + flags.noise_level = ctx->postproc_cfg.noise_level; +#if CONFIG_POSTPROC_VISUALIZER + flags.display_ref_frame_flag= ctx->dbg_color_ref_frame_flag; + flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag; + flags.display_b_modes_flag = ctx->dbg_color_b_modes_flag; + flags.display_mv_flag = ctx->dbg_display_mv_flag; +#endif } if (vp8dx_receive_compressed_data(ctx->pbi, data_sz, data, deadline)) @@ -433,7 +452,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, res = update_error_state(ctx, &pbi->common.error); } - if (!res && 0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, ppdeblocking, ppnoise, ppflag)) + if (!res && 0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, &flags)) { /* Align width/height */ unsigned int a_w = (sd.y_width + 15) & ~15; @@ -646,12 +665,38 @@ static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx, #endif } +static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx, + int ctrl_id, + va_list args) +{ +#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC + int data = va_arg(args, int); + +#define MAP(id, var) case id: var = data; break; + + switch (ctrl_id) + { + MAP (VP8_SET_DBG_COLOR_REF_FRAME, ctx->dbg_color_ref_frame_flag); + MAP (VP8_SET_DBG_COLOR_MB_MODES, ctx->dbg_color_mb_modes_flag); + MAP (VP8_SET_DBG_COLOR_B_MODES, ctx->dbg_color_b_modes_flag); + MAP (VP8_SET_DBG_DISPLAY_MV, ctx->dbg_display_mv_flag); + } + + return VPX_CODEC_OK; +#else + return VPX_CODEC_INCAPABLE; +#endif +} vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] = { - {VP8_SET_REFERENCE, vp8_set_reference}, - {VP8_COPY_REFERENCE, vp8_get_reference}, - {VP8_SET_POSTPROC, vp8_set_postproc}, + {VP8_SET_REFERENCE, vp8_set_reference}, + {VP8_COPY_REFERENCE, vp8_get_reference}, + {VP8_SET_POSTPROC, vp8_set_postproc}, + {VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_options}, + {VP8_SET_DBG_COLOR_MB_MODES, 
vp8_set_dbg_options}, + {VP8_SET_DBG_COLOR_B_MODES, vp8_set_dbg_options}, + {VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_options}, { -1, NULL}, }; @@ -38,9 +38,13 @@ */ enum vp8_dec_control_id { - VP8_SET_REFERENCE = 1, /**< pass in an external frame into decoder to be used as reference frame */ - VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */ - VP8_SET_POSTPROC = 3, /**< set decoder's the post processing settings */ + VP8_SET_REFERENCE = 1, /**< pass in an external frame into decoder to be used as reference frame */ + VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */ + VP8_SET_POSTPROC = 3, /**< set the decoder's post processing settings */ + VP8_SET_DBG_COLOR_REF_FRAME = 4, /**< set the reference frames to color for each macroblock */ + VP8_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */ + VP8_SET_DBG_COLOR_B_MODES = 6, /**< set which blocks modes to color */ + VP8_SET_DBG_DISPLAY_MV = 7, /**< set which motion vector modes to draw */ VP8_COMMON_CTRL_ID_MAX }; @@ -50,10 +54,14 @@ enum vp8_dec_control_id */ enum vp8_postproc_level { - VP8_NOFILTERING = 0, - VP8_DEBLOCK = 1, - VP8_DEMACROBLOCK = 2, - VP8_ADDNOISE = 4 + VP8_NOFILTERING = 0, + VP8_DEBLOCK = 1<<0, + VP8_DEMACROBLOCK = 1<<1, + VP8_ADDNOISE = 1<<2, + VP8_DEBUG_TXT_FRAME_INFO = 1<<3, /**< print frame information */ + VP8_DEBUG_TXT_MBLK_MODES = 1<<4, /**< print macro block modes over each macro block */ + VP8_DEBUG_TXT_DC_DIFF = 1<<5, /**< print dc diff for each macro block */ + VP8_DEBUG_TXT_RATE_INFO = 1<<6, /**< print video rate info (encoder only) */ }; /*!\brief post process flags @@ -65,9 +73,9 @@ enum vp8_postproc_level typedef struct vp8_postproc_cfg { - int post_proc_flag; /**< the types of post processing to be done, should be combination of "vp8_postproc_level" */ - int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */ - int noise_level; /**< the strength of additive noise, valid range [0, 16] 
*/ + int post_proc_flag; /**< the types of post processing to be done, should be combination of "vp8_postproc_level" */ + int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */ + int noise_level; /**< the strength of additive noise, valid range [0, 16] */ } vp8_postproc_cfg_t; /*!\brief reference frame type @@ -95,12 +103,16 @@ typedef struct vpx_ref_frame /*!\brief vp8 decoder control funciton parameter type * - * defines the data type for each of VP8 decoder control funciton requires + * defines the data type for each of VP8 decoder control function requires */ VPX_CTRL_USE_TYPE(VP8_SET_REFERENCE, vpx_ref_frame_t *) VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE, vpx_ref_frame_t *) VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC, vp8_postproc_cfg_t *) +VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_REF_FRAME, int) +VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES, int) +VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES, int) +VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV, int) /*! @} - end defgroup vp8 */ @@ -108,11 +108,19 @@ static const arg_def_t demacroblock_level = ARG_DEF(NULL, "demacroblock-level", "Enable VP8 demacroblocking, w/ level"); static const arg_def_t pp_debug_info = ARG_DEF(NULL, "pp-debug-info", 1, "Enable VP8 visible debug info"); - +static const arg_def_t pp_disp_ref_frame = ARG_DEF(NULL, "pp-dbg-ref-frame", 1, + "Display only selected reference frame per macro block"); +static const arg_def_t pp_disp_mb_modes = ARG_DEF(NULL, "pp-dbg-mb-modes", 1, + "Display only selected macro block modes"); +static const arg_def_t pp_disp_b_modes = ARG_DEF(NULL, "pp-dbg-b-modes", 1, + "Display only selected block modes"); +static const arg_def_t pp_disp_mvs = ARG_DEF(NULL, "pp-dbg-mvs", 1, + "Draw only selected motion vectors"); static const arg_def_t *vp8_pp_args[] = { &addnoise_level, &deblock, &demacroblock_level, &pp_debug_info, + &pp_disp_ref_frame, &pp_disp_mb_modes, &pp_disp_b_modes, &pp_disp_mvs, NULL }; #endif @@ -705,6 +713,10 @@ int main(int argc, const char **argv_) 
vpx_codec_dec_cfg_t cfg = {0}; #if CONFIG_VP8_DECODER vp8_postproc_cfg_t vp8_pp_cfg = {0}; + int vp8_dbg_color_ref_frame = 0; + int vp8_dbg_color_mb_modes = 0; + int vp8_dbg_color_b_modes = 0; + int vp8_dbg_display_mv = 0; #endif struct input_ctx input = {0}; @@ -790,6 +802,42 @@ int main(int argc, const char **argv_) if (level) vp8_pp_cfg.post_proc_flag |= level; } + else if (arg_match(&arg, &pp_disp_ref_frame, argi)) + { + unsigned int flags = arg_parse_int(&arg); + if (flags) + { + postproc = 1; + vp8_dbg_color_ref_frame = flags; + } + } + else if (arg_match(&arg, &pp_disp_mb_modes, argi)) + { + unsigned int flags = arg_parse_int(&arg); + if (flags) + { + postproc = 1; + vp8_dbg_color_mb_modes = flags; + } + } + else if (arg_match(&arg, &pp_disp_b_modes, argi)) + { + unsigned int flags = arg_parse_int(&arg); + if (flags) + { + postproc = 1; + vp8_dbg_color_b_modes = flags; + } + } + else if (arg_match(&arg, &pp_disp_mvs, argi)) + { + unsigned int flags = arg_parse_int(&arg); + if (flags) + { + postproc = 1; + vp8_dbg_display_mv = flags; + } + } #endif else @@ -929,6 +977,33 @@ int main(int argc, const char **argv_) return EXIT_FAILURE; } + if (vp8_dbg_color_ref_frame + && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_REF_FRAME, vp8_dbg_color_ref_frame)) + { + fprintf(stderr, "Failed to configure reference block visualizer: %s\n", vpx_codec_error(&decoder)); + return EXIT_FAILURE; + } + + if (vp8_dbg_color_mb_modes + && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_MB_MODES, vp8_dbg_color_mb_modes)) + { + fprintf(stderr, "Failed to configure macro block visualizer: %s\n", vpx_codec_error(&decoder)); + return EXIT_FAILURE; + } + + if (vp8_dbg_color_b_modes + && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_B_MODES, vp8_dbg_color_b_modes)) + { + fprintf(stderr, "Failed to configure block visualizer: %s\n", vpx_codec_error(&decoder)); + return EXIT_FAILURE; + } + + if (vp8_dbg_display_mv + && vpx_codec_control(&decoder, VP8_SET_DBG_DISPLAY_MV, vp8_dbg_display_mv)) + 
{ + fprintf(stderr, "Failed to configure motion vector visualizer: %s\n", vpx_codec_error(&decoder)); + return EXIT_FAILURE; + } #endif /* Decode file */ @@ -435,7 +435,7 @@ struct EbmlGlobal int debug; FILE *stream; - uint64_t last_pts_ms; + int64_t last_pts_ms; vpx_rational_t framerate; /* These pointers are to the start of an element */ @@ -648,7 +648,7 @@ write_webm_block(EbmlGlobal *glob, unsigned char track_number; unsigned short block_timecode = 0; unsigned char flags; - uint64_t pts_ms; + int64_t pts_ms; int start_cluster = 0, is_keyframe; /* Calculate the PTS of this frame in milliseconds */ @@ -1074,6 +1074,7 @@ int main(int argc, const char **argv_) int psnr_count = 0; exec_name = argv_[0]; + ebml.last_pts_ms = -1; if (argc < 3) usage_exit(); |