From 3c18a2bb2e5f6cde8189643345e33a1c27189ff8 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 26 Jan 2011 16:42:56 -0500 Subject: Performance improvement of first pass Improved the performance of the first pass only (~6% on 720p test clip) by making use of LUT instead of the float calculations. Might try a SIMD version later. Also started to make use of int_mv instead of MV. Change-Id: If2a217c7d6b59cd2c25c5553e0ca7e0502403af8 --- vp8/encoder/firstpass.c | 99 ++++++++++++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 38 deletions(-) diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 06e26be85..fc6f043c3 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - #include "math.h" #include "limits.h" #include "block.h" @@ -178,40 +177,68 @@ static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) return modified_err; } +static const double weight_table[256] = { +0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, +0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, +0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, +0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, +0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750, +0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750, +0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750, +0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 
1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, +1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000 +}; + double vp8_simple_weight(YV12_BUFFER_CONFIG *source) { int i, j; unsigned char *src = source->y_buffer; - unsigned char value; double sum_weights = 0.0; - double Weight; // Loop throught the Y plane raw examining levels and creating a weight for the image - for (i = 0; i < source->y_height; i++) + i = source->y_height; + do { - for (j = 0; j < 
source->y_width; j++) + j = source->y_width; + do { - value = src[j]; - - if (value >= 64) - Weight = 1.0; - else if (value > 32) - Weight = (value - 32.0f) / 32.0f; - else - Weight = 0.02; - - sum_weights += Weight; - } - + sum_weights += weight_table[ *src]; + src++; + }while(--j); + src -= source->y_width; src += source->y_stride; - } + }while(--i); sum_weights /= (source->y_height * source->y_width); return sum_weights; } + // This function returns the current per frame maximum bitrate target int frame_max_bits(VP8_COMP *cpi) { @@ -440,7 +467,6 @@ void vp8_end_first_pass(VP8_COMP *cpi) vp8_output_stats(cpi, cpi->output_pkt_list, cpi->total_stats); } - void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset ) { MACROBLOCKD * const xd = & x->e_mbd; @@ -460,7 +486,6 @@ void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * r VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16) ( src_ptr, src_stride, ref_ptr, ref_stride, (unsigned int *)(best_motion_err)); } - void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset ) { MACROBLOCKD *const xd = & x->e_mbd; @@ -548,7 +573,6 @@ void vp8_first_pass(VP8_COMP *cpi) int sum_in_vectors = 0; - MV best_ref_mv = {0, 0}; MV zero_ref_mv = {0, 0}; unsigned char *fp_motion_map_ptr = cpi->fp_motion_map; @@ -586,13 +610,20 @@ void vp8_first_pass(VP8_COMP *cpi) // for each macroblock row in image for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { - MV best_ref_mv = {0, 0}; + int_mv best_ref_mv; + + best_ref_mv.as_int = 0; // reset above block coeffs xd->up_available = (mb_row != 0); recon_yoffset = (mb_row * recon_y_stride * 16); recon_uvoffset = (mb_row * recon_uv_stride * 8); + // Set up limit values for motion vectors to prevent them extending outside the UMV borders + x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 
16)); + x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16); + + // for each macroblock col in image for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { @@ -625,8 +656,6 @@ void vp8_first_pass(VP8_COMP *cpi) // Set up limit values for motion vectors to prevent them extending outside the UMV borders x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16)); x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16); - x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16)); - x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16); // Other than for the first frame do a motion search if (cm->current_video_frame > 0) @@ -647,12 +676,12 @@ void vp8_first_pass(VP8_COMP *cpi) // Test last reference frame using the previous best mv as the // starting point (best reference) for the search - vp8_first_pass_motion_search(cpi, x, &best_ref_mv, + vp8_first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &d->bmi.mv.as_mv, lst_yv12, &motion_error, recon_yoffset); // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well - if ((best_ref_mv.col != 0) || (best_ref_mv.row != 0)) + if (best_ref_mv.as_int) { tmp_err = INT_MAX; vp8_first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, @@ -664,7 +693,6 @@ void vp8_first_pass(VP8_COMP *cpi) d->bmi.mv.as_mv.row = tmp_mv.row; d->bmi.mv.as_mv.col = tmp_mv.col; } - } // Experimental search in a second reference frame ((0,0) based only) @@ -693,6 +721,9 @@ void vp8_first_pass(VP8_COMP *cpi) xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset; } + /* Intra assumed best */ + best_ref_mv.as_int = 0; + if (motion_error <= this_error) { d->bmi.mv.as_mv.row <<= 3; @@ -708,13 +739,10 @@ void vp8_first_pass(VP8_COMP *cpi) sum_mvcs += d->bmi.mv.as_mv.col * d->bmi.mv.as_mv.col; intercount++; - best_ref_mv.row = d->bmi.mv.as_mv.row; - best_ref_mv.col = d->bmi.mv.as_mv.col; - //best_ref_mv.row = 0; - //best_ref_mv.col = 0; + best_ref_mv.as_int = 
d->bmi.mv.as_int; // Was the vector non-zero - if (d->bmi.mv.as_mv.row || d->bmi.mv.as_mv.col) + if (d->bmi.mv.as_int) { mvcount++; @@ -770,12 +798,6 @@ void vp8_first_pass(VP8_COMP *cpi) *fp_motion_map_ptr = 1; } } - else - { - // Intra was best - best_ref_mv.row = 0; - best_ref_mv.col = 0; - } } coded_error += this_error; @@ -813,6 +835,7 @@ void vp8_first_pass(VP8_COMP *cpi) fps.coded_error = coded_error >> 8; weight = vp8_simple_weight(cpi->Source); + if (weight < 0.1) weight = 0.1; -- cgit v1.2.3 From e9f513d74ae9cfc88f5423cb25bd65000bc32c0d Mon Sep 17 00:00:00 2001 From: Adrian Grange Date: Fri, 28 Jan 2011 14:47:36 +0000 Subject: Changed condition for using RD in Intra Mode The condition for using RD when selecting the intra coding mode for a MB is that the RD flag is set AND we're not in real-time mode. Previously the code used RD if either the RD flag was set OR we were not using real-time mode. Change-Id: Ic711151298468a3f99babad39ba8375f66d55a08 --- vp8/encoder/encodeframe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 1689b43d1..793191d24 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -1160,7 +1160,7 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) #if !(CONFIG_REALTIME_ONLY) - if (cpi->sf.RD || cpi->compressor_speed != 2) + if (cpi->sf.RD && cpi->compressor_speed != 2) { Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4); -- cgit v1.2.3 From 8f279596cbb7a6a3016fdc00624bc33ba36641bf Mon Sep 17 00:00:00 2001 From: Yaowu Xu Date: Wed, 19 Jan 2011 16:21:01 -0800 Subject: change the threshold of DC check for encode breakout Previously, the DC check made sure there was no code-able DC shift for quantizer Q0, a criterion that has been verified to be rather conservative. This commit changes the criteria to have two components, DC and AC, to address the conservativeness.
First, it checks if all AC energy is enough to contribute a single non-zero quantized AC coefficient. Second, for DC, the decision to skip further considers two possible scenarios: 1. There is no code-able 2nd order DC coefficient at all; 2. The residue is relatively flat, but the uniform DC change is very small, i.e. less than 1/2 gray level per pixel. Compared to the previous criteria, the new criteria are about 10% to 15% faster in encoding time with a very small quality loss (threshold ~1000 and quality range 33dB-45dB). It should be noted that this commit enables an "automatic" static threshold for encode breakout if a small non-zero value is passed in to the encoder. Change-Id: I0f77719a1ac2c2dfddbd950d84920df374515ce3 --- vp8/encoder/rdopt.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 36420aad1..fcff74778 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -2267,22 +2267,28 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int else if (x->encode_breakout) { int sum, sse; + int threshold = (xd->block[0].dequant[1] + * xd->block[0].dequant[1] >>4); + + if(threshold < x->encode_breakout) + threshold = x->encode_breakout; VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var) (x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, (unsigned int *)(&sse), &sum); - if (sse < x->encode_breakout) + if (sse < threshold) { // Check u and v to make sure skip is ok int sse2 = 0; - - // add dc check - if (abs(sum) < (cpi->common.Y2dequant[0][0] << 2)) + /* If theres is no codeable 2nd order dc + or a very small uniform pixel change change */ + if (abs(sum) < (xd->block[24].dequant[0]<<2)|| + ((sum * sum>>8) > sse && abs(sum) <128)) { sse2 = VP8_UVSSE(x, IF_RTCD(&cpi->rtcd.variance)); - if (sse2 * 2 < x->encode_breakout) + if (sse2 * 2 < threshold) { x->skip = 1; distortion2 = sse + sse2; @@ -2428,6 +2434,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, 
MACROBLOCK *x, int recon_yoffset, int if (x->skip) break; + } // Reduce the activation RD thresholds for the best choice mode -- cgit v1.2.3 From 2d03f073a729b74e8c92448dcfd7291f810b2fb1 Mon Sep 17 00:00:00 2001 From: John Koleszar Date: Fri, 28 Jan 2011 11:56:18 -0500 Subject: validate min_q against max_q min_q is required to be <= max_q. Change-Id: I28eccf96df3b52a94913762b54c4fbe0d021ce5e --- vp8/vp8_cx_iface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 903c56c88..b23bd951d 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -142,8 +142,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000); RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den); RANGE_CHECK_HI(cfg, g_profile, 3); - RANGE_CHECK_HI(cfg, rc_min_quantizer, 63); RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); + RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); RANGE_CHECK_HI(cfg, g_threads, 64); #if !(CONFIG_REALTIME_ONLY) RANGE_CHECK_HI(cfg, g_lag_in_frames, 25); -- cgit v1.2.3