From 9c63e887e675d2bf5934c82a92f289ed33f0c8a4 Mon Sep 17 00:00:00 2001 From: Jim Bankoski Date: Mon, 3 Mar 2014 11:49:13 -0800 Subject: cost_coefs approximation speed up This patch adds a new speed feature which doesn't do the rather expensive entropy context lookup or save to the table, while doing costing. The speed up on desktop36p.y4m is around 10% other clips much less. On the RTC test set this was + 1% in overall datarate. Change-Id: Ia5144bbf45270671e7be9c8e4055369909e2f738 --- vp9/encoder/vp9_onyx_if.c | 5 ++++ vp9/encoder/vp9_onyx_int.h | 4 +++ vp9/encoder/vp9_rdopt.c | 70 +++++++++++++++++++++++++++++----------------- 3 files changed, 53 insertions(+), 26 deletions(-) (limited to 'vp9') diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index d7bac6f0b..db0fd80fa 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -662,6 +662,7 @@ static void set_good_speed_feature(VP9_COMMON *cm, sf->use_lp32x32fdct = 1; sf->subpel_iters_per_step = 1; sf->use_fast_coef_updates = 2; + sf->use_fast_coef_costing = 1; sf->adaptive_rd_thresh = 4; sf->mode_skip_start = 6; @@ -698,6 +699,7 @@ static void set_good_speed_feature(VP9_COMMON *cm, sf->use_lp32x32fdct = 1; sf->subpel_iters_per_step = 1; sf->use_fast_coef_updates = 2; + sf->use_fast_coef_costing = 1; sf->adaptive_rd_thresh = 4; sf->mode_skip_start = 6; @@ -730,6 +732,7 @@ static void set_good_speed_feature(VP9_COMMON *cm, sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY; } sf->use_fast_coef_updates = 2; + sf->use_fast_coef_costing = 1; sf->adaptive_rd_thresh = 4; sf->mode_skip_start = 6; } @@ -742,6 +745,7 @@ static void set_rt_speed_feature(VP9_COMMON *cm, sf->adaptive_rd_thresh = 1; sf->recode_loop = ((speed < 1) ? ALLOW_RECODE : ALLOW_RECODE_KFMAXBW); sf->encode_breakout_thresh = 1; + sf->use_fast_coef_costing = 1; if (speed == 1) { sf->use_square_partition_only = !frame_is_intra_only(cm); @@ -923,6 +927,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_uv_intra_rd_estimate = 0; sf->use_fast_lpf_pick = 0; sf->use_fast_coef_updates = 0; + sf->use_fast_coef_costing = 0; sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set sf->use_nonrd_pick_mode = 0; sf->encode_breakout_thresh = 0; diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index 019cb1388..f1e5e3a68 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -420,6 +420,10 @@ typedef struct { // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV // modes are disabled in order from LSB to MSB for each BLOCK_SIZE. int disable_inter_mode_mask[BLOCK_SIZES]; + + // This feature controls whether we do the expensive context update and + // calculation in the rd coefficient costing loop. + int use_fast_coef_costing; } SPEED_FEATURES; typedef struct { diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 76683dae4..3b2d78889 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -72,6 +72,7 @@ struct rdcost_block_args { int64_t this_rd; int64_t best_rd; int skip; + int use_fast_coef_costing; const scan_order *so; }; @@ -549,12 +550,12 @@ static const int16_t band_counts[TX_SIZES][8] = { { 1, 2, 3, 4, 11, 256 - 21, 0 }, { 1, 2, 3, 4, 11, 1024 - 21, 0 }, }; - static INLINE int cost_coeffs(MACROBLOCK *x, int plane, int block, ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L, TX_SIZE tx_size, - const int16_t *scan, const int16_t *nb) { + const int16_t *scan, const int16_t *nb, + int use_fast_coef_costing) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; struct macroblock_plane *p = &x->plane[plane]; @@ -568,7 +569,6 @@ static INLINE int cost_coeffs(MACROBLOCK *x, uint8_t *p_tok = x->token_cache; int pt = combine_entropy_contexts(*A, *L); int c, cost; - // Check for consistency of tx_size with mode info assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size : get_uv_tx_size(mbmi) == tx_size); @@ -594,9 +594,13 @@ static INLINE int cost_coeffs(MACROBLOCK *x, v = qcoeff[rc]; t = vp9_dct_value_tokens_ptr[v].token; - pt = get_coef_context(nb, p_tok, c); - cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v]; - p_tok[rc] = vp9_pt_energy_class[t]; + if (use_fast_coef_costing) { + cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v]; + } else { + pt = get_coef_context(nb, p_tok, c); + cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v]; + p_tok[rc] = vp9_pt_energy_class[t]; + } prev_t = t; if (!--band_left) { band_left = *band_count++; @@ -606,8 +610,12 @@ static INLINE int cost_coeffs(MACROBLOCK *x, // eob token if (band_left) { - pt = get_coef_context(nb, p_tok, c); - cost += (*token_costs)[0][pt][EOB_TOKEN]; + if (use_fast_coef_costing) { + cost += (*token_costs)[0][!prev_t][EOB_TOKEN]; + } else { + pt = get_coef_context(nb, p_tok, c); + cost += (*token_costs)[0][pt][EOB_TOKEN]; + } } } @@ -616,7 +624,6 @@ static INLINE int cost_coeffs(MACROBLOCK *x, return cost; } - static void dist_block(int plane, int block, TX_SIZE tx_size, struct rdcost_block_args* args) { const int ss_txfrm_size = tx_size << 1; @@ -648,7 +655,8 @@ static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize, args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx, args->t_left + y_idx, tx_size, - args->so->scan, args->so->neighbors); + args->so->scan, args->so->neighbors, + args->use_fast_coef_costing); } static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, @@ -732,12 +740,14 @@ static void txfm_rd_in_plane(MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, int64_t *sse, int64_t ref_best_rd, int plane, - BLOCK_SIZE bsize, TX_SIZE tx_size) { + BLOCK_SIZE bsize, TX_SIZE tx_size, + int use_fast_coef_casting) { MACROBLOCKD *const xd = &x->e_mbd; struct macroblockd_plane *const pd = &xd->plane[plane]; struct rdcost_block_args args = { 0 }; args.x = x; args.best_rd = ref_best_rd; + args.use_fast_coef_costing = use_fast_coef_casting; if (plane == 0) xd->mi_8x8[0]->mbmi.tx_size = tx_size; @@ -776,7 +786,7 @@ static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size], ref_best_rd, 0, bs, - mbmi->tx_size); + mbmi->tx_size, cpi->sf.use_fast_coef_costing); cpi->tx_stepdown_count[0]++; } @@ -920,7 +930,8 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, // Actually encode using the chosen mode if a model was used, but do not // update the r, d costs txfm_rd_in_plane(x, rate, distortion, skip, - &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size); + &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size, + cpi->sf.use_fast_coef_costing); if (max_tx_size == TX_32X32 && best_tx == TX_32X32) { cpi->tx_stepdown_count[0]++; @@ -968,7 +979,8 @@ static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size], &s[tx_size], &sse[tx_size], - ref_best_rd, 0, bs, tx_size); + ref_best_rd, 0, bs, tx_size, + cpi->sf.use_fast_coef_costing); choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache, bs); } @@ -997,7 +1009,8 @@ static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, for (tx_size = TX_4X4; tx_size <= max_txsize_lookup[bs]; ++tx_size) txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size], &s[tx_size], &sse[tx_size], - ref_best_rd, 0, bs, tx_size); + ref_best_rd, 0, bs, tx_size, + cpi->sf.use_fast_coef_costing); choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache, bs); } @@ -1100,7 +1113,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, vp9_fwht4x4(src_diff, coeff, 8); vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4, - so->scan, so->neighbors); + so->scan, so->neighbors, + cpi->sf.use_fast_coef_costing); if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) goto next; vp9_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, dst_stride, @@ -1112,7 +1126,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, vp9_fht4x4(src_diff, coeff, 8, tx_type); vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4, - so->scan, so->neighbors); + so->scan, so->neighbors, + cpi->sf.use_fast_coef_costing); distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >> 2; if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) @@ -1296,7 +1311,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, return best_rd; } -static void super_block_uvrd(MACROBLOCK *x, +static void super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, int64_t *sse, BLOCK_SIZE bsize, int64_t ref_best_rd) { @@ -1323,7 +1338,8 @@ static void super_block_uvrd(MACROBLOCK *x, for (plane = 1; plane < MAX_MB_PLANE; ++plane) { txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse, - ref_best_rd, plane, bsize, uv_txfm_size); + ref_best_rd, plane, bsize, uv_txfm_size, + cpi->sf.use_fast_coef_costing); if (pnrate == INT_MAX) goto term; *rate += pnrate; @@ -1359,7 +1375,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, xd->mi_8x8[0]->mbmi.uv_mode = mode; - super_block_uvrd(x, &this_rate_tokenonly, + super_block_uvrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, &this_sse, bsize, best_rd); if (this_rate_tokenonly == INT_MAX) continue; @@ -1402,14 +1418,15 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, return best_rd; } -static int64_t rd_sbuv_dcpred(const VP9_COMMON *cm, MACROBLOCK *x, +static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize) { + const VP9_COMMON *cm = &cpi->common; int64_t unused; x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED; - super_block_uvrd(x, rate_tokenonly, distortion, + super_block_uvrd(cpi, x, rate_tokenonly, distortion, skippable, &unused, bsize, INT64_MAX); *rate = *rate_tokenonly + x->intra_uv_mode_cost[cm->frame_type][DC_PRED]; return RDCOST(x->rdmult, x->rddiv, *rate, *distortion); @@ -1425,7 +1442,7 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, // Use an estimated rd for uv_intra based on DC_PRED if the // appropriate speed flag is set. if (cpi->sf.use_uv_intra_rd_estimate) { - rd_sbuv_dcpred(&cpi->common, x, rate_uv, rate_uv_tokenonly, dist_uv, + rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); // Else do a proper rd search for each possible transform size that may // be considered in the main rd loop. @@ -1588,7 +1605,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, 16, &ssz); thissse += ssz; thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4, - so->scan, so->neighbors); + so->scan, so->neighbors, + cpi->sf.use_fast_coef_costing); rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2); rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2); rd = MIN(rd1, rd2); @@ -3026,7 +3044,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion); rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse)); - super_block_uvrd(x, rate_uv, distortion_uv, &skippable_uv, &sseuv, + super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv, bsize, ref_best_rd - rdcosty); if (*rate_uv == INT_MAX) { *rate2 = INT_MAX; @@ -4145,7 +4163,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // then dont bother looking at UV vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8); - super_block_uvrd(x, &rate_uv, &distortion_uv, &uv_skippable, + super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable, &uv_sse, BLOCK_8X8, tmp_best_rdu); if (rate_uv == INT_MAX) continue; -- cgit v1.2.3