diff options
-rw-r--r-- | vp9/encoder/vp9_rd.c | 40 | ||||
-rw-r--r-- | vp9/encoder/vp9_rd.h | 5 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 91 | ||||
-rw-r--r-- | vp9/encoder/vp9_speed_features.c | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_speed_features.h | 4 |
5 files changed, 54 insertions, 88 deletions
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 58dd75b44..95c95971c 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -513,22 +513,6 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; } -static void model_rd_norm_vec(int xsq_q10[MAX_MB_PLANE], - int r_q10[MAX_MB_PLANE], - int d_q10[MAX_MB_PLANE]) { - int i; - const int one_q10 = 1 << 10; - for (i = 0; i < MAX_MB_PLANE; ++i) { - const int tmp = (xsq_q10[i] >> 2) + 8; - const int k = get_msb(tmp) - 3; - const int xq = (k << 3) + ((tmp >> k) & 0x7); - const int a_q10 = ((xsq_q10[i] - xsq_iq_q10[xq]) << 10) >> (2 + k); - const int b_q10 = one_q10 - a_q10; - r_q10[i] = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; - d_q10[i] = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; - } -} - static const uint32_t MAX_XSQ_Q10 = 245727; void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, @@ -554,30 +538,6 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, } } -// Implements a fixed length vector form of vp9_model_rd_from_var_lapndz where -// vectors are of length MAX_MB_PLANE and all elements of var are non-zero. -void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], - unsigned int n_log2[MAX_MB_PLANE], - unsigned int qstep[MAX_MB_PLANE], - int64_t *rate_sum, int64_t *dist_sum) { - int i; - int xsq_q10[MAX_MB_PLANE], d_q10[MAX_MB_PLANE], r_q10[MAX_MB_PLANE]; - for (i = 0; i < MAX_MB_PLANE; ++i) { - const uint64_t xsq_q10_64 = - (((uint64_t)qstep[i] * qstep[i] << (n_log2[i] + 10)) + (var[i] >> 1)) / - var[i]; - xsq_q10[i] = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10); - } - model_rd_norm_vec(xsq_q10, r_q10, d_q10); - for (i = 0; i < MAX_MB_PLANE; ++i) { - int rate = - ROUND_POWER_OF_TWO(r_q10[i] << n_log2[i], 10 - VP9_PROB_COST_SHIFT); - int64_t dist = (var[i] * (int64_t)d_q10[i] + 512) >> 10; - *rate_sum += rate; - *dist_sum += dist; - } -} - // Disable gcc 12.2 false positive warning. // warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] #if defined(__GNUC__) && !defined(__clang__) diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index efd854edf..6c61ae514 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -164,11 +164,6 @@ void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex); void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, unsigned int qstep, int *rate, int64_t *dist); -void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], - unsigned int n_log2[MAX_MB_PLANE], - unsigned int qstep[MAX_MB_PLANE], - int64_t *rate_sum, int64_t *dist_sum); - int vp9_get_switchable_rate(const struct VP9_COMP *cpi, const MACROBLOCKD *const xd); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 201bf416d..309341682 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -160,10 +160,12 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n, } #if !CONFIG_REALTIME_ONLY -static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, - MACROBLOCKD *xd, int *out_rate_sum, - int64_t *out_dist_sum, int *skip_txfm_sb, - int64_t *skip_sse_sb) { +static int model_rd_for_sb_earlyterm(VP9_COMP *cpi, int mi_row, int mi_col, + BLOCK_SIZE bsize, MACROBLOCK *x, + MACROBLOCKD *xd, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, + int64_t *skip_sse_sb, int do_earlyterm, + int64_t best_rd) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. @@ -176,19 +178,15 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, int64_t total_sse = 0; int skip_flag = 1; const int shift = 6; - int64_t dist; const int dequant_shift = #if CONFIG_VP9_HIGHBITDEPTH (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : #endif // CONFIG_VP9_HIGHBITDEPTH 3; - unsigned int qstep_vec[MAX_MB_PLANE]; - unsigned int nlog2_vec[MAX_MB_PLANE]; - unsigned int sum_sse_vec[MAX_MB_PLANE]; - int any_zero_sum_sse = 0; x->pred_sse[ref] = 0; + // Build prediction signal, compute stats and RD cost on per-plane basis for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblock_plane *const p = &x->plane[i]; struct macroblockd_plane *const pd = &xd->plane[i]; @@ -207,7 +205,14 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, int idx, idy; int lw = b_width_log2_lookup[unit_size] + 2; int lh = b_height_log2_lookup[unit_size] + 2; + unsigned int qstep; + unsigned int nlog2; + int64_t dist = 0; + // Build inter predictor + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); + + // Compute useful stats for (idy = 0; idy < bh; ++idy) { for (idx = 0; idx < bw; ++idx) { uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw); @@ -243,46 +248,36 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, } total_sse += sum_sse; - sum_sse_vec[i] = sum_sse; - any_zero_sum_sse = any_zero_sum_sse || (sum_sse == 0); - qstep_vec[i] = pd->dequant[1] >> dequant_shift; - nlog2_vec[i] = num_pels_log2_lookup[bs]; - } + qstep = pd->dequant[1] >> dequant_shift; + nlog2 = num_pels_log2_lookup[bs]; - // Fast approximate the modelling function. - if (cpi->sf.simple_model_rd_from_var) { - for (i = 0; i < MAX_MB_PLANE; ++i) { + // Fast approximate the modelling function. + if (cpi->sf.simple_model_rd_from_var) { int64_t rate; - const int64_t square_error = sum_sse_vec[i]; - int quantizer = qstep_vec[i]; - - if (quantizer < 120) - rate = (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT); + if (qstep < 120) + rate = ((int64_t)sum_sse * (280 - qstep)) >> (16 - VP9_PROB_COST_SHIFT); else rate = 0; - dist = (square_error * quantizer) >> 8; + dist = ((int64_t)sum_sse * qstep) >> 8; rate_sum += rate; - dist_sum += dist; - } - } else { - if (any_zero_sum_sse) { - for (i = 0; i < MAX_MB_PLANE; ++i) { - int rate; - vp9_model_rd_from_var_lapndz(sum_sse_vec[i], nlog2_vec[i], qstep_vec[i], - &rate, &dist); - rate_sum += rate; - dist_sum += dist; - } } else { - vp9_model_rd_from_var_lapndz_vec(sum_sse_vec, nlog2_vec, qstep_vec, - &rate_sum, &dist_sum); + int rate; + vp9_model_rd_from_var_lapndz(sum_sse, nlog2, qstep, &rate, &dist); + rate_sum += rate; + } + dist_sum += dist; + if (do_earlyterm) { + if (RDCOST(x->rdmult, x->rddiv, rate_sum, + dist_sum << VP9_DIST_SCALE_LOG2) >= best_rd) + return 1; } } - *skip_txfm_sb = skip_flag; *skip_sse_sb = total_sse << VP9_DIST_SCALE_LOG2; *out_rate_sum = (int)rate_sum; *out_dist_sum = dist_sum << VP9_DIST_SCALE_LOG2; + + return 0; } #endif // !CONFIG_REALTIME_ONLY @@ -2964,6 +2959,9 @@ static int64_t handle_inter_mode( int64_t rs_rd; int tmp_skip_sb = 0; int64_t tmp_skip_sse = INT64_MAX; + const int enable_earlyterm = + cpi->sf.early_term_interp_search_plane_rd && cm->interp_filter != i; + int64_t filt_best_rd; mi->interp_filter = i; rs = vp9_get_switchable_rate(cpi, xd); @@ -2997,9 +2995,16 @@ static int64_t handle_inter_mode( xd->plane[j].dst.stride = 64; } } - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, &tmp_skip_sb, - &tmp_skip_sse); + // Compute RD cost with early termination option + filt_best_rd = + cm->interp_filter == SWITCHABLE ? (best_rd - rs_rd) : best_rd; + if (model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd, + &rate_sum, &dist_sum, &tmp_skip_sb, + &tmp_skip_sse, enable_earlyterm, + filt_best_rd)) { + filter_cache[i] = INT64_MAX; + continue; + } rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum); filter_cache[i] = rd; @@ -3067,9 +3072,9 @@ static int64_t handle_inter_mode( // Handles the special case when a filter that is not in the // switchable list (ex. bilinear) is indicated at the frame level, or // skip condition holds. - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb, - &skip_sse_sb); + model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, + &tmp_dist, &skip_txfm_sb, &skip_sse_sb, + 0 /*do_earlyterm*/, INT64_MAX); rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); memcpy(bsse, x->bsse, sizeof(bsse)); diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index ce83a9762..f19385b6a 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -227,6 +227,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->temporal_filter_search_method = NSTEP; sf->tx_size_search_breakout = 1; sf->use_square_partition_only = !boosted; + sf->early_term_interp_search_plane_rd = 1; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; @@ -919,6 +920,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 0; sf->cb_pred_filter_search = 0; + sf->early_term_interp_search_plane_rd = 0; sf->cb_partition_search = 0; sf->motion_field_mode_search = 0; sf->alt_ref_search_fp = 0; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index c2ae970b7..bd8e658cf 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -402,6 +402,10 @@ typedef struct SPEED_FEATURES { // Chessboard pattern prediction filter type search int cb_pred_filter_search; + // This variable enables an early termination of interpolation filter eval + // based on the current rd cost after processing each plane + int early_term_interp_search_plane_rd; + int cb_partition_search; int motion_field_mode_search; |