summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- vp9/encoder/vp9_rd.c 40
-rw-r--r-- vp9/encoder/vp9_rd.h 5
-rw-r--r-- vp9/encoder/vp9_rdopt.c 91
-rw-r--r-- vp9/encoder/vp9_speed_features.c 2
-rw-r--r-- vp9/encoder/vp9_speed_features.h 4
5 files changed, 54 insertions, 88 deletions
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 58dd75b44..95c95971c 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -513,22 +513,6 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
*d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
}
-static void model_rd_norm_vec(int xsq_q10[MAX_MB_PLANE],
- int r_q10[MAX_MB_PLANE],
- int d_q10[MAX_MB_PLANE]) {
- int i;
- const int one_q10 = 1 << 10;
- for (i = 0; i < MAX_MB_PLANE; ++i) {
- const int tmp = (xsq_q10[i] >> 2) + 8;
- const int k = get_msb(tmp) - 3;
- const int xq = (k << 3) + ((tmp >> k) & 0x7);
- const int a_q10 = ((xsq_q10[i] - xsq_iq_q10[xq]) << 10) >> (2 + k);
- const int b_q10 = one_q10 - a_q10;
- r_q10[i] = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
- d_q10[i] = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
- }
-}
-
static const uint32_t MAX_XSQ_Q10 = 245727;
void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
@@ -554,30 +538,6 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
}
}
-// Implements a fixed length vector form of vp9_model_rd_from_var_lapndz where
-// vectors are of length MAX_MB_PLANE and all elements of var are non-zero.
-void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE],
- unsigned int n_log2[MAX_MB_PLANE],
- unsigned int qstep[MAX_MB_PLANE],
- int64_t *rate_sum, int64_t *dist_sum) {
- int i;
- int xsq_q10[MAX_MB_PLANE], d_q10[MAX_MB_PLANE], r_q10[MAX_MB_PLANE];
- for (i = 0; i < MAX_MB_PLANE; ++i) {
- const uint64_t xsq_q10_64 =
- (((uint64_t)qstep[i] * qstep[i] << (n_log2[i] + 10)) + (var[i] >> 1)) /
- var[i];
- xsq_q10[i] = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
- }
- model_rd_norm_vec(xsq_q10, r_q10, d_q10);
- for (i = 0; i < MAX_MB_PLANE; ++i) {
- int rate =
- ROUND_POWER_OF_TWO(r_q10[i] << n_log2[i], 10 - VP9_PROB_COST_SHIFT);
- int64_t dist = (var[i] * (int64_t)d_q10[i] + 512) >> 10;
- *rate_sum += rate;
- *dist_sum += dist;
- }
-}
-
// Disable gcc 12.2 false positive warning.
// warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=]
#if defined(__GNUC__) && !defined(__clang__)
diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h
index efd854edf..6c61ae514 100644
--- a/vp9/encoder/vp9_rd.h
+++ b/vp9/encoder/vp9_rd.h
@@ -164,11 +164,6 @@ void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex);
void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
unsigned int qstep, int *rate, int64_t *dist);
-void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE],
- unsigned int n_log2[MAX_MB_PLANE],
- unsigned int qstep[MAX_MB_PLANE],
- int64_t *rate_sum, int64_t *dist_sum);
-
int vp9_get_switchable_rate(const struct VP9_COMP *cpi,
const MACROBLOCKD *const xd);
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 201bf416d..309341682 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -160,10 +160,12 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n,
}
#if !CONFIG_REALTIME_ONLY
-static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
- MACROBLOCKD *xd, int *out_rate_sum,
- int64_t *out_dist_sum, int *skip_txfm_sb,
- int64_t *skip_sse_sb) {
+static int model_rd_for_sb_earlyterm(VP9_COMP *cpi, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, MACROBLOCK *x,
+ MACROBLOCKD *xd, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb,
+ int64_t *skip_sse_sb, int do_earlyterm,
+ int64_t best_rd) {
// Note our transform coeffs are 8 times an orthogonal transform.
// Hence quantizer step is also 8 times. To get effective quantizer
// we need to divide by 8 before sending to modeling function.
@@ -176,19 +178,15 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
int64_t total_sse = 0;
int skip_flag = 1;
const int shift = 6;
- int64_t dist;
const int dequant_shift =
#if CONFIG_VP9_HIGHBITDEPTH
(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 :
#endif // CONFIG_VP9_HIGHBITDEPTH
3;
- unsigned int qstep_vec[MAX_MB_PLANE];
- unsigned int nlog2_vec[MAX_MB_PLANE];
- unsigned int sum_sse_vec[MAX_MB_PLANE];
- int any_zero_sum_sse = 0;
x->pred_sse[ref] = 0;
+ // Build prediction signal, compute stats and RD cost on per-plane basis
for (i = 0; i < MAX_MB_PLANE; ++i) {
struct macroblock_plane *const p = &x->plane[i];
struct macroblockd_plane *const pd = &xd->plane[i];
@@ -207,7 +205,14 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
int idx, idy;
int lw = b_width_log2_lookup[unit_size] + 2;
int lh = b_height_log2_lookup[unit_size] + 2;
+ unsigned int qstep;
+ unsigned int nlog2;
+ int64_t dist = 0;
+ // Build inter predictor
+ vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
+
+ // Compute useful stats
for (idy = 0; idy < bh; ++idy) {
for (idx = 0; idx < bw; ++idx) {
uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
@@ -243,46 +248,36 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
}
total_sse += sum_sse;
- sum_sse_vec[i] = sum_sse;
- any_zero_sum_sse = any_zero_sum_sse || (sum_sse == 0);
- qstep_vec[i] = pd->dequant[1] >> dequant_shift;
- nlog2_vec[i] = num_pels_log2_lookup[bs];
- }
+ qstep = pd->dequant[1] >> dequant_shift;
+ nlog2 = num_pels_log2_lookup[bs];
- // Fast approximate the modelling function.
- if (cpi->sf.simple_model_rd_from_var) {
- for (i = 0; i < MAX_MB_PLANE; ++i) {
+ // Fast approximate the modelling function.
+ if (cpi->sf.simple_model_rd_from_var) {
int64_t rate;
- const int64_t square_error = sum_sse_vec[i];
- int quantizer = qstep_vec[i];
-
- if (quantizer < 120)
- rate = (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT);
+ if (qstep < 120)
+ rate = ((int64_t)sum_sse * (280 - qstep)) >> (16 - VP9_PROB_COST_SHIFT);
else
rate = 0;
- dist = (square_error * quantizer) >> 8;
+ dist = ((int64_t)sum_sse * qstep) >> 8;
rate_sum += rate;
- dist_sum += dist;
- }
- } else {
- if (any_zero_sum_sse) {
- for (i = 0; i < MAX_MB_PLANE; ++i) {
- int rate;
- vp9_model_rd_from_var_lapndz(sum_sse_vec[i], nlog2_vec[i], qstep_vec[i],
- &rate, &dist);
- rate_sum += rate;
- dist_sum += dist;
- }
} else {
- vp9_model_rd_from_var_lapndz_vec(sum_sse_vec, nlog2_vec, qstep_vec,
- &rate_sum, &dist_sum);
+ int rate;
+ vp9_model_rd_from_var_lapndz(sum_sse, nlog2, qstep, &rate, &dist);
+ rate_sum += rate;
+ }
+ dist_sum += dist;
+ if (do_earlyterm) {
+ if (RDCOST(x->rdmult, x->rddiv, rate_sum,
+ dist_sum << VP9_DIST_SCALE_LOG2) >= best_rd)
+ return 1;
}
}
-
*skip_txfm_sb = skip_flag;
*skip_sse_sb = total_sse << VP9_DIST_SCALE_LOG2;
*out_rate_sum = (int)rate_sum;
*out_dist_sum = dist_sum << VP9_DIST_SCALE_LOG2;
+
+ return 0;
}
#endif // !CONFIG_REALTIME_ONLY
@@ -2964,6 +2959,9 @@ static int64_t handle_inter_mode(
int64_t rs_rd;
int tmp_skip_sb = 0;
int64_t tmp_skip_sse = INT64_MAX;
+ const int enable_earlyterm =
+ cpi->sf.early_term_interp_search_plane_rd && cm->interp_filter != i;
+ int64_t filt_best_rd;
mi->interp_filter = i;
rs = vp9_get_switchable_rate(cpi, xd);
@@ -2997,9 +2995,16 @@ static int64_t handle_inter_mode(
xd->plane[j].dst.stride = 64;
}
}
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
- model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, &tmp_skip_sb,
- &tmp_skip_sse);
+ // Compute RD cost with early termination option
+ filt_best_rd =
+ cm->interp_filter == SWITCHABLE ? (best_rd - rs_rd) : best_rd;
+ if (model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd,
+ &rate_sum, &dist_sum, &tmp_skip_sb,
+ &tmp_skip_sse, enable_earlyterm,
+ filt_best_rd)) {
+ filter_cache[i] = INT64_MAX;
+ continue;
+ }
rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
filter_cache[i] = rd;
@@ -3067,9 +3072,9 @@ static int64_t handle_inter_mode(
// Handles the special case when a filter that is not in the
// switchable list (ex. bilinear) is indicated at the frame level, or
// skip condition holds.
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
- model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb,
- &skip_sse_sb);
+ model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate,
+ &tmp_dist, &skip_txfm_sb, &skip_sse_sb,
+ 0 /*do_earlyterm*/, INT64_MAX);
rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
memcpy(bsse, x->bsse, sizeof(bsse));
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index ce83a9762..f19385b6a 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -227,6 +227,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
sf->temporal_filter_search_method = NSTEP;
sf->tx_size_search_breakout = 1;
sf->use_square_partition_only = !boosted;
+ sf->early_term_interp_search_plane_rd = 1;
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
@@ -919,6 +920,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) {
sf->adaptive_pred_interp_filter = 0;
sf->adaptive_mode_search = 0;
sf->cb_pred_filter_search = 0;
+ sf->early_term_interp_search_plane_rd = 0;
sf->cb_partition_search = 0;
sf->motion_field_mode_search = 0;
sf->alt_ref_search_fp = 0;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index c2ae970b7..bd8e658cf 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -402,6 +402,10 @@ typedef struct SPEED_FEATURES {
// Chessboard pattern prediction filter type search
int cb_pred_filter_search;
+ // This variable enables an early termination of interpolation filter eval
+ // based on the current rd cost after processing each plane
+ int early_term_interp_search_plane_rd;
+
int cb_partition_search;
int motion_field_mode_search;