diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/vp9_reconinter.c | 7 | ||||
-rw-r--r-- | vp9/common/vp9_reconinter.h | 3 | ||||
-rw-r--r-- | vp9/decoder/vp9_decodeframe.c | 130 | ||||
-rw-r--r-- | vp9/decoder/vp9_decoder.h | 1 | ||||
-rw-r--r-- | vp9/decoder/vp9_dthread.c | 15 | ||||
-rw-r--r-- | vp9/encoder/vp9_block.h | 4 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 44 | ||||
-rw-r--r-- | vp9/encoder/vp9_mcomp.c | 11 | ||||
-rw-r--r-- | vp9/encoder/vp9_pickmode.c | 82 |
9 files changed, 260 insertions, 37 deletions
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index ed3ea7e1f..1be358e87 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -264,11 +264,18 @@ void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize) { build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0); } + +void vp9_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize, int plane) { + build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, plane, plane); +} + void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize) { build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1, MAX_MB_PLANE - 1); } + void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize) { build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h index d5ecf85b4..e7057445a 100644 --- a/vp9/common/vp9_reconinter.h +++ b/vp9/common/vp9_reconinter.h @@ -52,6 +52,9 @@ void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize); +void vp9_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize, int plane); + void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize); diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 07249d092..31dd7ebcd 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -1064,8 +1064,7 @@ static int tile_worker_hook(TileWorkerData *const tile_data, vp9_zero(tile_data->xd.left_seg_context); for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) { - decode_partition(tile_data->pbi, &tile_data->xd, - &tile_data->pbi->common.counts, + decode_partition(tile_data->pbi, &tile_data->xd, &tile_data->counts, tile, mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64); } @@ -1086,6 +1085,105 @@ static int compare_tile_buffers(const void *a, const void *b) { } } +// Accumulate frame counts. +static void accumulate_frame_counts(VP9_COMMON *cm, FRAME_COUNTS *counts) { + int i, j, k, l, m, n; + + for (i = 0; i < BLOCK_SIZE_GROUPS; i++) + for (j = 0; j < INTRA_MODES; j++) + cm->counts.y_mode[i][j] += counts->y_mode[i][j]; + + for (i = 0; i < INTRA_MODES; i++) + for (j = 0; j < INTRA_MODES; j++) + cm->counts.uv_mode[i][j] += counts->uv_mode[i][j]; + + for (i = 0; i < PARTITION_CONTEXTS; i++) + for (j = 0; j < PARTITION_TYPES; j++) + cm->counts.partition[i][j] += counts->partition[i][j]; + + for (i = 0; i < TX_SIZES; i++) + for (j = 0; j < PLANE_TYPES; j++) + for (k = 0; k < REF_TYPES; k++) + for (l = 0; l < COEF_BANDS; l++) + for (m = 0; m < COEFF_CONTEXTS; m++) { + cm->counts.eob_branch[i][j][k][l][m] += + counts->eob_branch[i][j][k][l][m]; + for (n = 0; n < UNCONSTRAINED_NODES + 1; n++) + cm->counts.coef[i][j][k][l][m][n] += + counts->coef[i][j][k][l][m][n]; + } + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) + for (j = 0; j < SWITCHABLE_FILTERS; j++) + cm->counts.switchable_interp[i][j] += counts->switchable_interp[i][j]; + + for (i = 0; i < INTER_MODE_CONTEXTS; i++) + for (j = 0; j < INTER_MODES; j++) + cm->counts.inter_mode[i][j] += counts->inter_mode[i][j]; + + for (i = 0; i < INTRA_INTER_CONTEXTS; i++) + for (j = 0; j < 2; j++) + cm->counts.intra_inter[i][j] += counts->intra_inter[i][j]; + + for (i = 0; i < COMP_INTER_CONTEXTS; i++) + for (j = 0; j < 2; j++) + cm->counts.comp_inter[i][j] += counts->comp_inter[i][j]; + + for (i = 0; i < REF_CONTEXTS; i++) + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) + cm->counts.single_ref[i][j][k] += counts->single_ref[i][j][k]; + + for (i = 0; i < REF_CONTEXTS; i++) + for (j = 0; j < 2; j++) + cm->counts.comp_ref[i][j] += counts->comp_ref[i][j]; + + for (i = 0; i < TX_SIZE_CONTEXTS; i++) { + for (j = 0; j < TX_SIZES; j++) + cm->counts.tx.p32x32[i][j] += counts->tx.p32x32[i][j]; + + for (j = 0; j < TX_SIZES - 1; j++) + cm->counts.tx.p16x16[i][j] += counts->tx.p16x16[i][j]; + + for (j = 0; j < TX_SIZES - 2; j++) + cm->counts.tx.p8x8[i][j] += counts->tx.p8x8[i][j]; + } + + for (i = 0; i < SKIP_CONTEXTS; i++) + for (j = 0; j < 2; j++) + cm->counts.skip[i][j] += counts->skip[i][j]; + + for (i = 0; i < MV_JOINTS; i++) + cm->counts.mv.joints[i] += counts->mv.joints[i]; + + for (k = 0; k < 2; k++) { + nmv_component_counts *comps = &cm->counts.mv.comps[k]; + nmv_component_counts *comps_t = &counts->mv.comps[k]; + + for (i = 0; i < 2; i++) { + comps->sign[i] += comps_t->sign[i]; + comps->class0_hp[i] += comps_t->class0_hp[i]; + comps->hp[i] += comps_t->hp[i]; + } + + for (i = 0; i < MV_CLASSES; i++) + comps->classes[i] += comps_t->classes[i]; + + for (i = 0; i < CLASS0_SIZE; i++) { + comps->class0[i] += comps_t->class0[i]; + for (j = 0; j < MV_FP_SIZE; j++) + comps->class0_fp[i][j] += comps_t->class0_fp[i][j]; + } + + for (i = 0; i < MV_OFFSET_BITS; i++) + for (j = 0; j < 2; j++) + comps->bits[i][j] += comps_t->bits[i][j]; + + for (i = 0; i < MV_FP_SIZE; i++) + comps->fp[i] += comps_t->fp[i]; + } +} + static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, const uint8_t *data_end) { @@ -1172,6 +1270,17 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, } } + // Initialize thread frame counts. + if (!cm->frame_parallel_decoding_mode) { + int i; + + for (i = 0; i < num_workers; ++i) { + TileWorkerData *const tile_data = + (TileWorkerData*)pbi->tile_workers[i].data1; + vp9_zero(tile_data->counts); + } + } + n = 0; while (n < tile_cols) { int i; @@ -1184,7 +1293,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, tile_data->pbi = pbi; tile_data->xd = pbi->mb; tile_data->xd.corrupted = 0; - vp9_tile_init(tile, &pbi->common, 0, buf->col); + vp9_tile_init(tile, cm, 0, buf->col); setup_token_decoder(buf->data, data_end, buf->size, &cm->error, &tile_data->bit_reader, pbi->decrypt_cb, pbi->decrypt_state); @@ -1218,6 +1327,15 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, bit_reader_end = vp9_reader_find_end(&tile_data->bit_reader); final_worker = -1; } + + // Accumulate thread frame counts. + if (n >= tile_cols && !cm->frame_parallel_decoding_mode) { + for (i = 0; i < num_workers; ++i) { + TileWorkerData *const tile_data = + (TileWorkerData*)pbi->tile_workers[i].data1; + accumulate_frame_counts(cm, &tile_data->counts); + } + } } return bit_reader_end; @@ -1673,10 +1791,8 @@ void vp9_decode_frame(VP9Decoder *pbi, vp9_frameworker_unlock_stats(worker); } - // TODO(jzern): remove frame_parallel_decoding_mode restriction for - // single-frame tile decoding. - if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 && - cm->frame_parallel_decoding_mode) { + if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) { + // Multi-threaded tile decoder *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end); if (!xd->corrupted) { // If multiple threads are used to decode tiles, then we use those threads diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index 47cce068f..4dfbe8171 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -36,6 +36,7 @@ typedef struct TileData { typedef struct TileWorkerData { struct VP9Decoder *pbi; vp9_reader bit_reader; + FRAME_COUNTS counts; DECLARE_ALIGNED(16, MACROBLOCKD, xd); struct vpx_internal_error_info error_info; } TileWorkerData; diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c index 7aa888848..d2a2b819c 100644 --- a/vp9/decoder/vp9_dthread.c +++ b/vp9/decoder/vp9_dthread.c @@ -45,6 +45,13 @@ void vp9_frameworker_signal_stats(VP9Worker *const worker) { #endif } +// This macro prevents thread_sanitizer from reporting known concurrent writes. +#if defined(__has_feature) +#if __has_feature(thread_sanitizer) +#define BUILDING_WITH_TSAN +#endif +#endif + // TODO(hkuang): Remove worker parameter as it is only used in debug code. void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf, int row) { @@ -52,9 +59,11 @@ void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf, if (!ref_buf) return; - // Enabling the following line of code will get harmless tsan error but - // will get best performance. - // if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return; +#ifndef BUILDING_WITH_TSAN + // The following line of code will get harmless tsan error but it is the key + // to get best performance. + if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return; +#endif { // Find the worker thread that owns the reference frame. If the reference diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 68174a6cc..04a1b8f3c 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -118,6 +118,10 @@ struct macroblock { // Used to store sub partition's choices. MV pred_mv[MAX_REF_FRAMES]; + // Strong color activity detection. Used in RTC coding mode to enhance + // the visual quality at the boundary of moving color objects. + uint8_t color_sensitivity[2]; + void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride); void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob); #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 091013060..d17b051ec 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -504,7 +504,7 @@ static void choose_partitioning(VP9_COMP *cpi, threshold_base = (int64_t)(threshold_multiplier * vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth)); threshold = threshold_base; - threshold_bsize_min = threshold_base << 6; + threshold_bsize_min = threshold_base << cpi->oxcf.speed; threshold_bsize_max = threshold_base; // Modify thresholds for key frame and for low-resolutions (set lower @@ -529,12 +529,23 @@ static void choose_partitioning(VP9_COMP *cpi, if (cm->frame_type != KEY_FRAME) { MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi; + unsigned int var = 0, sse; vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, sf); mbmi->ref_frame[0] = LAST_FRAME; mbmi->ref_frame[1] = NONE; mbmi->sb_type = BLOCK_64X64; mbmi->mv[0].as_int = 0; - vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_64X64); + vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); + + for (i = 1; i <= 2; ++i) { + struct macroblock_plane *p = &x->plane[i]; + struct macroblockd_plane *pd = &xd->plane[i]; + const BLOCK_SIZE bs = get_plane_block_size(BLOCK_64X64, pd); + var += cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, &sse); + if (sse > 2048) + x->color_sensitivity[i - 1] = 1; + } d = xd->plane[0].dst.buf; dp = xd->plane[0].dst.stride; @@ -2738,6 +2749,10 @@ static MV_REFERENCE_FRAME get_frame_type(const VP9_COMP *cpi) { static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) { if (xd->lossless) return ONLY_4X4; + if (cpi->common.frame_type == KEY_FRAME && + cpi->sf.use_nonrd_pick_mode && + cpi->sf.partition_search_type == VAR_BASED_PARTITION) + return ALLOW_16X16; if (cpi->sf.tx_size_search_method == USE_LARGESTALL) return ALLOW_32X32; else if (cpi->sf.tx_size_search_method == USE_FULL_RD|| @@ -3382,6 +3397,8 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, x->source_variance = UINT_MAX; vp9_zero(x->pred_mv); vp9_rd_cost_init(&dummy_rdc); + x->color_sensitivity[0] = 0; + x->color_sensitivity[1] = 0; // Set the partition type of the 64X64 block switch (sf->partition_search_type) { @@ -3671,14 +3688,6 @@ static void encode_frame_internal(VP9_COMP *cpi) { cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; - cm->tx_mode = select_tx_mode(cpi, xd); - if (cm->frame_type == KEY_FRAME && - cpi->sf.use_nonrd_pick_mode && - cpi->sf.partition_search_type == VAR_BASED_PARTITION) { - cm->tx_mode = ALLOW_16X16; - } - - #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4; @@ -3691,10 +3700,10 @@ static void encode_frame_internal(VP9_COMP *cpi) { #endif // CONFIG_VP9_HIGHBITDEPTH x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; - if (xd->lossless) { + if (xd->lossless) x->optimize = 0; - cm->lf.filter_level = 0; - } + + cm->tx_mode = select_tx_mode(cpi, xd); vp9_frame_init_quantizer(cpi); @@ -3782,9 +3791,6 @@ static INTERP_FILTER get_interp_filter( void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - RD_OPT *const rd_opt = &cpi->rd; - FRAME_COUNTS *counts = cpi->td.counts; - RD_COUNTS *const rdc = &cpi->td.rd_counts; // In the longer term the encoder should be generalized to match the // decoder such that we allow compound where one of the 3 buffers has a @@ -3806,11 +3812,11 @@ void vp9_encode_frame(VP9_COMP *cpi) { } } - vpx_memset(cpi->td.counts->tx.tx_totals, 0, - sizeof(cpi->td.counts->tx.tx_totals)); - if (cpi->sf.frame_parameter_update) { int i; + RD_OPT *const rd_opt = &cpi->rd; + FRAME_COUNTS *counts = cpi->td.counts; + RD_COUNTS *const rdc = &cpi->td.rd_counts; // This code does a single RD pass over the whole frame assuming // either compound, single or hybrid prediction as per whatever has diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index a428f1a2d..159e0fc0c 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -90,13 +90,10 @@ static int mv_err_cost(const MV *mv, const MV *ref, static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref, int error_per_bit) { - if (x->nmvsadcost) { - const MV diff = { mv->row - ref->row, - mv->col - ref->col }; - return ROUND_POWER_OF_TWO(mv_cost(&diff, x->nmvjointsadcost, - x->nmvsadcost) * error_per_bit, 8); - } - return 0; + const MV diff = { mv->row - ref->row, + mv->col - ref->col }; + return ROUND_POWER_OF_TWO(mv_cost(&diff, x->nmvjointsadcost, + x->nmvsadcost) * error_per_bit, 8); } void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) { diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index e239c008f..a34b12258 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -283,6 +283,71 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, x->skip_txfm[0] = 1; } +static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, + int *out_rate_sum, int64_t *out_dist_sum, + unsigned int *var_y, unsigned int *sse_y) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + unsigned int sse; + int rate; + int64_t dist; + int i; + + *out_rate_sum = 0; + *out_dist_sum = 0; + + for (i = 1; i <= 2; ++i) { + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &xd->plane[i]; + const uint32_t dc_quant = pd->dequant[0]; + const uint32_t ac_quant = pd->dequant[1]; + const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); + unsigned int var; + + if (!x->color_sensitivity[i - 1]) + continue; + + var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, &sse); + *var_y += var; + *sse_y += sse; + + #if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs], + dc_quant >> (xd->bd - 5), &rate, &dist); + } else { + vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs], + dc_quant >> 3, &rate, &dist); + } + #else + vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs], + dc_quant >> 3, &rate, &dist); + #endif // CONFIG_VP9_HIGHBITDEPTH + + *out_rate_sum += rate >> 1; + *out_dist_sum += dist << 3; + + #if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], + ac_quant >> (xd->bd - 5), &rate, &dist); + } else { + vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], + ac_quant >> 3, &rate, &dist); + } + #else + vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], + ac_quant >> 3, &rate, &dist); + #endif // CONFIG_VP9_HIGHBITDEPTH + + *out_rate_sum += rate; + *out_dist_sum += dist << 4; + } +} + static int get_pred_buffer(PRED_BUFFER *p, int len) { int i; @@ -658,7 +723,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, continue; // Select prediction reference frames. - xd->plane[0].pre[0] = yv12_mb[ref_frame][0]; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; clamp_mv2(&frame_mv[NEARESTMV][ref_frame].as_mv, xd); clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd); @@ -776,6 +842,20 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, &var_y, &sse_y); } + // chroma component rate-distortion cost modeling + if (x->color_sensitivity[0] || x->color_sensitivity[1]) { + int uv_rate = 0; + int64_t uv_dist = 0; + if (x->color_sensitivity[0]) + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1); + if (x->color_sensitivity[1]) + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2); + model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist, + &var_y, &sse_y); + this_rdc.rate += uv_rate; + this_rdc.dist += uv_dist; + } + this_rdc.rate += rate_mv; this_rdc.rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]] [INTER_OFFSET(this_mode)]; |