From b54cdcc3de62390dc438a36425665a89958aefea Mon Sep 17 00:00:00 2001 From: Hui Su Date: Fri, 20 Jul 2018 15:29:14 -0700 Subject: Add prune_ref_frame_for_rect_partitions feature Add a speed feature to prune reference frames for rectangular partitions. Rectangular partition RD search happens after square partition RD search. With this feature, we keep a record of the ref frames picked by square partitions, and only consider those ref frames during rect partition RD search. With this feature on, the computation cost of rect partition RD search is greatly reduced, so we can afford to skip rect partition RD search less aggressively. Overall, both compression and encoding speed are improved. Only speed 0 is affected. Coding gains (lowres / midres / hdres): ovr psnr 0.00% / -0.36% / -0.37%; avg psnr 0.00% / -0.36% / -0.36%. Tested encoding speed with QP=40 on about 30 sequences. Speed gains (lowres / midres / hdres): average 13.4% / 7.1% / 6.1%; max 28.0% / 12.0% / 9.8%. Change-Id: Id5f36dd2ac75028ae98550d67b0a524aa251b692 --- vp9/encoder/vp9_context_tree.h | 2 ++ vp9/encoder/vp9_encodeframe.c | 53 +++++++++++++++++++++++++++++++++++++--- vp9/encoder/vp9_rdopt.c | 9 +++++++ vp9/encoder/vp9_speed_features.c | 15 +++++++----- vp9/encoder/vp9_speed_features.h | 3 +++ 5 files changed, 73 insertions(+), 9 deletions(-) diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h index 73423c075..2bcc26e94 100644 --- a/vp9/encoder/vp9_context_tree.h +++ b/vp9/encoder/vp9_context_tree.h @@ -75,6 +75,8 @@ typedef struct { // Used for the machine learning-based early termination int32_t sum_y_eobs; + // Skip certain ref frames during RD search of rectangular partitions. 
+ uint8_t skip_ref_frame_mask; } PICK_MODE_CONTEXT; typedef struct PC_TREE { diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 8f622d0ed..f3a4ae7fe 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -2633,6 +2633,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, ctx, INT64_MAX); break; case PARTITION_HORZ: + pc_tree->horizontal[0].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, subsize, &pc_tree->horizontal[0], INT64_MAX); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && @@ -2642,6 +2643,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, vp9_rd_cost_init(&tmp_rdc); update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + pc_tree->horizontal[1].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row + (mi_step >> 1), mi_col, &tmp_rdc, subsize, &pc_tree->horizontal[1], INT64_MAX); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { @@ -2654,6 +2656,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, } break; case PARTITION_VERT: + pc_tree->vertical[0].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, subsize, &pc_tree->vertical[0], INT64_MAX); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && @@ -2663,6 +2666,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, vp9_rd_cost_init(&tmp_rdc); update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + pc_tree->vertical[bsize > BLOCK_8X8].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1), &tmp_rdc, subsize, &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX); @@ -3712,10 +3716,12 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_thr.dist; int 
rate_breakout_thr = cpi->sf.partition_search_breakout_thr.rate; int must_split = 0; - int partition_mul = cpi->sf.enable_tpl_model && cpi->oxcf.aq_mode == NO_AQ ? x->cb_rdmult : cpi->rd.RDMULT; + // Ref frames picked in the [i_th] quarter subblock during square partition + // RD search. It may be used to prune ref frame selection of rect partitions. + uint8_t ref_frames_used[4] = { 0, 0, 0, 0 }; (void)*tp_orig; @@ -3846,6 +3852,14 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize, ctx, best_rdc.rdcost); if (this_rdc.rate != INT_MAX) { + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int ref1 = ctx->mic.ref_frame[0]; + const int ref2 = ctx->mic.ref_frame[1]; + for (i = 0; i < 4; ++i) { + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } + } if (bsize >= BLOCK_8X8) { this_rdc.rdcost += RDCOST(partition_mul, x->rddiv, cpi->partition_cost[pl][PARTITION_NONE], 0); @@ -3970,8 +3984,18 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, pc_tree->leaf_split[0]->pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, pc_tree->leaf_split[0], best_rdc.rdcost); - - if (sum_rdc.rate == INT_MAX) sum_rdc.rdcost = INT64_MAX; + if (sum_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + } else { + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int ref1 = pc_tree->leaf_split[0]->mic.ref_frame[0]; + const int ref2 = pc_tree->leaf_split[0]->mic.ref_frame[1]; + for (i = 0; i < 4; ++i) { + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } + } + } } else { for (i = 0; (i < 4) && ((sum_rdc.rdcost < best_rdc.rdcost) || must_split); ++i) { @@ -3999,6 +4023,13 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, sum_rdc.rdcost = INT64_MAX; break; } else { + if (cpi->sf.prune_ref_frame_for_rect_partitions && + 
pc_tree->split[i]->none.rate != INT_MAX) { + const int ref1 = pc_tree->split[i]->none.mic.ref_frame[0]; + const int ref2 = pc_tree->split[i]->none.mic.ref_frame[1]; + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; sum_rdc.rdcost += this_rdc.rdcost; @@ -4036,6 +4067,22 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); } + pc_tree->horizontal[0].skip_ref_frame_mask = 0; + pc_tree->horizontal[1].skip_ref_frame_mask = 0; + pc_tree->vertical[0].skip_ref_frame_mask = 0; + pc_tree->vertical[1].skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + uint8_t used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[1]; + if (used_frames) pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[2] | ref_frames_used[3]; + if (used_frames) pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[2]; + if (used_frames) pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[1] | ref_frames_used[3]; + if (used_frames) pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames; + } + // PARTITION_HORZ if (partition_horz_allowed && (do_rect || vp9_active_h_edge(cpi, mi_row, mi_step))) { diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index e3672edf5..ad4b48a9c 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -3073,6 +3073,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, // lock mechanism involved with reads from // tile_mode_map const int mode_search_skip_flags = sf->mode_search_skip_flags; + const int is_rect_partition = + num_4x4_blocks_wide_lookup[bsize] != num_4x4_blocks_high_lookup[bsize]; int64_t mask_filter = 0; int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS]; @@ -3224,6 +3226,13 @@ void 
vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, vp9_zero(x->sum_y_eobs); + if (is_rect_partition) { + if (ctx->skip_ref_frame_mask & (1 << ref_frame)) continue; + if (second_ref_frame > 0 && + (ctx->skip_ref_frame_mask & (1 << second_ref_frame))) + continue; + } + // Look at the reference frame of the best mode so far and set the // skip mask to look at a subset of the remaining modes. if (midx == mode_skip_start && best_mode_index >= 0) { diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 80adf845e..75a8de270 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -70,11 +70,14 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, // speed 0 features sf->partition_search_breakout_thr.dist = (1 << 20); sf->partition_search_breakout_thr.rate = 80; + sf->use_square_only_threshold = BLOCK_SIZES; - // Currently, the machine-learning based partition search early termination - // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0. if (is_480p_or_larger) { + // Currently, the machine-learning based partition search early termination + // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0. 
sf->ml_partition_search_early_termination = 1; + } else { + sf->use_square_only_threshold = BLOCK_32X32; } if (!is_1080p_or_larger) { @@ -92,6 +95,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, if (speed >= 1) { sf->ml_partition_search_early_termination = 0; + sf->use_square_only_threshold = BLOCK_4X4; if (is_720p_or_larger) { sf->disable_split_mask = @@ -193,7 +197,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->allow_skip_recode = 1; sf->less_rectangular_check = 1; sf->use_square_partition_only = !frame_is_boosted(cpi); - sf->use_square_only_threshold = BLOCK_16X16; + sf->prune_ref_frame_for_rect_partitions = 1; if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { sf->exhaustive_searches_thresh = (1 << 22); @@ -210,6 +214,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, if (speed >= 1) { sf->enable_tpl_model = 0; + sf->prune_ref_frame_for_rect_partitions = 0; if (oxcf->pass == 2) { TWO_PASS *const twopass = &cpi->twopass; if ((twopass->fr_content_type == FC_GRAPHICS_ANIMATION) || @@ -226,10 +231,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->tx_domain_thresh = tx_dom_thresholds[(speed < 6) ? speed : 5]; sf->allow_quant_coeff_opt = sf->optimize_coefficients; sf->quant_opt_thresh = qopt_thresholds[(speed < 6) ? 
speed : 5]; - - sf->use_square_only_threshold = BLOCK_4X4; sf->less_rectangular_check = 1; - sf->use_rd_breakout = 1; sf->adaptive_motion_search = 1; sf->mv.auto_mv_step_size = 1; @@ -848,6 +850,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { #else sf->enable_tpl_model = 1; #endif + sf->prune_ref_frame_for_rect_partitions = 0; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 7a591e491..fd4973fb2 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -319,6 +319,9 @@ typedef struct SPEED_FEATURES { int use_square_partition_only; BLOCK_SIZE use_square_only_threshold; + // Prune reference frames for rectangular partitions. + int prune_ref_frame_for_rect_partitions; + // Sets min and max partition sizes for this 64x64 region based on the // same 64x64 in last encoded frame, and the left and above neighbor. AUTO_MIN_MAX_MODE auto_min_max_partition_size; -- cgit v1.2.3