diff options
Diffstat (limited to 'vp9/encoder')
-rw-r--r-- | vp9/encoder/vp9_block.h | 5 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 36 | ||||
-rw-r--r-- | vp9/encoder/vp9_firstpass.c | 110 | ||||
-rw-r--r-- | vp9/encoder/vp9_firstpass.h | 9 | ||||
-rw-r--r-- | vp9/encoder/vp9_pickmode.c | 115 | ||||
-rw-r--r-- | vp9/encoder/vp9_ratectrl.c | 51 | ||||
-rw-r--r-- | vp9/encoder/vp9_speed_features.c | 6 | ||||
-rw-r--r-- | vp9/encoder/vp9_speed_features.h | 4 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_dct_intrin_sse2.c (renamed from vp9/encoder/x86/vp9_dct_sse2.c) | 0 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_dct_mmx.asm | 104 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_dct_sse2.asm | 87 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 323 |
12 files changed, 291 insertions, 559 deletions
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 147743e8d..bbdfbb823 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -145,6 +145,11 @@ struct macroblock { uint8_t sb_is_skin; + // Used to save the status of whether a block has a low variance in + // choose_partitioning. 0 for 64x64, 1 2 for 64x32, 3 4 for 32x64, 5~8 for + // 32x32. + uint8_t variance_low[9]; + void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride); void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob); #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 67069e7c1..e3570504e 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -747,6 +747,8 @@ static int choose_partitioning(VP9_COMP *cpi, const uint8_t *d; int sp; int dp; + // Ref frame used in partitioning. + MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME; int pixels_wide = 64, pixels_high = 64; int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1], cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]}; @@ -771,6 +773,10 @@ static int choose_partitioning(VP9_COMP *cpi, } } + for (i = 0; i < 9; i++) { + x->variance_low[i] = 0; + } + if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); if (xd->mb_to_bottom_edge < 0) @@ -831,8 +837,10 @@ static int choose_partitioning(VP9_COMP *cpi, mi->ref_frame[0] = GOLDEN_FRAME; mi->mv[0].as_int = 0; y_sad = y_sad_g; + ref_frame_partition = GOLDEN_FRAME; } else { x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv; + ref_frame_partition = LAST_FRAME; } set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); @@ -1073,6 +1081,34 @@ static int choose_partitioning(VP9_COMP *cpi, } } } + + if (cpi->sf.short_circuit_low_temp_var) { + // Set low variance flag, only for blocks >= 32x32 and if LAST_FRAME was + // selected. + if (ref_frame_partition == LAST_FRAME) { + if (xd->mi[0]->sb_type == BLOCK_64X64 && + vt.part_variances.none.variance < (thresholds[0] >> 1)) { + x->variance_low[0] = 1; + } else if (xd->mi[0]->sb_type == BLOCK_64X32) { + if (vt.part_variances.horz[0].variance < (thresholds[0] >> 2)) + x->variance_low[1] = 1; + if (vt.part_variances.horz[1].variance < (thresholds[0] >> 2)) + x->variance_low[2] = 1; + } else if (xd->mi[0]->sb_type == BLOCK_32X64) { + if (vt.part_variances.vert[0].variance < (thresholds[0] >> 2)) + x->variance_low[3] = 1; + if (vt.part_variances.vert[1].variance < (thresholds[0] >> 2)) + x->variance_low[4] = 1; + } else { + // 32x32 + for (i = 0; i < 4; i++) { + if (!force_split[i + 1] && + vt.split[i].part_variances.none.variance < (thresholds[1] >> 1)) + x->variance_low[i + 5] = 1; + } + } + } + } return 0; } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index f456f37a1..a70eaea3e 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -236,13 +236,6 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->duration -= frame->duration; } -// Calculate the linear size relative to a baseline of 1080P -#define BASE_SIZE 2073600.0 // 1920x1080 -static double get_linear_size_factor(const VP9_COMP *cpi) { - const double this_area = cpi->initial_width * cpi->initial_height; - return pow(this_area / BASE_SIZE, 0.5); -} - // Calculate an active area of the image that discounts formatting // bars and partially discounts other 0 energy areas. #define MIN_ACTIVE_AREA 0.5 @@ -1247,14 +1240,15 @@ static double calc_correction_factor(double err_per_mb, return fclamp(pow(error_term, power_term), 0.05, 5.0); } -#define ERR_DIVISOR 100.0 -static int get_twopass_worst_quality(const VP9_COMP *cpi, +#define ERR_DIVISOR 115.0 +static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, double inactive_zone, - int section_target_bandwidth, - double group_weight_factor) { + int section_target_bandwidth) { const RATE_CONTROL *const rc = &cpi->rc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + // Clamp the target rate to VBR min / max limts. const int target_rate = vp9_rc_clamp_pframe_target_size(cpi, section_target_bandwidth); @@ -1269,7 +1263,7 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi, const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone)); const double av_err_per_mb = section_err / active_mbs; const double speed_term = 1.0 + 0.04 * oxcf->speed; - double ediv_size_correction; + double last_group_rate_err; const int target_norm_bits_per_mb = ((uint64_t)target_rate << BPER_MB_NORMBITS) / active_mbs; int q; @@ -1278,29 +1272,27 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi, if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) is_svc_upper_layer = 1; - // Larger image formats are expected to be a little harder to code - // relatively given the same prediction error score. This in part at - // least relates to the increased size and hence coding overheads of - // motion vectors. Some account of this is made through adjustment of - // the error divisor. - ediv_size_correction = - VPXMAX(0.2, VPXMIN(5.0, get_linear_size_factor(cpi))); - if (ediv_size_correction < 1.0) - ediv_size_correction = -(1.0 / ediv_size_correction); - ediv_size_correction *= 4.0; + // based on recent history adjust expectations of bits per macroblock. + last_group_rate_err = (double)twopass->rolling_arf_group_actual_bits / + DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits); + last_group_rate_err = + VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err)); + twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0; + twopass->bpm_factor = + VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor)); // Try and pick a max Q that will be high enough to encode the // content at the given rate. for (q = rc->best_quality; q < rc->worst_quality; ++q) { const double factor = calc_correction_factor(av_err_per_mb, - ERR_DIVISOR - ediv_size_correction, + ERR_DIVISOR, is_svc_upper_layer ? SVC_FACTOR_PT_LOW : FACTOR_PT_LOW, FACTOR_PT_HIGH, q, cpi->common.bit_depth); const int bits_per_mb = vp9_rc_bits_per_mb(INTER_FRAME, q, - factor * speed_term * group_weight_factor, + factor * speed_term * cpi->twopass.bpm_factor, cpi->common.bit_depth); if (bits_per_mb <= target_norm_bits_per_mb) break; @@ -2115,8 +2107,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { old_boost_score = boost_score; } - twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0); - // Was the group length constrained by the requirement for a new KF? rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; @@ -2184,24 +2174,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { const double group_av_inactive_zone = ((gf_group_inactive_zone_rows * 2) / (rc->baseline_gf_interval * (double)cm->mb_rows)); - - int tmp_q; - // rc factor is a weight factor that corrects for local rate control drift. - double rc_factor = 1.0; - if (rc->rate_error_estimate > 0) { - rc_factor = VPXMAX(RC_FACTOR_MIN, - (double)(100 - rc->rate_error_estimate) / 100.0); - } else { - rc_factor = VPXMIN(RC_FACTOR_MAX, - (double)(100 - rc->rate_error_estimate) / 100.0); - } - tmp_q = - get_twopass_worst_quality(cpi, group_av_err, - (group_av_skip_pct + group_av_inactive_zone), - vbr_group_bits_per_frame, - twopass->kfgroup_inter_fraction * rc_factor); + int tmp_q = + get_twopass_worst_quality(cpi, group_av_err, + (group_av_skip_pct + group_av_inactive_zone), + vbr_group_bits_per_frame); twopass->active_worst_quality = - VPXMAX(tmp_q, twopass->active_worst_quality >> 1); + (tmp_q + (twopass->active_worst_quality * 3)) >> 2; } #endif @@ -2243,6 +2221,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Default to starting GF groups at normal frame size. cpi->rc.next_frame_size_selector = UNSCALED; } + + // Reset rolling actual and target bits counters for ARF groups. + twopass->rolling_arf_group_target_bits = 0; + twopass->rolling_arf_group_actual_bits = 0; } // Threshold for use of the lagging second reference frame. High second ref @@ -2580,16 +2562,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, twopass->kf_group_bits); - // Work out the fraction of the kf group bits reserved for the inter frames - // within the group after discounting the bits for the kf itself. - if (twopass->kf_group_bits) { - twopass->kfgroup_inter_fraction = - (double)(twopass->kf_group_bits - kf_bits) / - (double)twopass->kf_group_bits; - } else { - twopass->kfgroup_inter_fraction = 1.0; - } - twopass->kf_group_bits -= kf_bits; // Save the bits to spend on the key frame. @@ -2683,21 +2655,12 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->twopass; GF_GROUP *const gf_group = &twopass->gf_group; - int frames_left; FIRSTPASS_STATS this_frame; int target_rate; LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : 0; - if (lc != NULL) { - frames_left = (int)(twopass->total_stats.count - - lc->current_video_frame_in_layer); - } else { - frames_left = (int)(twopass->total_stats.count - - cm->current_video_frame); - } - if (!twopass->stats_in) return; @@ -2739,6 +2702,9 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { twopass->active_worst_quality = cpi->oxcf.cq_level; } else if (cm->current_video_frame == 0 || (lc != NULL && lc->current_video_frame_in_layer == 0)) { + const int frames_left = (int)(twopass->total_stats.count - + ((lc != NULL) ? lc->current_video_frame_in_layer + : cm->current_video_frame)); // Special case code for first frame. const int section_target_bandwidth = (int)(twopass->bits_left / frames_left); @@ -2750,10 +2716,17 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { const double section_inactive_zone = (twopass->total_left_stats.inactive_zone_rows * 2) / ((double)cm->mb_rows * section_length); - const int tmp_q = - get_twopass_worst_quality(cpi, section_error, - section_intra_skip + section_inactive_zone, - section_target_bandwidth, DEFAULT_GRP_WEIGHT); + int tmp_q; + + // Initialize bits per macro_block estimate correction factor. + twopass->bpm_factor = 1.0; + // Initiallize actual and target bits counters for ARF groups so that + // at the start we have a neutral bpm adjustment. + twopass->rolling_arf_group_target_bits = 1; + twopass->rolling_arf_group_actual_bits = 1; + + tmp_q = get_twopass_worst_quality(cpi, section_error, + section_intra_skip + section_inactive_zone, section_target_bandwidth); twopass->active_worst_quality = tmp_q; twopass->baseline_active_worst_quality = tmp_q; @@ -2871,6 +2844,10 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) { rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size; twopass->bits_left = VPXMAX(twopass->bits_left - bits_used, 0); + // Target vs actual bits for this arf group. + twopass->rolling_arf_group_target_bits += rc->this_frame_target; + twopass->rolling_arf_group_actual_bits += rc->projected_frame_size; + // Calculate the pct rc error. if (rc->total_actual_bits) { rc->rate_error_estimate = @@ -2892,7 +2869,6 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) { // If the rate control is drifting consider adjustment to min or maxq. if ((cpi->oxcf.rc_mode != VPX_Q) && - (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) && !cpi->rc.is_src_frame_alt_ref) { const int maxq_adj_limit = rc->worst_quality - twopass->active_worst_quality; diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 7eb44fa13..76072884d 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -39,8 +39,6 @@ typedef struct { } FIRSTPASS_MB_STATS; #endif -#define VLOW_MOTION_THRESHOLD 950 - typedef struct { double frame; double weight; @@ -124,14 +122,13 @@ typedef struct { // Error score of frames still to be coded in kf group int64_t kf_group_error_left; - // The fraction for a kf groups total bits allocated to the inter frames - double kfgroup_inter_fraction; + double bpm_factor; + int rolling_arf_group_target_bits; + int rolling_arf_group_actual_bits; int sr_update_lag; - int kf_zeromotion_pct; int last_kfgroup_zeromotion_pct; - int gf_zeromotion_pct; int active_worst_quality; int baseline_active_worst_quality; int extend_minq; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 918b3b10b..554409b74 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1126,34 +1126,38 @@ static INLINE void find_predictors(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int mi_row, int mi_col, struct buf_2d yv12_mb[4][MAX_MB_PLANE], - BLOCK_SIZE bsize) { + BLOCK_SIZE bsize, + int force_skip_low_temp_var) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); TileInfo *const tile_info = &tile_data->tile_info; -// TODO(jingning) placeholder for inter-frame non-RD mode decision. + // TODO(jingning) placeholder for inter-frame non-RD mode decision. x->pred_mv_sad[ref_frame] = INT_MAX; frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; -// this needs various further optimizations. to be continued.. + // this needs various further optimizations. to be continued.. if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); - if (cm->use_prev_frame_mvs) + if (cm->use_prev_frame_mvs) { vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col, x->mbmi_ext->mode_context); - else - const_motion[ref_frame] = - mv_refs_rt(cpi, cm, x, xd, tile_info, xd->mi[0], ref_frame, - candidates, &frame_mv[NEWMV][ref_frame], mi_row, mi_col, - (int)(cpi->svc.use_base_mv && cpi->svc.spatial_layer_id)); + } else { + const_motion[ref_frame] = + mv_refs_rt(cpi, cm, x, xd, tile_info, xd->mi[0], ref_frame, + candidates, &frame_mv[NEWMV][ref_frame], mi_row, mi_col, + (int)(cpi->svc.use_base_mv && cpi->svc.spatial_layer_id)); + } vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates, &frame_mv[NEARESTMV][ref_frame], &frame_mv[NEARMV][ref_frame]); - if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8) { + // Early exit for golden frame if force_skip_low_temp_var is set. + if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8 && + !(force_skip_low_temp_var && ref_frame == GOLDEN_FRAME)) { vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame, bsize); } @@ -1266,6 +1270,39 @@ static void recheck_zeromv_after_denoising( } #endif // CONFIG_VP9_TEMPORAL_DENOISING +static INLINE int set_force_skip_low_temp_var(uint8_t *variance_low, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + int force_skip_low_temp_var = 0; + // Set force_skip_low_temp_var based on the block size and block offset. + if (bsize == BLOCK_64X64) { + force_skip_low_temp_var = variance_low[0]; + } else if (bsize == BLOCK_64X32) { + if (!(mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[1]; + } else if (!(mi_col & 0x7) && (mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[2]; + } + } else if (bsize == BLOCK_32X64) { + if (!(mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[3]; + } else if ((mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[4]; + } + } else if (bsize == BLOCK_32X32) { + if (!(mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[5]; + } else if ((mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[6]; + } else if (!(mi_col & 0x7) && (mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[7]; + } else if ((mi_col & 0x7) && (mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[8]; + } + } + return force_skip_low_temp_var; +} + void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int mi_row, int mi_col, RD_COST *rd_cost, @@ -1323,6 +1360,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int ref_frame_cost[MAX_REF_FRAMES]; int svc_force_zero_mode[3] = {0}; int perform_intra_pred = 1; + int use_golden_nonzeromv = 1; + int force_skip_low_temp_var = 0; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; int64_t zero_last_cost_orig = INT64_MAX; @@ -1409,10 +1448,19 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } } + if (cpi->sf.short_circuit_low_temp_var) { + force_skip_low_temp_var = + set_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize); + } + + if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) + use_golden_nonzeromv = 0; + for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, &ref_frame_skip_mask, flag_list, tile_data, mi_row, mi_col, - yv12_mb, bsize); + yv12_mb, bsize, force_skip_low_temp_var); } for (idx = 0; idx < RT_INTER_MODES; ++idx) { @@ -1424,6 +1472,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int is_skippable; int this_early_term = 0; PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode; + if (cpi->use_svc) this_mode = ref_mode_set_svc[idx].pred_mode; @@ -1442,17 +1491,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; + if (const_motion[ref_frame] && this_mode == NEARMV) continue; + // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var + // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped + // later. + if (force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) { + continue; + } + if (cpi->use_svc) { if (svc_force_zero_mode[ref_frame - 1] && frame_mv[this_mode][ref_frame].as_int != 0) continue; } - if (!(frame_mv[this_mode][ref_frame].as_int == 0 && - ref_frame == LAST_FRAME)) { + if (!force_skip_low_temp_var && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == LAST_FRAME)) { i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking) if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) @@ -1543,7 +1602,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } } - if (this_mode == NEWMV && ref_frame == LAST_FRAME && + // If use_golden_nonzeromv is false, NEWMV mode is skipped for golden, no + // need to compute best_pred_sad which is only used to skip golden NEWMV. + if (use_golden_nonzeromv && this_mode == NEWMV && + ref_frame == LAST_FRAME && frame_mv[NEWMV][LAST_FRAME].as_int != INVALID_MV) { const int pre_stride = xd->plane[0].pre[0].stride; const uint8_t * const pre_buf = xd->plane[0].pre[0].buf + @@ -1555,21 +1617,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, x->pred_mv_sad[LAST_FRAME] = best_pred_sad; } - if (cpi->use_svc) { - if (this_mode == NEWMV && ref_frame == GOLDEN_FRAME && - frame_mv[NEWMV][GOLDEN_FRAME].as_int != INVALID_MV) { - const int pre_stride = xd->plane[0].pre[0].stride; - const uint8_t * const pre_buf = xd->plane[0].pre[0].buf + - (frame_mv[NEWMV][GOLDEN_FRAME].as_mv.row >> 3) * pre_stride + - (frame_mv[NEWMV][GOLDEN_FRAME].as_mv.col >> 3); - best_pred_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, - x->plane[0].src.stride, - pre_buf, pre_stride); - x->pred_mv_sad[GOLDEN_FRAME] = best_pred_sad; - } - } - - if (this_mode != NEARESTMV && frame_mv[this_mode][ref_frame].as_int == frame_mv[NEARESTMV][ref_frame].as_int) @@ -1795,11 +1842,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; } // Perform intra prediction search, if the best SAD is above a certain - // threshold. - if (perform_intra_pred && - ((best_rdc.rdcost == INT64_MAX || - (!x->skip && best_rdc.rdcost > inter_mode_thresh && - bsize <= cpi->sf.max_intra_bsize)))) { + // threshold. Skip intra prediction if force_skip_low_temp_var is set. + if (!force_skip_low_temp_var && perform_intra_pred && + (best_rdc.rdcost == INT64_MAX || + (!x->skip && best_rdc.rdcost > inter_mode_thresh && + bsize <= cpi->sf.max_intra_bsize))) { struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0, 0 }; int i; TX_SIZE best_intra_tx_size = TX_SIZES; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index b8a5e6e7d..6c3f91951 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1160,8 +1160,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, // Extension to max or min Q if undershoot or overshoot is outside // the permitted range. - if ((cpi->oxcf.rc_mode != VPX_Q) && - (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) { + if (cpi->oxcf.rc_mode != VPX_Q) { if (frame_is_intra_only(cm) || (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { @@ -1559,12 +1558,13 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { if (cm->current_video_frame > 30 && rc->avg_frame_qindex[INTER_FRAME] > (7 * rc->worst_quality) >> 3 && rc->avg_size_inter > (5 * rc->avg_frame_bandwidth) >> 1) { - rc->baseline_gf_interval = (3 * rc->baseline_gf_interval) >> 1; + rc->baseline_gf_interval = + VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1); } else if (cm->current_video_frame > 30 && rc->avg_frame_low_motion < 20) { // Decrease boost and gf interval for high motion case. rc->gfu_boost = DEFAULT_GF_BOOST >> 1; - rc->baseline_gf_interval = VPXMIN(6, rc->baseline_gf_interval >> 1); + rc->baseline_gf_interval = VPXMAX(5, rc->baseline_gf_interval >> 1); } adjust_gf_key_frame(cpi); rc->frames_till_gf_update_due = rc->baseline_gf_interval; @@ -1890,27 +1890,28 @@ static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) { RATE_CONTROL *const rc = &cpi->rc; int64_t vbr_bits_off_target = rc->vbr_bits_off_target; int max_delta; - double position_factor = 1.0; - - // How far through the clip are we. - // This number is used to damp the per frame rate correction. - // Range 0 - 1.0 - if (cpi->twopass.total_stats.count) { - position_factor = sqrt((double)cpi->common.current_video_frame / - cpi->twopass.total_stats.count); - } - max_delta = (int)(position_factor * - ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100)); - - // vbr_bits_off_target > 0 means we have extra bits to spend - if (vbr_bits_off_target > 0) { - *this_frame_target += - (vbr_bits_off_target > max_delta) ? max_delta - : (int)vbr_bits_off_target; - } else { - *this_frame_target -= - (vbr_bits_off_target < -max_delta) ? max_delta - : (int)-vbr_bits_off_target; + int frame_window = VPXMIN(16, + ((int)cpi->twopass.total_stats.count - cpi->common.current_video_frame)); + + // Calcluate the adjustment to rate for this frame. + if (frame_window > 0) { + max_delta = (vbr_bits_off_target > 0) + ? (int)(vbr_bits_off_target / frame_window) + : (int)(-vbr_bits_off_target / frame_window); + + max_delta = VPXMIN(max_delta, + ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100)); + + // vbr_bits_off_target > 0 means we have extra bits to spend + if (vbr_bits_off_target > 0) { + *this_frame_target += + (vbr_bits_off_target > max_delta) ? max_delta + : (int)vbr_bits_off_target; + } else { + *this_frame_target -= + (vbr_bits_off_target < -max_delta) ? max_delta + : (int)-vbr_bits_off_target; + } } // Fast redistribution of bits arising from massive local undershoot. diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 02be3c3f9..0090b4f40 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -429,6 +429,11 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->mv.search_method = NSTEP; sf->mv.reduce_first_step_size = 1; sf->skip_encode_sb = 0; + if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.pass == 0 && + content != VP9E_CONTENT_SCREEN) { + // Enable short circuit for low temporal variance. + sf->short_circuit_low_temp_var = 1; + } } if (speed >= 7) { @@ -554,6 +559,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->default_interp_filter = SWITCHABLE; sf->simple_model_rd_from_var = 0; sf->short_circuit_flat_blocks = 0; + sf->short_circuit_low_temp_var = 0; // Some speed-up features even for best quality as minimal impact on quality. sf->adaptive_rd_thresh = 1; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 90b32164b..71ff0ac10 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -446,6 +446,10 @@ typedef struct SPEED_FEATURES { // Skip a number of expensive mode evaluations for blocks with zero source // variance. int short_circuit_flat_blocks; + + // Skip a number of expensive mode evaluations for blocks with very low + // temporal variance. + int short_circuit_low_temp_var; } SPEED_FEATURES; struct VP9_COMP; diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c index fa37b6fed..fa37b6fed 100644 --- a/vp9/encoder/x86/vp9_dct_sse2.c +++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c diff --git a/vp9/encoder/x86/vp9_dct_mmx.asm b/vp9/encoder/x86/vp9_dct_mmx.asm deleted file mode 100644 index 7a7a6b655..000000000 --- a/vp9/encoder/x86/vp9_dct_mmx.asm +++ /dev/null @@ -1,104 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%define private_prefix vp9 - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro TRANSFORM_COLS 0 - paddw m0, m1 - movq m4, m0 - psubw m3, m2 - psubw m4, m3 - psraw m4, 1 - movq m5, m4 - psubw m5, m1 ;b1 - psubw m4, m2 ;c1 - psubw m0, m4 - paddw m3, m5 - ; m0 a0 - SWAP 1, 4 ; m1 c1 - SWAP 2, 3 ; m2 d1 - SWAP 3, 5 ; m3 b1 -%endmacro - -%macro TRANSPOSE_4X4 0 - movq m4, m0 - movq m5, m2 - punpcklwd m4, m1 - punpckhwd m0, m1 - punpcklwd m5, m3 - punpckhwd m2, m3 - movq m1, m4 - movq m3, m0 - punpckldq m1, m5 - punpckhdq m4, m5 - punpckldq m3, m2 - punpckhdq m0, m2 - SWAP 2, 3, 0, 1, 4 -%endmacro - -INIT_MMX mmx -cglobal fwht4x4, 3, 4, 8, input, output, stride - lea r3q, [inputq + strideq*4] - movq m0, [inputq] ;a1 - movq m1, [inputq + strideq*2] ;b1 - movq m2, [r3q] ;c1 - movq m3, [r3q + strideq*2] ;d1 - - TRANSFORM_COLS - TRANSPOSE_4X4 - TRANSFORM_COLS - TRANSPOSE_4X4 - - psllw m0, 2 - psllw m1, 2 - psllw m2, 2 - psllw m3, 2 - -%if CONFIG_VP9_HIGHBITDEPTH - pxor m4, m4 - pxor m5, m5 - pcmpgtw m4, m0 - pcmpgtw m5, m1 - movq m6, m0 - movq m7, m1 - punpcklwd m0, m4 - punpcklwd m1, m5 - punpckhwd m6, m4 - punpckhwd m7, m5 - movq [outputq], m0 - movq [outputq + 8], m6 - movq [outputq + 16], m1 - movq [outputq + 24], m7 - pxor m4, m4 - pxor m5, m5 - pcmpgtw m4, m2 - pcmpgtw m5, m3 - movq m6, m2 - movq m7, m3 - punpcklwd m2, m4 - punpcklwd m3, m5 - punpckhwd m6, m4 - punpckhwd m7, m5 - movq [outputq + 32], m2 - movq [outputq + 40], m6 - movq [outputq + 48], m3 - movq [outputq + 56], m7 -%else - movq [outputq], m0 - movq [outputq + 8], m1 - movq [outputq + 16], m2 - movq [outputq + 24], m3 -%endif - - RET diff --git a/vp9/encoder/x86/vp9_dct_sse2.asm b/vp9/encoder/x86/vp9_dct_sse2.asm new file mode 100644 index 000000000..d3b2a271b --- /dev/null +++ b/vp9/encoder/x86/vp9_dct_sse2.asm @@ -0,0 +1,87 @@ +; +; Copyright (c) 2016 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%define private_prefix vp9 + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro TRANSFORM_COLS 0 + paddw m0, m1 + movq m4, m0 + psubw m3, m2 + psubw m4, m3 + psraw m4, 1 + movq m5, m4 + psubw m5, m1 ;b1 + psubw m4, m2 ;c1 + psubw m0, m4 + paddw m3, m5 + ; m0 a0 + SWAP 1, 4 ; m1 c1 + SWAP 2, 3 ; m2 d1 + SWAP 3, 5 ; m3 b1 +%endmacro + +%macro TRANSPOSE_4X4 0 + ; 00 01 02 03 + ; 10 11 12 13 + ; 20 21 22 23 + ; 30 31 32 33 + punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13 + punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33 + mova m1, m0 + punpckldq m0, m2 ; 00 10 20 30 01 11 21 31 + punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33 +%endmacro + +INIT_XMM sse2 +cglobal fwht4x4, 3, 4, 8, input, output, stride +; TODO(linfeng): The duplication with vp10 should be resolved. + lea r3q, [inputq + strideq*4] + movq m0, [inputq] ;a1 + movq m1, [inputq + strideq*2] ;b1 + movq m2, [r3q] ;c1 + movq m3, [r3q + strideq*2] ;d1 + + TRANSFORM_COLS + TRANSPOSE_4X4 + SWAP 1, 2 + psrldq m1, m0, 8 + psrldq m3, m2, 8 + TRANSFORM_COLS + TRANSPOSE_4X4 + + psllw m0, 2 + psllw m1, 2 + +%if CONFIG_VP9_HIGHBITDEPTH + ; sign extension + mova m2, m0 + mova m3, m1 + punpcklwd m0, m0 + punpcklwd m1, m1 + punpckhwd m2, m2 + punpckhwd m3, m3 + psrad m0, 16 + psrad m1, 16 + psrad m2, 16 + psrad m3, 16 + mova [outputq], m0 + mova [outputq + 16], m2 + mova [outputq + 32], m1 + mova [outputq + 48], m3 +%else + mova [outputq], m0 + mova [outputq + 16], m1 +%endif + + RET diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c deleted file mode 100644 index 0bc417fc1..000000000 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#if defined(_MSC_VER) -# include <intrin.h> -#endif -#include <emmintrin.h> -#include <smmintrin.h> - -#include "vpx_dsp/vpx_dsp_common.h" -#include "vp9/encoder/vp9_encoder.h" -#include "vpx_ports/mem.h" - -#ifdef __GNUC__ -# define LIKELY(v) __builtin_expect(v, 1) -# define UNLIKELY(v) __builtin_expect(v, 0) -#else -# define LIKELY(v) (v) -# define UNLIKELY(v) (v) -#endif - -static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { - int_mv result; - result.as_mv.row = row; - result.as_mv.col = col; - return result; -} - -static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { - // This is simplified from the C implementation to utilise that - // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and - // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] - return mv.as_int == 0 ? 0 : 1; -} - -static INLINE int mv_cost(const int_mv mv, - const int *joint_cost, int *const comp_cost[2]) { - return joint_cost[get_mv_joint(mv)] + - comp_cost[0][mv.as_mv.row] + comp_cost[1][mv.as_mv.col]; -} - -static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, - int sad_per_bit) { - const int_mv diff = pack_int_mv(mv.as_mv.row - ref->row, - mv.as_mv.col - ref->col); - return ROUND_POWER_OF_TWO((unsigned)mv_cost(diff, x->nmvjointsadcost, - x->nmvsadcost) * - sad_per_bit, VP9_PROB_COST_SHIFT); -} - -/***************************************************************************** - * This function utilises 3 properties of the cost function lookup tables, * - * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * - * vp9_encoder.c. * - * For the joint cost: * - * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * - * For the component costs: * - * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * - * (Equal costs for both components) * - * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * - * (Cost function is even) * - * If these do not hold, then this function cannot be used without * - * modification, in which case you can revert to using the C implementation, * - * which does not rely on these properties. * - *****************************************************************************/ -int vp9_diamond_search_sad_avx(const MACROBLOCK *x, - const search_site_config *cfg, - MV *ref_mv, MV *best_mv, int search_param, - int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv) { - const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max); - const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int); - const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min); - const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int); - - const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit); - - const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]); - const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]); - - // search_param determines the length of the initial step and hence the number - // of iterations. - // 0 = initial step (MAX_FIRST_STEP) pel - // 1 = (MAX_FIRST_STEP/2) pel, - // 2 = (MAX_FIRST_STEP/4) pel... - const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; - const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; - const int tot_steps = cfg->total_steps - search_param; - - const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3, - center_mv->col >> 3); - const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int); - - const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); - const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); - - int_mv bmv = pack_int_mv(ref_row, ref_col); - int_mv new_bmv = bmv; - __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int); - - const int what_stride = x->plane[0].src.stride; - const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; - const uint8_t *const what = x->plane[0].src.buf; - const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf + - ref_row * in_what_stride + ref_col; - - // Work out the start point for the search - const uint8_t *best_address = in_what; - const uint8_t *new_best_address = best_address; -#if ARCH_X86_64 - __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address); -#else - __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address); -#endif - - unsigned int best_sad; - - int i; - int j; - int step; - - // Check the prerequisite cost function properties that are easy to check - // in an assert. See the function-level documentation for details on all - // prerequisites. - assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); - assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - - // Check the starting position - best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); - best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); - - *num00 = 0; - - for (i = 0, step = 0; step < tot_steps; step++) { - for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { - __m128i v_sad_d; - __m128i v_cost_d; - __m128i v_outside_d; - __m128i v_inside_d; - __m128i v_diff_mv_w; -#if ARCH_X86_64 - __m128i v_blocka[2]; -#else - __m128i v_blocka[1]; -#endif - - // Compute the candidate motion vectors - const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]); - const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w); - // Clamp them to the search bounds - __m128i v_these_mv_clamp_w = v_these_mv_w; - v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w); - v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w); - // The ones that did not change are inside the search area - v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w); - - // If none of them are inside, then move on - if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) { - continue; - } - - // The inverse mask indicates which of the MVs are outside - v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff)); - // Shift right to keep the sign bit clear, we will use this later - // to set the cost to the maximum value. - v_outside_d = _mm_srli_epi32(v_outside_d, 1); - - // Compute the difference MV - v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv); - // We utilise the fact that the cost function is even, and use the - // absolute difference. This allows us to use unsigned indexes later - // and reduces cache pressure somewhat as only a half of the table - // is ever referenced. - v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w); - - // Compute the SIMD pointer offsets. - { -#if ARCH_X86_64 // sizeof(intptr_t) == 8 - // Load the offsets - __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]); - __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]); - // Set the ones falling outside to zero - v_bo10_q = _mm_and_si128(v_bo10_q, - _mm_cvtepi32_epi64(v_inside_d)); - v_bo32_q = _mm_and_si128(v_bo32_q, - _mm_unpackhi_epi32(v_inside_d, v_inside_d)); - // Compute the candidate addresses - v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q); - v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q); -#else // ARCH_X86 // sizeof(intptr_t) == 4 - __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]); - v_bo_d = _mm_and_si128(v_bo_d, v_inside_d); - v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d); -#endif - } - - fn_ptr->sdx4df(what, what_stride, - (const uint8_t **)&v_blocka[0], in_what_stride, - (uint32_t*)&v_sad_d); - - // Look up the component cost of the residual motion vector - { - const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0); - const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1); - const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2); - const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3); - const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4); - const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5); - const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6); - const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7); - - // Note: This is a use case for vpgather in AVX2 - const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0]; - const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1]; - const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2]; - const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3]; - - __m128i v_cost_10_d, v_cost_32_d; - - v_cost_10_d = _mm_cvtsi32_si128(cost0); - v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1); - - v_cost_32_d = _mm_cvtsi32_si128(cost2); - v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1); - - v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d); - } - - // Now add in the joint cost - { - const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w, - _mm_setzero_si128()); - const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d, - v_joint_cost_0_d, - v_sel_d); - v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d); - } - - // Multiply by sad_per_bit - v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d); - // ROUND_POWER_OF_TWO(v_cost_d, 8) - v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80)); - v_cost_d = _mm_srai_epi32(v_cost_d, 8); - // Add the cost to the sad - v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d); - - // Make the motion vectors outside the search area have max cost - // by or'ing in the comparison mask, this way the minimum search won't - // pick them. - v_sad_d = _mm_or_si128(v_sad_d, v_outside_d); - - // Find the minimum value and index horizontally in v_sad_d - { - // Try speculatively on 16 bits, so we can use the minpos intrinsic - const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d); - const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w); - - uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0); - uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1); - - // If the local best value is not saturated, just use it, otherwise - // find the horizontal minimum again the hard way on 32 bits. - // This is executed rarely. - if (UNLIKELY(local_best_sad == 0xffff)) { - __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d; - - v_loval_d = v_sad_d; - v_loidx_d = _mm_set_epi32(3, 2, 1, 0); - v_hival_d = _mm_srli_si128(v_loval_d, 8); - v_hiidx_d = _mm_srli_si128(v_loidx_d, 8); - - v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); - - v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); - v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); - v_hival_d = _mm_srli_si128(v_loval_d, 4); - v_hiidx_d = _mm_srli_si128(v_loidx_d, 4); - - v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); - - v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); - v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); - - local_best_sad = _mm_extract_epi32(v_loval_d, 0); - local_best_idx = _mm_extract_epi32(v_loidx_d, 0); - } - - // Update the global minimum if the local minimum is smaller - if (LIKELY(local_best_sad < best_sad)) { - new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; - new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; - - best_sad = local_best_sad; - } - } - } - - bmv = new_bmv; - best_address = new_best_address; - - v_bmv_w = _mm_set1_epi32(bmv.as_int); -#if ARCH_X86_64 - v_ba_q = _mm_set1_epi64x((intptr_t)best_address); -#else - v_ba_d = _mm_set1_epi32((intptr_t)best_address); -#endif - - if (UNLIKELY(best_address == in_what)) { - (*num00)++; - } - } - - *best_mv = bmv.as_mv; - return best_sad; -} |