diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/x86/vp9_idct_intrin_sse2.c | 17 | ||||
-rw-r--r-- | vp9/encoder/vp9_alt_ref_aq.c | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_alt_ref_aq.h | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_aq_cyclicrefresh.c | 3 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 12 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodemb.c | 285 | ||||
-rw-r--r-- | vp9/encoder/vp9_encoder.h | 1 | ||||
-rw-r--r-- | vp9/encoder/vp9_pickmode.c | 34 | ||||
-rw-r--r-- | vp9/encoder/vp9_ratectrl.c | 5 | ||||
-rw-r--r-- | vp9/encoder/vp9_speed_features.c | 7 | ||||
-rw-r--r-- | vp9/encoder/x86/temporal_filter_sse4.c | 1 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_dct_intrin_sse2.c | 78 | ||||
-rw-r--r-- | vp9/vp9_cx_iface.c | 8 |
13 files changed, 341 insertions, 114 deletions
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index bb2dcf52b..7e8089b51 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -54,7 +54,6 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[8]; - const __m128i zero = _mm_setzero_si128(); const __m128i final_rounding = _mm_set1_epi16(1 << 4); // load input data @@ -106,14 +105,14 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[6] = _mm_srai_epi16(in[6], 5); in[7] = _mm_srai_epi16(in[7], 5); - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); } void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, diff --git a/vp9/encoder/vp9_alt_ref_aq.c b/vp9/encoder/vp9_alt_ref_aq.c index 3aeefb584..acc3764c7 100644 --- a/vp9/encoder/vp9_alt_ref_aq.c +++ b/vp9/encoder/vp9_alt_ref_aq.c @@ -15,7 +15,7 @@ struct ALT_REF_AQ { int dummy; }; -struct ALT_REF_AQ *vp9_alt_ref_aq_create() { +struct ALT_REF_AQ *vp9_alt_ref_aq_create(void) { return (struct ALT_REF_AQ *)vpx_malloc(sizeof(struct ALT_REF_AQ)); } diff --git a/vp9/encoder/vp9_alt_ref_aq.h b/vp9/encoder/vp9_alt_ref_aq.h index 18acd8a85..e508cb44a 100644 --- a/vp9/encoder/vp9_alt_ref_aq.h +++ b/vp9/encoder/vp9_alt_ref_aq.h @@ -54,7 +54,7 @@ struct ALT_REF_AQ; * * \return Instance of the class */ -struct ALT_REF_AQ *vp9_alt_ref_aq_create(); +struct ALT_REF_AQ *vp9_alt_ref_aq_create(void); /*!\brief Upload segmentation_map to self object * diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index 048ea629f..2f2f0055a 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -425,9 +425,10 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { int target_refresh = 0; double weight_segment_target = 0; double weight_segment = 0; + int thresh_low_motion = (cm->width < 720) ? 55 : 20; cr->apply_cyclic_refresh = 1; if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 || - (!cpi->use_svc && rc->avg_frame_low_motion < 55 && + (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion && rc->frames_since_key > 40)) { cr->apply_cyclic_refresh = 0; return; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 6215e198c..2b694a389 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -489,8 +489,9 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x, return 0; } -int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, int width, - int height, int content_state) { +static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, + int width, int height, + int content_state) { if (speed >= 8) { if (width <= 640 && height <= 480) return (5 * threshold_base) >> 2; @@ -1022,6 +1023,9 @@ static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, if (tmp_variance < (tmp_sse >> 3) && (tmp_sse - tmp_variance) > 10000) x->content_state_sb = kLowVarHighSumdiff; + if (tmp_sad > (avg_source_sad_threshold << 1)) + x->content_state_sb = kVeryHighSad; + if (cpi->content_state_sb_fd != NULL) { if (tmp_sad < avg_source_sad_threshold2) { // Cap the increment to 255. @@ -1197,7 +1201,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); - x->sb_is_skin = skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split); + if (cpi->use_skin_detection) + x->sb_is_skin = + skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split); d = xd->plane[0].dst.buf; dp = xd->plane[0].dst.stride; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 7e30499c5..d8ea92af0 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -49,19 +49,275 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { pd->dst.buf, pd->dst.stride); } -typedef struct vp9_token_state { - int64_t error; - int rate; - int16_t next; +static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { + { 10, 6 }, { 8, 5 }, +}; + +#define USE_GREEDY_OPTIMIZE_B 0 + +#if USE_GREEDY_OPTIMIZE_B + +typedef struct { int16_t token; tran_low_t qc; tran_low_t dqc; - uint8_t best_index; } vp9_token_state; -static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { - { 10, 6 }, { 8, 5 }, -}; +// 'num' can be negative, but 'shift' must be non-negative. +#define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \ + ((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift)) + +int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, + int ctx) { + MACROBLOCKD *const xd = &mb->e_mbd; + struct macroblock_plane *const p = &mb->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ref = is_inter_block(xd->mi[0]); + vp9_token_state tokens[1025][2]; + uint8_t token_cache[1024]; + const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const int eob = p->eobs[block]; + const PLANE_TYPE plane_type = get_plane_type(plane); + const int default_eob = 16 << (tx_size << 1); + const int shift = (tx_size == TX_32X32); + const int16_t *const dequant_ptr = pd->dequant; + const uint8_t *const band_translate = get_band_translate(tx_size); + const scan_order *const so = get_scan(xd, tx_size, plane_type, block); + const int16_t *const scan = so->scan; + const int16_t *const nb = so->neighbors; + const int64_t rdmult = + ((int64_t)mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1; + const int64_t rddiv = mb->rddiv; + int64_t rd_cost0, rd_cost1; + int64_t rate0, rate1; + int16_t t0, t1; + int i, final_eob; +#if CONFIG_VP9_HIGHBITDEPTH + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); +#else + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8); +#endif + unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + mb->token_costs[tx_size][plane_type][ref]; + unsigned int(*token_costs_cur)[2][COEFF_CONTEXTS][ENTROPY_TOKENS]; + int64_t eob_cost0, eob_cost1; + const int ctx0 = ctx; + int64_t accu_rate = 0; + // Initialized to the worst possible error for the largest transform size. + // This ensures that it never goes negative. + int64_t accu_error = ((int64_t)1) << 50; + int64_t best_block_rd_cost = INT64_MAX; + int x_prev = 1; + assert((!plane_type && !plane) || (plane_type && plane)); + assert(eob <= default_eob); + + for (i = 0; i < eob; i++) { + const int rc = scan[i]; + int x = qcoeff[rc]; + t0 = vp9_get_token(x); + tokens[i][0].qc = x; + tokens[i][0].token = t0; + tokens[i][0].dqc = dqcoeff[rc]; + token_cache[rc] = vp9_pt_energy_class[t0]; + } + tokens[eob][0].token = EOB_TOKEN; + tokens[eob][0].qc = 0; + tokens[eob][0].dqc = 0; + tokens[eob][1] = tokens[eob][0]; + final_eob = 0; + + // Initial RD cost. + token_costs_cur = token_costs + band_translate[0]; + rate0 = (*token_costs_cur)[0][ctx0][EOB_TOKEN]; + best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error); + + // For each token, pick one of two choices greedily: + // (i) First candidate: Keep current quantized value, OR + // (ii) Second candidate: Reduce quantized value by 1. + for (i = 0; i < eob; i++) { + const int rc = scan[i]; + const int x = qcoeff[rc]; + const int band_cur = band_translate[i]; + const int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i); + const int token_tree_sel_cur = (x_prev == 0); + token_costs_cur = token_costs + band_cur; + if (x == 0) { // No need to search + rate0 = + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][tokens[i][0].token]; + accu_rate += rate0; + x_prev = 0; + // Note: accu_error does not change. + } else { + const int dqv = dequant_ptr[rc != 0]; + // Compute the distortion for quantizing to 0. + const int diff_for_zero_raw = (0 - coeff[rc]) * (1 << shift); + const int diff_for_zero = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff_for_zero_raw, xd->bd - 8) + : +#endif + diff_for_zero_raw; + const int64_t distortion_for_zero = + (int64_t)diff_for_zero * diff_for_zero; + + // Compute the distortion for the first candidate + const int diff0_raw = (dqcoeff[rc] - coeff[rc]) * (1 << shift); + const int diff0 = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff0_raw, xd->bd - 8) + : +#endif // CONFIG_VP9_HIGHBITDEPTH + diff0_raw; + const int64_t distortion0 = (int64_t)diff0 * diff0; + + // Compute the distortion for the second candidate + const int sign = -(x < 0); // -1 if x is negative and 0 otherwise. + const int x1 = x - 2 * sign - 1; // abs(x1) = abs(x) - 1. + int64_t distortion1; + if (x1 != 0) { + const int dqv_step = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? dqv >> (xd->bd - 8) + : +#endif // CONFIG_VP9_HIGHBITDEPTH + dqv; + const int diff_step = (dqv_step + sign) ^ sign; + const int diff1 = diff0 - diff_step; + assert(dqv > 0); // We aren't right shifting a negative number above. + distortion1 = (int64_t)diff1 * diff1; + } else { + distortion1 = distortion_for_zero; + } + { + // Calculate RDCost for current coeff for the two candidates. + const int64_t base_bits0 = vp9_get_token_cost(x, &t0, cat6_high_cost); + const int64_t base_bits1 = vp9_get_token_cost(x1, &t1, cat6_high_cost); + rate0 = + base_bits0 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t0]; + rate1 = + base_bits1 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t1]; + } + { + int rdcost_better_for_x1, eob_rdcost_better_for_x1; + int dqc0, dqc1; + int64_t best_eob_cost_cur; + + // Calculate RD Cost effect on the next coeff for the two candidates. + int64_t next_bits0 = 0; + int64_t next_bits1 = 0; + int64_t next_eob_bits0 = 0; + int64_t next_eob_bits1 = 0; + if (i < default_eob - 1) { + int ctx_next, token_tree_sel_next; + const int band_next = band_translate[i + 1]; + unsigned int( + *const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + token_costs + band_next; + token_cache[rc] = vp9_pt_energy_class[t0]; + ctx_next = get_coef_context(nb, token_cache, i + 1); + token_tree_sel_next = (x == 0); + next_bits0 = (*token_costs_next)[token_tree_sel_next][ctx_next] + [tokens[i + 1][0].token]; + next_eob_bits0 = + (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN]; + token_cache[rc] = vp9_pt_energy_class[t1]; + ctx_next = get_coef_context(nb, token_cache, i + 1); + token_tree_sel_next = (x1 == 0); + next_bits1 = (*token_costs_next)[token_tree_sel_next][ctx_next] + [tokens[i + 1][0].token]; + if (x1 != 0) { + next_eob_bits1 = + (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN]; + } + } + + // Compare the total RD costs for two candidates. + rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), distortion0); + rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), distortion1); + rdcost_better_for_x1 = (rd_cost1 < rd_cost0); + eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0), + (accu_error + distortion0 - distortion_for_zero)); + eob_cost1 = eob_cost0; + if (x1 != 0) { + eob_cost1 = + RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1), + (accu_error + distortion1 - distortion_for_zero)); + eob_rdcost_better_for_x1 = (eob_cost1 < eob_cost0); + } else { + eob_rdcost_better_for_x1 = 0; + } + + // Calculate the two candidate de-quantized values. + dqc0 = dqcoeff[rc]; + dqc1 = 0; + if (rdcost_better_for_x1 + eob_rdcost_better_for_x1) { + if (x1 != 0) { + dqc1 = RIGHT_SHIFT_POSSIBLY_NEGATIVE(x1 * dqv, shift); + } else { + dqc1 = 0; + } + } + + // Pick and record the better quantized and de-quantized values. + if (rdcost_better_for_x1) { + qcoeff[rc] = x1; + dqcoeff[rc] = dqc1; + accu_rate += rate1; + accu_error += distortion1 - distortion_for_zero; + assert(distortion1 <= distortion_for_zero); + token_cache[rc] = vp9_pt_energy_class[t1]; + } else { + accu_rate += rate0; + accu_error += distortion0 - distortion_for_zero; + assert(distortion0 <= distortion_for_zero); + token_cache[rc] = vp9_pt_energy_class[t0]; + } + assert(accu_error >= 0); + x_prev = qcoeff[rc]; // Update based on selected quantized value. + + best_eob_cost_cur = eob_cost0; + tokens[i][1].token = t0; + tokens[i][1].qc = x; + tokens[i][1].dqc = dqc0; + if ((x1 != 0) && eob_rdcost_better_for_x1) { + best_eob_cost_cur = eob_cost1; + tokens[i][1].token = t1; + tokens[i][1].qc = x1; + tokens[i][1].dqc = dqc1; + } + + // Determine whether to move the eob position to i+1 + if (best_eob_cost_cur < best_block_rd_cost) { + best_block_rd_cost = best_eob_cost_cur; + final_eob = i + 1; + } + } + } + } + assert(final_eob <= eob); + if (final_eob > 0) { + int rc; + assert(tokens[final_eob - 1][1].qc != 0); + i = final_eob - 1; + rc = scan[i]; + qcoeff[rc] = tokens[i][1].qc; + dqcoeff[rc] = tokens[i][1].dqc; + } + for (i = final_eob; i < eob; i++) { + int rc = scan[i]; + qcoeff[rc] = 0; + dqcoeff[rc] = 0; + } + mb->plane[plane].eobs[block] = final_eob; + return final_eob; +} +#undef RIGHT_SHIFT_POSSIBLY_NEGATIVE + +#else #define UPDATE_RD_COST() \ { \ @@ -92,6 +348,17 @@ static const int16_t band_cum_count_table[TX_SIZES][8] = { { 0, 1, 3, 6, 10, 21, 256, 0 }, { 0, 1, 3, 6, 10, 21, 1024, 0 }, }; + +typedef struct vp9_token_state { + int64_t error; + int rate; + int16_t next; + int16_t token; + tran_low_t qc; + tran_low_t dqc; + uint8_t best_index; +} vp9_token_state; + int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, int ctx) { MACROBLOCKD *const xd = &mb->e_mbd; @@ -327,6 +594,8 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, return final_eob; } +#endif // USE_GREEDY_OPTIMIZE_B + static INLINE void fdct32x32(int rd_transform, const int16_t *src, tran_low_t *dst, int src_stride) { if (rd_transform) diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 672c83bfd..7ab892000 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -138,6 +138,7 @@ typedef enum { kHighSadLowSumdiff = 3, kHighSadHighSumdiff = 4, kLowVarHighSumdiff = 5, + kVeryHighSad = 6, } CONTENT_STATE_SB; typedef struct VP9EncoderConfig { diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 525523ade..17dc0637f 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -217,7 +217,8 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } if (rv && search_subpel) { - const int subpel_force_stop = cpi->sf.mv.subpel_force_stop; + int subpel_force_stop = cpi->sf.mv.subpel_force_stop; + if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = 2; cpi->find_fractional_mv_step( x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop, @@ -1489,6 +1490,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int force_skip_low_temp_var = 0; int skip_ref_find_pred[4] = { 0 }; unsigned int sse_zeromv_normalized = UINT_MAX; + unsigned int best_sse_sofar = UINT_MAX; unsigned int thresh_svc_skip_golden = 500; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; @@ -1615,7 +1617,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cpi->oxcf.speed >= 8 && !cpi->use_svc && ((cpi->rc.frames_since_golden + 1) < x->last_sb_high_content || - x->last_sb_high_content > 40)) + x->last_sb_high_content > 40 || cpi->rc.frames_since_golden > 120)) usable_ref_frame = LAST_FRAME; for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { @@ -1691,7 +1693,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, continue; } - if ((cpi->sf.short_circuit_low_temp_var >= 2 || + if (x->content_state_sb != kVeryHighSad && + (cpi->sf.short_circuit_low_temp_var >= 2 || (cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64)) && force_skip_low_temp_var && ref_frame == LAST_FRAME && this_mode == NEWMV) { @@ -1786,17 +1789,29 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } else if (svc->use_base_mv && svc->spatial_layer_id) { if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) { const int pre_stride = xd->plane[0].pre[0].stride; - int base_mv_sad = INT_MAX; - const float base_mv_bias = sf->base_mv_aggressive ? 1.5f : 1.0f; + unsigned int base_mv_sse = UINT_MAX; + int scale = (cpi->rc.avg_frame_low_motion > 60) ? 2 : 4; const uint8_t *const pre_buf = xd->plane[0].pre[0].buf + (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride + (frame_mv[NEWMV][ref_frame].as_mv.col >> 3); - base_mv_sad = cpi->fn_ptr[bsize].sdf( - x->plane[0].src.buf, x->plane[0].src.stride, pre_buf, pre_stride); - - if (base_mv_sad < (int)(base_mv_bias * x->pred_mv_sad[ref_frame])) { + cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + pre_buf, pre_stride, &base_mv_sse); + // Exit NEWMV search if base_mv_sse is large. + if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale)) + continue; + if (base_mv_sse < (best_sse_sofar << 1)) { // Base layer mv is good. + // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since + // (0, 0) mode is already tested. + unsigned int base_mv_sse_normalized = + base_mv_sse >> + (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar && + base_mv_sse_normalized < 400 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + continue; if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost, 1)) { @@ -1942,6 +1957,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, sse_zeromv_normalized = sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); } + if (sse_y < best_sse_sofar) best_sse_sofar = sse_y; } if (!this_early_term) { diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 27fea5d4e..1b5279412 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -209,7 +209,7 @@ int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth)); return VPXMAX(FRAME_OVERHEAD_BITS, - (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS); + (int)(((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS)); } int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { @@ -2070,7 +2070,8 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { return resize_action; } -void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { +static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, + uint64_t avg_sad_current) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int target; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 9df4f80ec..4c864d46a 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -534,10 +534,7 @@ static void set_rt_speed_feature_framesize_independent( if (cpi->svc.temporal_layer_id > 0) { sf->adaptive_rd_thresh = 4; sf->limit_newmv_early_exit = 0; - sf->base_mv_aggressive = - (cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1) - ? 1 - : 0; + sf->base_mv_aggressive = 1; } } @@ -589,7 +586,7 @@ static void set_rt_speed_feature_framesize_independent( if (content == VP9E_CONTENT_SCREEN) sf->mv.subpel_force_stop = 3; - else if (cm->width * cm->height > 352 * 288) { + else if (cm->width * cm->height > 1280 * 720) { sf->mv.subpel_force_stop = 2; if (cpi->rc.avg_frame_low_motion > 87 && cm->current_video_frame > 30) sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE; diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c index be4cd8685..460dab659 100644 --- a/vp9/encoder/x86/temporal_filter_sse4.c +++ b/vp9/encoder/x86/temporal_filter_sse4.c @@ -11,6 +11,7 @@ #include <assert.h> #include <smmintrin.h> +#include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vpx/vpx_integer.h" diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c index 09a1e48fc..969c60aba 100644 --- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -15,6 +15,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/x86/fwd_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" #include "vpx_ports/mem.h" @@ -706,58 +707,6 @@ static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, store_output(&res[7], (output + 7 * stride)); } -// perform in-place transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 44 54 45 55 46 56 47 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 25 35 - // 44 54 64 74 45 55 65 75 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 -} - static void fdct8_sse2(__m128i *in) { // constants const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); @@ -895,7 +844,7 @@ static void fdct8_sse2(__m128i *in) { in[7] = _mm_packs_epi32(v6, v7); // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); } static void fadst8_sse2(__m128i *in) { @@ -1125,7 +1074,7 @@ static void fadst8_sse2(__m128i *in) { in[7] = _mm_sub_epi16(k__const_0, s1); // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); } void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, @@ -1182,23 +1131,6 @@ static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0, write_buffer_8x8(output + 8 * stride, in1 + 8, stride); } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { // perform rounding operations right_shift_8x8(res0, 2); @@ -2002,13 +1934,13 @@ static void fadst16_8col(__m128i *in) { static void fdct16_sse2(__m128i *in0, __m128i *in1) { fdct16_8col(in0); fdct16_8col(in1); - array_transpose_16x16(in0, in1); + transpose_16bit_16x16(in0, in1); } static void fadst16_sse2(__m128i *in0, __m128i *in1) { fadst16_8col(in0); fadst16_8col(in1); - array_transpose_16x16(in0, in1); + transpose_16bit_16x16(in0, in1); } void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index bb6b30bd4..d18457f34 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -432,8 +432,12 @@ static void config_target_level(VP9EncoderConfig *oxcf) { (int)vp9_level_defs[target_level_index].min_altref_distance) { oxcf->min_gf_interval = (int)vp9_level_defs[target_level_index].min_altref_distance + 1; - oxcf->max_gf_interval = - VPXMAX(oxcf->max_gf_interval, oxcf->min_gf_interval); + // If oxcf->max_gf_interval == 0, it will be assigned with a default value + // in vp9_rc_set_gf_interval_range(). + if (oxcf->max_gf_interval != 0) { + oxcf->max_gf_interval = + VPXMAX(oxcf->max_gf_interval, oxcf->min_gf_interval); + } } // Adjust maximum column tiles. |